Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

- XTS mode optimisation for twofish/cast6/camellia/aes on x86

- AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia

- SSSE3/AVX/AVX2 optimisations for sha256/sha512

- Added driver for SAHARA2 crypto accelerator

- Fix for GMAC when used in non-IPsec scenarios

- Added generic CMAC implementation (including IPsec glue; a usage sketch follows this list)

- IP update for crypto/atmel

- Support for more than one device in hwrng/timeriomem

- Added Broadcom BCM2835 RNG driver

- Misc fixes
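
The generic CMAC template registers transforms under names such as "cmac(aes)", so it is reachable through the kernel's synchronous hash (shash) API. The following is only a rough usage sketch, not code from this series: it assumes a kernel providing the SHASH_DESC_ON_STACK helper, and the function name is invented.

#include <crypto/hash.h>
#include <linux/err.h>

/* One-shot AES-CMAC tag; tag must hold the digest size (16 bytes). */
static int cmac_aes_digest(const u8 *key, unsigned int keylen,
			   const u8 *msg, unsigned int msglen, u8 *tag)
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (!err) {
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		err = crypto_shash_digest(desc, msg, msglen, tag);
	}

	crypto_free_shash(tfm);
	return err;
}

The xfrm patch in the list below wires the same algorithm up as RFC 4494 AES-CMAC-96 for IPsec authentication.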

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
crypto: caam - fix job ring cleanup code
crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
crypto: tcrypt - add async cipher speed tests for blowfish
crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
crypto: aesni_intel - add more optimized XTS mode for x86-64
crypto: x86/camellia-aesni-avx - add more optimized XTS code
crypto: cast6-avx: use new optimized XTS code
crypto: x86/twofish-avx - use optimized XTS code
crypto: x86 - add more optimized XTS-mode for serpent-avx
xfrm: add rfc4494 AES-CMAC-96 support
crypto: add CMAC support to CryptoAPI
crypto: testmgr - add empty test vectors for null ciphers
crypto: testmgr - add AES GMAC test vectors
crypto: gcm - fix rfc4543 to handle async crypto correctly
crypto: gcm - make GMAC work when dst and src are different
hwrng: timeriomem - added devicetree hooks
...

+15383 -749
+15
Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt
···
+ Freescale SAHARA Cryptographic Accelerator included in some i.MX chips.
+ Currently only i.MX27 is supported.
+
+ Required properties:
+ - compatible : Should be "fsl,<soc>-sahara"
+ - reg : Should contain SAHARA registers location and length
+ - interrupts : Should contain SAHARA interrupt number
+
+ Example:
+
+ sah@10025000 {
+ 	compatible = "fsl,imx27-sahara";
+ 	reg = <0x10025000 0x800>;
+ 	interrupts = <75>;
+ };
+18
Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt
···
+ HWRNG support for the timeriomem_rng driver
+
+ Required properties:
+ - compatible : "timeriomem_rng"
+ - reg : base address to sample from
+ - period : wait time in microseconds to use between samples
+
+ N.B. currently 'reg' must be four bytes wide and aligned
+
+ Example:
+
+ hwrng@44 {
+ 	#address-cells = <1>;
+ 	#size-cells = <1>;
+ 	compatible = "timeriomem_rng";
+ 	reg = <0x44 0x04>;
+ 	period = <1000000>;
+ };
+13
Documentation/devicetree/bindings/rng/brcm,bcm2835.txt
···
+ BCM2835 Random number generator
+
+ Required properties:
+
+ - compatible : should be "brcm,bcm2835-rng"
+ - reg : Specifies base physical address and size of the registers.
+
+ Example:
+
+ rng {
+ 	compatible = "brcm,bcm2835-rng";
+ 	reg = <0x7e104000 0x10>;
+ };
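
The two RNG bindings above describe devices served through the shared hwrng core, where a driver mostly amounts to one read callback plus registration. A minimal sketch under that assumption follows; the names are invented, and this is not the actual bcm2835-rng or timeriomem_rng source.

#include <linux/hw_random.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/string.h>

static void __iomem *example_base;	/* ioremap()ed in probe() */

/* Pull one 32-bit sample from the device and copy up to max bytes out. */
static int example_rng_read(struct hwrng *rng, void *buf, size_t max,
			    bool wait)
{
	u32 sample = readl(example_base);	/* data register assumed at offset 0 */
	size_t n = min_t(size_t, max, sizeof(sample));

	memcpy(buf, &sample, n);
	return n;
}

static struct hwrng example_rng = {
	.name	= "example-rng",
	.read	= example_rng_read,
};

/* The platform driver's probe() would map the 'reg' resource and call
 * hwrng_register(&example_rng); remove() calls hwrng_unregister(). */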
+1 -1
Documentation/hw_random.txt
···
  * FIXME: support poll(2)

- NOTE: request_mem_region was removed, for two reasons:
+ NOTE: request_mem_region was removed, for three reasons:
  1) Only one RNG is supported by this driver, 2) The location
  used by the RNG is a fixed location in MMIO-addressable memory,
  3) users with properly working BIOS e820 handling will always
+6 -8
arch/arm/mach-at91/at91sam9g45_devices.c
···
  #include <linux/platform_device.h>
  #include <linux/i2c-gpio.h>
  #include <linux/atmel-mci.h>
- #include <linux/platform_data/atmel-aes.h>
+ #include <linux/platform_data/crypto-atmel.h>

  #include <linux/platform_data/at91_adc.h>
···
   * -------------------------------------------------------------------- */

  #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
- static struct aes_platform_data aes_data;
+ static struct crypto_platform_data aes_data;
+ static struct crypto_dma_data alt_atslave;
  static u64 aes_dmamask = DMA_BIT_MASK(32);

  static struct resource aes_resources[] = {
···
  static void __init at91_add_device_aes(void)
  {
  	struct at_dma_slave *atslave;
- 	struct aes_dma_data *alt_atslave;
-
- 	alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);

  	/* DMA TX slave channel configuration */
- 	atslave = &alt_atslave->txdata;
+ 	atslave = &alt_atslave.txdata;
  	atslave->dma_dev = &at_hdmac_device.dev;
  	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW |
  		       ATC_SRC_PER(AT_DMA_ID_AES_RX);

  	/* DMA RX slave channel configuration */
- 	atslave = &alt_atslave->rxdata;
+ 	atslave = &alt_atslave.rxdata;
  	atslave->dma_dev = &at_hdmac_device.dev;
  	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW |
  		       ATC_DST_PER(AT_DMA_ID_AES_TX);

- 	aes_data.dma_slave = alt_atslave;
+ 	aes_data.dma_slave = &alt_atslave;
  	platform_device_register(&at91sam9g45_aes_device);
  }
  #else
+45 -12
arch/x86/crypto/Makefile
···
  # Arch-specific CryptoAPI modules.
  #

+ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+ avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ 			$(comma)4)$(comma)%ymm2,yes,no)
+
  obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
  obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
···
  obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
  obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
- obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
- obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
  obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
- obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
  obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
  obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
- obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
  obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
  obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o

  obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
  obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
  obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+ obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+
+ # These modules require assembler to support AVX.
+ ifeq ($(avx_supported),yes)
+ 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+ 					camellia-aesni-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+ 	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+ endif
+
+ # These modules require assembler to support AVX2.
+ ifeq ($(avx2_supported),yes)
+ 	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+ 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+ 	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+ 	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
+ endif

  aes-i586-y := aes-i586-asm_32.o aes_glue.o
  twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
···
  aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
  camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
- camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
- 				camellia_aesni_avx_glue.o
- cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
- cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
  blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
  twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
  twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
- twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
  salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
  serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
- serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+
+ ifeq ($(avx_supported),yes)
+ 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+ 					camellia_aesni_avx_glue.o
+ 	cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+ 	cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+ 	twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+ 				twofish_avx_glue.o
+ 	serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+ 				serpent_avx_glue.o
+ endif
+
+ ifeq ($(avx2_supported),yes)
+ 	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+ 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+ 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+ 	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
+ endif

  aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
  ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
  sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
  crc32c-intel-y := crc32c-intel_glue.o
- crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
  crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+ sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+117
arch/x86/crypto/aesni-intel_asm.S
··· 34 34 35 35 #ifdef __x86_64__ 36 36 .data 37 + .align 16 38 + .Lgf128mul_x_ble_mask: 39 + .octa 0x00000000000000010000000000000087 40 + 37 41 POLY: .octa 0xC2000000000000000000000000000001 38 42 TWOONE: .octa 0x00000001000000000000000000000001 39 43 ··· 108 104 #define BSWAP_MASK %xmm10 109 105 #define CTR %xmm11 110 106 #define INC %xmm12 107 + 108 + #define GF128MUL_MASK %xmm10 111 109 112 110 #ifdef __x86_64__ 113 111 #define AREG %rax ··· 2642 2636 .Lctr_enc_just_ret: 2643 2637 ret 2644 2638 ENDPROC(aesni_ctr_enc) 2639 + 2640 + /* 2641 + * _aesni_gf128mul_x_ble: internal ABI 2642 + * Multiply in GF(2^128) for XTS IVs 2643 + * input: 2644 + * IV: current IV 2645 + * GF128MUL_MASK == mask with 0x87 and 0x01 2646 + * output: 2647 + * IV: next IV 2648 + * changed: 2649 + * CTR: == temporary value 2650 + */ 2651 + #define _aesni_gf128mul_x_ble() \ 2652 + pshufd $0x13, IV, CTR; \ 2653 + paddq IV, IV; \ 2654 + psrad $31, CTR; \ 2655 + pand GF128MUL_MASK, CTR; \ 2656 + pxor CTR, IV; 2657 + 2658 + /* 2659 + * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2660 + * bool enc, u8 *iv) 2661 + */ 2662 + ENTRY(aesni_xts_crypt8) 2663 + cmpb $0, %cl 2664 + movl $0, %ecx 2665 + movl $240, %r10d 2666 + leaq _aesni_enc4, %r11 2667 + leaq _aesni_dec4, %rax 2668 + cmovel %r10d, %ecx 2669 + cmoveq %rax, %r11 2670 + 2671 + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK 2672 + movups (IVP), IV 2673 + 2674 + mov 480(KEYP), KLEN 2675 + addq %rcx, KEYP 2676 + 2677 + movdqa IV, STATE1 2678 + pxor 0x00(INP), STATE1 2679 + movdqu IV, 0x00(OUTP) 2680 + 2681 + _aesni_gf128mul_x_ble() 2682 + movdqa IV, STATE2 2683 + pxor 0x10(INP), STATE2 2684 + movdqu IV, 0x10(OUTP) 2685 + 2686 + _aesni_gf128mul_x_ble() 2687 + movdqa IV, STATE3 2688 + pxor 0x20(INP), STATE3 2689 + movdqu IV, 0x20(OUTP) 2690 + 2691 + _aesni_gf128mul_x_ble() 2692 + movdqa IV, STATE4 2693 + pxor 0x30(INP), STATE4 2694 + movdqu IV, 0x30(OUTP) 2695 + 2696 + call *%r11 2697 + 2698 + pxor 0x00(OUTP), STATE1 2699 + movdqu STATE1, 0x00(OUTP) 2700 + 2701 + _aesni_gf128mul_x_ble() 2702 + movdqa IV, STATE1 2703 + pxor 0x40(INP), STATE1 2704 + movdqu IV, 0x40(OUTP) 2705 + 2706 + pxor 0x10(OUTP), STATE2 2707 + movdqu STATE2, 0x10(OUTP) 2708 + 2709 + _aesni_gf128mul_x_ble() 2710 + movdqa IV, STATE2 2711 + pxor 0x50(INP), STATE2 2712 + movdqu IV, 0x50(OUTP) 2713 + 2714 + pxor 0x20(OUTP), STATE3 2715 + movdqu STATE3, 0x20(OUTP) 2716 + 2717 + _aesni_gf128mul_x_ble() 2718 + movdqa IV, STATE3 2719 + pxor 0x60(INP), STATE3 2720 + movdqu IV, 0x60(OUTP) 2721 + 2722 + pxor 0x30(OUTP), STATE4 2723 + movdqu STATE4, 0x30(OUTP) 2724 + 2725 + _aesni_gf128mul_x_ble() 2726 + movdqa IV, STATE4 2727 + pxor 0x70(INP), STATE4 2728 + movdqu IV, 0x70(OUTP) 2729 + 2730 + _aesni_gf128mul_x_ble() 2731 + movups IV, (IVP) 2732 + 2733 + call *%r11 2734 + 2735 + pxor 0x40(OUTP), STATE1 2736 + movdqu STATE1, 0x40(OUTP) 2737 + 2738 + pxor 0x50(OUTP), STATE2 2739 + movdqu STATE2, 0x50(OUTP) 2740 + 2741 + pxor 0x60(OUTP), STATE3 2742 + movdqu STATE3, 0x60(OUTP) 2743 + 2744 + pxor 0x70(OUTP), STATE4 2745 + movdqu STATE4, 0x70(OUTP) 2746 + 2747 + ret 2748 + ENDPROC(aesni_xts_crypt8) 2749 + 2645 2750 #endif
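
The new _aesni_gf128mul_x_ble() macro above advances the XTS tweak by multiplying it by x in GF(2^128); the pshufd/psrad/pand sequence builds the conditional 0x87 reduction mask without a branch. The same operation in plain C, written here from the standard XTS definition rather than lifted from this patch:

#include <stdint.h>

/*
 * XTS tweak update, little-endian block convention: t' = t * x in
 * GF(2^128). A carry out of bit 127 is reduced with the polynomial
 * x^128 + x^7 + x^2 + x + 1, i.e. an XOR of 0x87 into the low byte.
 */
static void gf128mul_x_ble(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit 127 of the tweak */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}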
+80
arch/x86/crypto/aesni-intel_glue.c
··· 39 39 #include <crypto/internal/aead.h> 40 40 #include <linux/workqueue.h> 41 41 #include <linux/spinlock.h> 42 + #ifdef CONFIG_X86_64 43 + #include <asm/crypto/glue_helper.h> 44 + #endif 42 45 43 46 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) 44 47 #define HAS_PCBC ··· 104 101 #ifdef CONFIG_X86_64 105 102 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 106 103 const u8 *in, unsigned int len, u8 *iv); 104 + 105 + asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, 106 + const u8 *in, bool enc, u8 *iv); 107 107 108 108 /* asmlinkage void aesni_gcm_enc() 109 109 * void *ctx, AES Key schedule. Starts on a 16 byte boundary. ··· 516 510 aesni_enc(ctx, out, in); 517 511 } 518 512 513 + #ifdef CONFIG_X86_64 514 + 515 + static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) 516 + { 517 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); 518 + } 519 + 520 + static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) 521 + { 522 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); 523 + } 524 + 525 + static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) 526 + { 527 + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); 528 + } 529 + 530 + static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) 531 + { 532 + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); 533 + } 534 + 535 + static const struct common_glue_ctx aesni_enc_xts = { 536 + .num_funcs = 2, 537 + .fpu_blocks_limit = 1, 538 + 539 + .funcs = { { 540 + .num_blocks = 8, 541 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } 542 + }, { 543 + .num_blocks = 1, 544 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } 545 + } } 546 + }; 547 + 548 + static const struct common_glue_ctx aesni_dec_xts = { 549 + .num_funcs = 2, 550 + .fpu_blocks_limit = 1, 551 + 552 + .funcs = { { 553 + .num_blocks = 8, 554 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } 555 + }, { 556 + .num_blocks = 1, 557 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } 558 + } } 559 + }; 560 + 561 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 562 + struct scatterlist *src, unsigned int nbytes) 563 + { 564 + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 565 + 566 + return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, 567 + XTS_TWEAK_CAST(aesni_xts_tweak), 568 + aes_ctx(ctx->raw_tweak_ctx), 569 + aes_ctx(ctx->raw_crypt_ctx)); 570 + } 571 + 572 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 573 + struct scatterlist *src, unsigned int nbytes) 574 + { 575 + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 576 + 577 + return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, 578 + XTS_TWEAK_CAST(aesni_xts_tweak), 579 + aes_ctx(ctx->raw_tweak_ctx), 580 + aes_ctx(ctx->raw_crypt_ctx)); 581 + } 582 + 583 + #else 584 + 519 585 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 520 586 struct scatterlist *src, unsigned int nbytes) 521 587 { ··· 637 559 638 560 return ret; 639 561 } 562 + 563 + #endif 640 564 641 565 #ifdef CONFIG_X86_64 642 566 static int rfc4106_init(struct crypto_tfm *tfm)
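
The common_glue_ctx tables above list routines widest-first along with the number of blocks each call consumes; glue_xts_crypt_128bit() in the shared glue_helper code walks a chunk with the widest routine that still fits and falls back to the single-block routine for the tail. The sketch below only illustrates that dispatch idea with invented names; it is not the glue_helper implementation itself.

#include <stddef.h>
#include <stdint.h>

typedef void (*xts_fn)(void *ctx, uint8_t *dst, const uint8_t *src,
		       uint64_t iv[2]);

struct xts_func {
	unsigned int num_blocks;	/* blocks consumed per call */
	xts_fn fn;			/* also advances iv accordingly */
};

/* funcs[] is sorted widest-first and ends with a num_blocks == 1 entry. */
static void xts_dispatch(void *ctx, uint8_t *dst, const uint8_t *src,
			 size_t nblocks, uint64_t iv[2],
			 const struct xts_func *funcs)
{
	size_t i = 0;

	while (nblocks) {
		if (nblocks < funcs[i].num_blocks) {
			i++;		/* fall back to a narrower routine */
			continue;
		}
		funcs[i].fn(ctx, dst, src, iv);
		dst += funcs[i].num_blocks * 16;
		src += funcs[i].num_blocks * 16;
		nblocks -= funcs[i].num_blocks;
	}
}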
+449
arch/x86/crypto/blowfish-avx2-asm_64.S
··· 1 + /* 2 + * x86_64/AVX2 assembler optimized version of Blowfish 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + 15 + .file "blowfish-avx2-asm_64.S" 16 + 17 + .data 18 + .align 32 19 + 20 + .Lprefetch_mask: 21 + .long 0*64 22 + .long 1*64 23 + .long 2*64 24 + .long 3*64 25 + .long 4*64 26 + .long 5*64 27 + .long 6*64 28 + .long 7*64 29 + 30 + .Lbswap32_mask: 31 + .long 0x00010203 32 + .long 0x04050607 33 + .long 0x08090a0b 34 + .long 0x0c0d0e0f 35 + 36 + .Lbswap128_mask: 37 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 38 + .Lbswap_iv_mask: 39 + .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 40 + 41 + .text 42 + /* structure of crypto context */ 43 + #define p 0 44 + #define s0 ((16 + 2) * 4) 45 + #define s1 ((16 + 2 + (1 * 256)) * 4) 46 + #define s2 ((16 + 2 + (2 * 256)) * 4) 47 + #define s3 ((16 + 2 + (3 * 256)) * 4) 48 + 49 + /* register macros */ 50 + #define CTX %rdi 51 + #define RIO %rdx 52 + 53 + #define RS0 %rax 54 + #define RS1 %r8 55 + #define RS2 %r9 56 + #define RS3 %r10 57 + 58 + #define RLOOP %r11 59 + #define RLOOPd %r11d 60 + 61 + #define RXr0 %ymm8 62 + #define RXr1 %ymm9 63 + #define RXr2 %ymm10 64 + #define RXr3 %ymm11 65 + #define RXl0 %ymm12 66 + #define RXl1 %ymm13 67 + #define RXl2 %ymm14 68 + #define RXl3 %ymm15 69 + 70 + /* temp regs */ 71 + #define RT0 %ymm0 72 + #define RT0x %xmm0 73 + #define RT1 %ymm1 74 + #define RT1x %xmm1 75 + #define RIDX0 %ymm2 76 + #define RIDX1 %ymm3 77 + #define RIDX1x %xmm3 78 + #define RIDX2 %ymm4 79 + #define RIDX3 %ymm5 80 + 81 + /* vpgatherdd mask and '-1' */ 82 + #define RNOT %ymm6 83 + 84 + /* byte mask, (-1 >> 24) */ 85 + #define RBYTE %ymm7 86 + 87 + /*********************************************************************** 88 + * 32-way AVX2 blowfish 89 + ***********************************************************************/ 90 + #define F(xl, xr) \ 91 + vpsrld $24, xl, RIDX0; \ 92 + vpsrld $16, xl, RIDX1; \ 93 + vpsrld $8, xl, RIDX2; \ 94 + vpand RBYTE, RIDX1, RIDX1; \ 95 + vpand RBYTE, RIDX2, RIDX2; \ 96 + vpand RBYTE, xl, RIDX3; \ 97 + \ 98 + vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \ 99 + vpcmpeqd RNOT, RNOT, RNOT; \ 100 + vpcmpeqd RIDX0, RIDX0, RIDX0; \ 101 + \ 102 + vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \ 103 + vpcmpeqd RIDX1, RIDX1, RIDX1; \ 104 + vpaddd RT0, RT1, RT0; \ 105 + \ 106 + vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \ 107 + vpxor RT0, RT1, RT0; \ 108 + \ 109 + vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \ 110 + vpcmpeqd RNOT, RNOT, RNOT; \ 111 + vpaddd RT0, RT1, RT0; \ 112 + \ 113 + vpxor RT0, xr, xr; 114 + 115 + #define add_roundkey(xl, nmem) \ 116 + vpbroadcastd nmem, RT0; \ 117 + vpxor RT0, xl ## 0, xl ## 0; \ 118 + vpxor RT0, xl ## 1, xl ## 1; \ 119 + vpxor RT0, xl ## 2, xl ## 2; \ 120 + vpxor RT0, xl ## 3, xl ## 3; 121 + 122 + #define round_enc() \ 123 + add_roundkey(RXr, p(CTX,RLOOP,4)); \ 124 + F(RXl0, RXr0); \ 125 + F(RXl1, RXr1); \ 126 + F(RXl2, RXr2); \ 127 + F(RXl3, RXr3); \ 128 + \ 129 + add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ 130 + F(RXr0, RXl0); \ 131 + F(RXr1, RXl1); \ 132 + F(RXr2, RXl2); \ 133 + F(RXr3, RXl3); 134 + 135 + #define round_dec() \ 136 + add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \ 137 + F(RXl0, RXr0); \ 138 + F(RXl1, 
RXr1); \ 139 + F(RXl2, RXr2); \ 140 + F(RXl3, RXr3); \ 141 + \ 142 + add_roundkey(RXl, p+4(CTX,RLOOP,4)); \ 143 + F(RXr0, RXl0); \ 144 + F(RXr1, RXl1); \ 145 + F(RXr2, RXl2); \ 146 + F(RXr3, RXl3); 147 + 148 + #define init_round_constants() \ 149 + vpcmpeqd RNOT, RNOT, RNOT; \ 150 + leaq s0(CTX), RS0; \ 151 + leaq s1(CTX), RS1; \ 152 + leaq s2(CTX), RS2; \ 153 + leaq s3(CTX), RS3; \ 154 + vpsrld $24, RNOT, RBYTE; 155 + 156 + #define transpose_2x2(x0, x1, t0) \ 157 + vpunpckldq x0, x1, t0; \ 158 + vpunpckhdq x0, x1, x1; \ 159 + \ 160 + vpunpcklqdq t0, x1, x0; \ 161 + vpunpckhqdq t0, x1, x1; 162 + 163 + #define read_block(xl, xr) \ 164 + vbroadcasti128 .Lbswap32_mask, RT1; \ 165 + \ 166 + vpshufb RT1, xl ## 0, xl ## 0; \ 167 + vpshufb RT1, xr ## 0, xr ## 0; \ 168 + vpshufb RT1, xl ## 1, xl ## 1; \ 169 + vpshufb RT1, xr ## 1, xr ## 1; \ 170 + vpshufb RT1, xl ## 2, xl ## 2; \ 171 + vpshufb RT1, xr ## 2, xr ## 2; \ 172 + vpshufb RT1, xl ## 3, xl ## 3; \ 173 + vpshufb RT1, xr ## 3, xr ## 3; \ 174 + \ 175 + transpose_2x2(xl ## 0, xr ## 0, RT0); \ 176 + transpose_2x2(xl ## 1, xr ## 1, RT0); \ 177 + transpose_2x2(xl ## 2, xr ## 2, RT0); \ 178 + transpose_2x2(xl ## 3, xr ## 3, RT0); 179 + 180 + #define write_block(xl, xr) \ 181 + vbroadcasti128 .Lbswap32_mask, RT1; \ 182 + \ 183 + transpose_2x2(xl ## 0, xr ## 0, RT0); \ 184 + transpose_2x2(xl ## 1, xr ## 1, RT0); \ 185 + transpose_2x2(xl ## 2, xr ## 2, RT0); \ 186 + transpose_2x2(xl ## 3, xr ## 3, RT0); \ 187 + \ 188 + vpshufb RT1, xl ## 0, xl ## 0; \ 189 + vpshufb RT1, xr ## 0, xr ## 0; \ 190 + vpshufb RT1, xl ## 1, xl ## 1; \ 191 + vpshufb RT1, xr ## 1, xr ## 1; \ 192 + vpshufb RT1, xl ## 2, xl ## 2; \ 193 + vpshufb RT1, xr ## 2, xr ## 2; \ 194 + vpshufb RT1, xl ## 3, xl ## 3; \ 195 + vpshufb RT1, xr ## 3, xr ## 3; 196 + 197 + .align 8 198 + __blowfish_enc_blk32: 199 + /* input: 200 + * %rdi: ctx, CTX 201 + * RXl0..4, RXr0..4: plaintext 202 + * output: 203 + * RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped) 204 + */ 205 + init_round_constants(); 206 + 207 + read_block(RXl, RXr); 208 + 209 + movl $1, RLOOPd; 210 + add_roundkey(RXl, p+4*(0)(CTX)); 211 + 212 + .align 4 213 + .L__enc_loop: 214 + round_enc(); 215 + 216 + leal 2(RLOOPd), RLOOPd; 217 + cmpl $17, RLOOPd; 218 + jne .L__enc_loop; 219 + 220 + add_roundkey(RXr, p+4*(17)(CTX)); 221 + 222 + write_block(RXl, RXr); 223 + 224 + ret; 225 + ENDPROC(__blowfish_enc_blk32) 226 + 227 + .align 8 228 + __blowfish_dec_blk32: 229 + /* input: 230 + * %rdi: ctx, CTX 231 + * RXl0..4, RXr0..4: ciphertext 232 + * output: 233 + * RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped) 234 + */ 235 + init_round_constants(); 236 + 237 + read_block(RXl, RXr); 238 + 239 + movl $14, RLOOPd; 240 + add_roundkey(RXl, p+4*(17)(CTX)); 241 + 242 + .align 4 243 + .L__dec_loop: 244 + round_dec(); 245 + 246 + addl $-2, RLOOPd; 247 + jns .L__dec_loop; 248 + 249 + add_roundkey(RXr, p+4*(0)(CTX)); 250 + 251 + write_block(RXl, RXr); 252 + 253 + ret; 254 + ENDPROC(__blowfish_dec_blk32) 255 + 256 + ENTRY(blowfish_ecb_enc_32way) 257 + /* input: 258 + * %rdi: ctx, CTX 259 + * %rsi: dst 260 + * %rdx: src 261 + */ 262 + 263 + vzeroupper; 264 + 265 + vmovdqu 0*32(%rdx), RXl0; 266 + vmovdqu 1*32(%rdx), RXr0; 267 + vmovdqu 2*32(%rdx), RXl1; 268 + vmovdqu 3*32(%rdx), RXr1; 269 + vmovdqu 4*32(%rdx), RXl2; 270 + vmovdqu 5*32(%rdx), RXr2; 271 + vmovdqu 6*32(%rdx), RXl3; 272 + vmovdqu 7*32(%rdx), RXr3; 273 + 274 + call __blowfish_enc_blk32; 275 + 276 + vmovdqu RXr0, 0*32(%rsi); 277 + vmovdqu RXl0, 1*32(%rsi); 278 + vmovdqu RXr1, 2*32(%rsi); 279 + 
vmovdqu RXl1, 3*32(%rsi); 280 + vmovdqu RXr2, 4*32(%rsi); 281 + vmovdqu RXl2, 5*32(%rsi); 282 + vmovdqu RXr3, 6*32(%rsi); 283 + vmovdqu RXl3, 7*32(%rsi); 284 + 285 + vzeroupper; 286 + 287 + ret; 288 + ENDPROC(blowfish_ecb_enc_32way) 289 + 290 + ENTRY(blowfish_ecb_dec_32way) 291 + /* input: 292 + * %rdi: ctx, CTX 293 + * %rsi: dst 294 + * %rdx: src 295 + */ 296 + 297 + vzeroupper; 298 + 299 + vmovdqu 0*32(%rdx), RXl0; 300 + vmovdqu 1*32(%rdx), RXr0; 301 + vmovdqu 2*32(%rdx), RXl1; 302 + vmovdqu 3*32(%rdx), RXr1; 303 + vmovdqu 4*32(%rdx), RXl2; 304 + vmovdqu 5*32(%rdx), RXr2; 305 + vmovdqu 6*32(%rdx), RXl3; 306 + vmovdqu 7*32(%rdx), RXr3; 307 + 308 + call __blowfish_dec_blk32; 309 + 310 + vmovdqu RXr0, 0*32(%rsi); 311 + vmovdqu RXl0, 1*32(%rsi); 312 + vmovdqu RXr1, 2*32(%rsi); 313 + vmovdqu RXl1, 3*32(%rsi); 314 + vmovdqu RXr2, 4*32(%rsi); 315 + vmovdqu RXl2, 5*32(%rsi); 316 + vmovdqu RXr3, 6*32(%rsi); 317 + vmovdqu RXl3, 7*32(%rsi); 318 + 319 + vzeroupper; 320 + 321 + ret; 322 + ENDPROC(blowfish_ecb_dec_32way) 323 + 324 + ENTRY(blowfish_cbc_dec_32way) 325 + /* input: 326 + * %rdi: ctx, CTX 327 + * %rsi: dst 328 + * %rdx: src 329 + */ 330 + 331 + vzeroupper; 332 + 333 + vmovdqu 0*32(%rdx), RXl0; 334 + vmovdqu 1*32(%rdx), RXr0; 335 + vmovdqu 2*32(%rdx), RXl1; 336 + vmovdqu 3*32(%rdx), RXr1; 337 + vmovdqu 4*32(%rdx), RXl2; 338 + vmovdqu 5*32(%rdx), RXr2; 339 + vmovdqu 6*32(%rdx), RXl3; 340 + vmovdqu 7*32(%rdx), RXr3; 341 + 342 + call __blowfish_dec_blk32; 343 + 344 + /* xor with src */ 345 + vmovq (%rdx), RT0x; 346 + vpshufd $0x4f, RT0x, RT0x; 347 + vinserti128 $1, 8(%rdx), RT0, RT0; 348 + vpxor RT0, RXr0, RXr0; 349 + vpxor 0*32+24(%rdx), RXl0, RXl0; 350 + vpxor 1*32+24(%rdx), RXr1, RXr1; 351 + vpxor 2*32+24(%rdx), RXl1, RXl1; 352 + vpxor 3*32+24(%rdx), RXr2, RXr2; 353 + vpxor 4*32+24(%rdx), RXl2, RXl2; 354 + vpxor 5*32+24(%rdx), RXr3, RXr3; 355 + vpxor 6*32+24(%rdx), RXl3, RXl3; 356 + 357 + vmovdqu RXr0, (0*32)(%rsi); 358 + vmovdqu RXl0, (1*32)(%rsi); 359 + vmovdqu RXr1, (2*32)(%rsi); 360 + vmovdqu RXl1, (3*32)(%rsi); 361 + vmovdqu RXr2, (4*32)(%rsi); 362 + vmovdqu RXl2, (5*32)(%rsi); 363 + vmovdqu RXr3, (6*32)(%rsi); 364 + vmovdqu RXl3, (7*32)(%rsi); 365 + 366 + vzeroupper; 367 + 368 + ret; 369 + ENDPROC(blowfish_cbc_dec_32way) 370 + 371 + ENTRY(blowfish_ctr_32way) 372 + /* input: 373 + * %rdi: ctx, CTX 374 + * %rsi: dst 375 + * %rdx: src 376 + * %rcx: iv (big endian, 64bit) 377 + */ 378 + 379 + vzeroupper; 380 + 381 + vpcmpeqd RT0, RT0, RT0; 382 + vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */ 383 + 384 + vpcmpeqd RT1x, RT1x, RT1x; 385 + vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */ 386 + vpxor RIDX0, RIDX0, RIDX0; 387 + vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */ 388 + 389 + vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */ 390 + 391 + vpcmpeqd RT1, RT1, RT1; 392 + vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */ 393 + vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */ 394 + 395 + vbroadcasti128 .Lbswap_iv_mask, RIDX0; 396 + vbroadcasti128 .Lbswap128_mask, RIDX1; 397 + 398 + /* load IV and byteswap */ 399 + vmovq (%rcx), RT1x; 400 + vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */ 401 + vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */ 402 + 403 + /* construct IVs */ 404 + vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */ 405 + vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */ 406 + vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */ 407 + vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */ 408 + vpsubq 
RIDX2, RT1, RT1; 409 + vpshufb RIDX1, RT1, RXl1; 410 + vpsubq RIDX2, RT1, RT1; 411 + vpshufb RIDX1, RT1, RXr1; 412 + vpsubq RIDX2, RT1, RT1; 413 + vpshufb RIDX1, RT1, RXl2; 414 + vpsubq RIDX2, RT1, RT1; 415 + vpshufb RIDX1, RT1, RXr2; 416 + vpsubq RIDX2, RT1, RT1; 417 + vpshufb RIDX1, RT1, RXl3; 418 + vpsubq RIDX2, RT1, RT1; 419 + vpshufb RIDX1, RT1, RXr3; 420 + 421 + /* store last IV */ 422 + vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */ 423 + vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */ 424 + vmovq RT1x, (%rcx); 425 + 426 + call __blowfish_enc_blk32; 427 + 428 + /* dst = src ^ iv */ 429 + vpxor 0*32(%rdx), RXr0, RXr0; 430 + vpxor 1*32(%rdx), RXl0, RXl0; 431 + vpxor 2*32(%rdx), RXr1, RXr1; 432 + vpxor 3*32(%rdx), RXl1, RXl1; 433 + vpxor 4*32(%rdx), RXr2, RXr2; 434 + vpxor 5*32(%rdx), RXl2, RXl2; 435 + vpxor 6*32(%rdx), RXr3, RXr3; 436 + vpxor 7*32(%rdx), RXl3, RXl3; 437 + vmovdqu RXr0, (0*32)(%rsi); 438 + vmovdqu RXl0, (1*32)(%rsi); 439 + vmovdqu RXr1, (2*32)(%rsi); 440 + vmovdqu RXl1, (3*32)(%rsi); 441 + vmovdqu RXr2, (4*32)(%rsi); 442 + vmovdqu RXl2, (5*32)(%rsi); 443 + vmovdqu RXr3, (6*32)(%rsi); 444 + vmovdqu RXl3, (7*32)(%rsi); 445 + 446 + vzeroupper; 447 + 448 + ret; 449 + ENDPROC(blowfish_ctr_32way)
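
Each F() expansion in the macro above evaluates, for 32 blocks in parallel via vpgatherdd table loads, the textbook Blowfish round function: four byte-indexed S-box lookups folded together with add/xor/add. The scalar equivalent, with an S-box layout assumed to match the offsets at the top of the file:

#include <stdint.h>

struct bf_sboxes {
	uint32_t s0[256], s1[256], s2[256], s3[256];
};

/* F(x) = ((S0[x>>24] + S1[(x>>16)&0xff]) ^ S2[(x>>8)&0xff]) + S3[x&0xff] */
static uint32_t blowfish_f(const struct bf_sboxes *s, uint32_t x)
{
	return ((s->s0[x >> 24] + s->s1[(x >> 16) & 0xff]) ^
		s->s2[(x >> 8) & 0xff]) + s->s3[x & 0xff];
}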
+585
arch/x86/crypto/blowfish_avx2_glue.c
··· 1 + /* 2 + * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: 7 + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> 8 + * CTR part based on code (crypto/ctr.c) by: 9 + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> 10 + * 11 + * This program is free software; you can redistribute it and/or modify 12 + * it under the terms of the GNU General Public License as published by 13 + * the Free Software Foundation; either version 2 of the License, or 14 + * (at your option) any later version. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + */ 22 + 23 + #include <linux/module.h> 24 + #include <linux/types.h> 25 + #include <linux/crypto.h> 26 + #include <linux/err.h> 27 + #include <crypto/algapi.h> 28 + #include <crypto/blowfish.h> 29 + #include <crypto/cryptd.h> 30 + #include <crypto/ctr.h> 31 + #include <asm/i387.h> 32 + #include <asm/xcr.h> 33 + #include <asm/xsave.h> 34 + #include <asm/crypto/blowfish.h> 35 + #include <asm/crypto/ablk_helper.h> 36 + #include <crypto/scatterwalk.h> 37 + 38 + #define BF_AVX2_PARALLEL_BLOCKS 32 39 + 40 + /* 32-way AVX2 parallel cipher functions */ 41 + asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst, 42 + const u8 *src); 43 + asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst, 44 + const u8 *src); 45 + asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst, 46 + const u8 *src); 47 + asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src, 48 + __be64 *iv); 49 + 50 + static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes) 51 + { 52 + if (fpu_enabled) 53 + return true; 54 + 55 + /* FPU is only used when chunk to be processed is large enough, so 56 + * do not enable FPU until it is necessary. 
57 + */ 58 + if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS) 59 + return false; 60 + 61 + kernel_fpu_begin(); 62 + return true; 63 + } 64 + 65 + static inline void bf_fpu_end(bool fpu_enabled) 66 + { 67 + if (fpu_enabled) 68 + kernel_fpu_end(); 69 + } 70 + 71 + static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, 72 + bool enc) 73 + { 74 + bool fpu_enabled = false; 75 + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 76 + const unsigned int bsize = BF_BLOCK_SIZE; 77 + unsigned int nbytes; 78 + int err; 79 + 80 + err = blkcipher_walk_virt(desc, walk); 81 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 82 + 83 + while ((nbytes = walk->nbytes)) { 84 + u8 *wsrc = walk->src.virt.addr; 85 + u8 *wdst = walk->dst.virt.addr; 86 + 87 + fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); 88 + 89 + /* Process multi-block AVX2 batch */ 90 + if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { 91 + do { 92 + if (enc) 93 + blowfish_ecb_enc_32way(ctx, wdst, wsrc); 94 + else 95 + blowfish_ecb_dec_32way(ctx, wdst, wsrc); 96 + 97 + wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS; 98 + wdst += bsize * BF_AVX2_PARALLEL_BLOCKS; 99 + nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; 100 + } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); 101 + 102 + if (nbytes < bsize) 103 + goto done; 104 + } 105 + 106 + /* Process multi-block batch */ 107 + if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { 108 + do { 109 + if (enc) 110 + blowfish_enc_blk_4way(ctx, wdst, wsrc); 111 + else 112 + blowfish_dec_blk_4way(ctx, wdst, wsrc); 113 + 114 + wsrc += bsize * BF_PARALLEL_BLOCKS; 115 + wdst += bsize * BF_PARALLEL_BLOCKS; 116 + nbytes -= bsize * BF_PARALLEL_BLOCKS; 117 + } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); 118 + 119 + if (nbytes < bsize) 120 + goto done; 121 + } 122 + 123 + /* Handle leftovers */ 124 + do { 125 + if (enc) 126 + blowfish_enc_blk(ctx, wdst, wsrc); 127 + else 128 + blowfish_dec_blk(ctx, wdst, wsrc); 129 + 130 + wsrc += bsize; 131 + wdst += bsize; 132 + nbytes -= bsize; 133 + } while (nbytes >= bsize); 134 + 135 + done: 136 + err = blkcipher_walk_done(desc, walk, nbytes); 137 + } 138 + 139 + bf_fpu_end(fpu_enabled); 140 + return err; 141 + } 142 + 143 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 144 + struct scatterlist *src, unsigned int nbytes) 145 + { 146 + struct blkcipher_walk walk; 147 + 148 + blkcipher_walk_init(&walk, dst, src, nbytes); 149 + return ecb_crypt(desc, &walk, true); 150 + } 151 + 152 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 153 + struct scatterlist *src, unsigned int nbytes) 154 + { 155 + struct blkcipher_walk walk; 156 + 157 + blkcipher_walk_init(&walk, dst, src, nbytes); 158 + return ecb_crypt(desc, &walk, false); 159 + } 160 + 161 + static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, 162 + struct blkcipher_walk *walk) 163 + { 164 + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 165 + unsigned int bsize = BF_BLOCK_SIZE; 166 + unsigned int nbytes = walk->nbytes; 167 + u64 *src = (u64 *)walk->src.virt.addr; 168 + u64 *dst = (u64 *)walk->dst.virt.addr; 169 + u64 *iv = (u64 *)walk->iv; 170 + 171 + do { 172 + *dst = *src ^ *iv; 173 + blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); 174 + iv = dst; 175 + 176 + src += 1; 177 + dst += 1; 178 + nbytes -= bsize; 179 + } while (nbytes >= bsize); 180 + 181 + *(u64 *)walk->iv = *iv; 182 + return nbytes; 183 + } 184 + 185 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 186 + struct scatterlist *src, unsigned int nbytes) 
187 + { 188 + struct blkcipher_walk walk; 189 + int err; 190 + 191 + blkcipher_walk_init(&walk, dst, src, nbytes); 192 + err = blkcipher_walk_virt(desc, &walk); 193 + 194 + while ((nbytes = walk.nbytes)) { 195 + nbytes = __cbc_encrypt(desc, &walk); 196 + err = blkcipher_walk_done(desc, &walk, nbytes); 197 + } 198 + 199 + return err; 200 + } 201 + 202 + static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, 203 + struct blkcipher_walk *walk) 204 + { 205 + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 206 + const unsigned int bsize = BF_BLOCK_SIZE; 207 + unsigned int nbytes = walk->nbytes; 208 + u64 *src = (u64 *)walk->src.virt.addr; 209 + u64 *dst = (u64 *)walk->dst.virt.addr; 210 + u64 last_iv; 211 + int i; 212 + 213 + /* Start of the last block. */ 214 + src += nbytes / bsize - 1; 215 + dst += nbytes / bsize - 1; 216 + 217 + last_iv = *src; 218 + 219 + /* Process multi-block AVX2 batch */ 220 + if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { 221 + do { 222 + nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1); 223 + src -= BF_AVX2_PARALLEL_BLOCKS - 1; 224 + dst -= BF_AVX2_PARALLEL_BLOCKS - 1; 225 + 226 + blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src); 227 + 228 + nbytes -= bsize; 229 + if (nbytes < bsize) 230 + goto done; 231 + 232 + *dst ^= *(src - 1); 233 + src -= 1; 234 + dst -= 1; 235 + } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); 236 + 237 + if (nbytes < bsize) 238 + goto done; 239 + } 240 + 241 + /* Process multi-block batch */ 242 + if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { 243 + u64 ivs[BF_PARALLEL_BLOCKS - 1]; 244 + 245 + do { 246 + nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1); 247 + src -= BF_PARALLEL_BLOCKS - 1; 248 + dst -= BF_PARALLEL_BLOCKS - 1; 249 + 250 + for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) 251 + ivs[i] = src[i]; 252 + 253 + blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); 254 + 255 + for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++) 256 + dst[i + 1] ^= ivs[i]; 257 + 258 + nbytes -= bsize; 259 + if (nbytes < bsize) 260 + goto done; 261 + 262 + *dst ^= *(src - 1); 263 + src -= 1; 264 + dst -= 1; 265 + } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); 266 + 267 + if (nbytes < bsize) 268 + goto done; 269 + } 270 + 271 + /* Handle leftovers */ 272 + for (;;) { 273 + blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); 274 + 275 + nbytes -= bsize; 276 + if (nbytes < bsize) 277 + break; 278 + 279 + *dst ^= *(src - 1); 280 + src -= 1; 281 + dst -= 1; 282 + } 283 + 284 + done: 285 + *dst ^= *(u64 *)walk->iv; 286 + *(u64 *)walk->iv = last_iv; 287 + 288 + return nbytes; 289 + } 290 + 291 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 292 + struct scatterlist *src, unsigned int nbytes) 293 + { 294 + bool fpu_enabled = false; 295 + struct blkcipher_walk walk; 296 + int err; 297 + 298 + blkcipher_walk_init(&walk, dst, src, nbytes); 299 + err = blkcipher_walk_virt(desc, &walk); 300 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 301 + 302 + while ((nbytes = walk.nbytes)) { 303 + fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); 304 + nbytes = __cbc_decrypt(desc, &walk); 305 + err = blkcipher_walk_done(desc, &walk, nbytes); 306 + } 307 + 308 + bf_fpu_end(fpu_enabled); 309 + return err; 310 + } 311 + 312 + static void ctr_crypt_final(struct blkcipher_desc *desc, 313 + struct blkcipher_walk *walk) 314 + { 315 + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 316 + u8 *ctrblk = walk->iv; 317 + u8 keystream[BF_BLOCK_SIZE]; 318 + u8 *src = walk->src.virt.addr; 319 + u8 *dst = walk->dst.virt.addr; 320 + unsigned int nbytes = walk->nbytes; 321 
+ 322 + blowfish_enc_blk(ctx, keystream, ctrblk); 323 + crypto_xor(keystream, src, nbytes); 324 + memcpy(dst, keystream, nbytes); 325 + 326 + crypto_inc(ctrblk, BF_BLOCK_SIZE); 327 + } 328 + 329 + static unsigned int __ctr_crypt(struct blkcipher_desc *desc, 330 + struct blkcipher_walk *walk) 331 + { 332 + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 333 + unsigned int bsize = BF_BLOCK_SIZE; 334 + unsigned int nbytes = walk->nbytes; 335 + u64 *src = (u64 *)walk->src.virt.addr; 336 + u64 *dst = (u64 *)walk->dst.virt.addr; 337 + int i; 338 + 339 + /* Process multi-block AVX2 batch */ 340 + if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) { 341 + do { 342 + blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src, 343 + (__be64 *)walk->iv); 344 + 345 + src += BF_AVX2_PARALLEL_BLOCKS; 346 + dst += BF_AVX2_PARALLEL_BLOCKS; 347 + nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS; 348 + } while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS); 349 + 350 + if (nbytes < bsize) 351 + goto done; 352 + } 353 + 354 + /* Process four block batch */ 355 + if (nbytes >= bsize * BF_PARALLEL_BLOCKS) { 356 + __be64 ctrblocks[BF_PARALLEL_BLOCKS]; 357 + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); 358 + 359 + do { 360 + /* create ctrblks for parallel encrypt */ 361 + for (i = 0; i < BF_PARALLEL_BLOCKS; i++) { 362 + if (dst != src) 363 + dst[i] = src[i]; 364 + 365 + ctrblocks[i] = cpu_to_be64(ctrblk++); 366 + } 367 + 368 + blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, 369 + (u8 *)ctrblocks); 370 + 371 + src += BF_PARALLEL_BLOCKS; 372 + dst += BF_PARALLEL_BLOCKS; 373 + nbytes -= bsize * BF_PARALLEL_BLOCKS; 374 + } while (nbytes >= bsize * BF_PARALLEL_BLOCKS); 375 + 376 + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); 377 + 378 + if (nbytes < bsize) 379 + goto done; 380 + } 381 + 382 + /* Handle leftovers */ 383 + do { 384 + u64 ctrblk; 385 + 386 + if (dst != src) 387 + *dst = *src; 388 + 389 + ctrblk = *(u64 *)walk->iv; 390 + be64_add_cpu((__be64 *)walk->iv, 1); 391 + 392 + blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); 393 + 394 + src += 1; 395 + dst += 1; 396 + } while ((nbytes -= bsize) >= bsize); 397 + 398 + done: 399 + return nbytes; 400 + } 401 + 402 + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 403 + struct scatterlist *src, unsigned int nbytes) 404 + { 405 + bool fpu_enabled = false; 406 + struct blkcipher_walk walk; 407 + int err; 408 + 409 + blkcipher_walk_init(&walk, dst, src, nbytes); 410 + err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); 411 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 412 + 413 + while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { 414 + fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes); 415 + nbytes = __ctr_crypt(desc, &walk); 416 + err = blkcipher_walk_done(desc, &walk, nbytes); 417 + } 418 + 419 + bf_fpu_end(fpu_enabled); 420 + 421 + if (walk.nbytes) { 422 + ctr_crypt_final(desc, &walk); 423 + err = blkcipher_walk_done(desc, &walk, 0); 424 + } 425 + 426 + return err; 427 + } 428 + 429 + static struct crypto_alg bf_algs[6] = { { 430 + .cra_name = "__ecb-blowfish-avx2", 431 + .cra_driver_name = "__driver-ecb-blowfish-avx2", 432 + .cra_priority = 0, 433 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 434 + .cra_blocksize = BF_BLOCK_SIZE, 435 + .cra_ctxsize = sizeof(struct bf_ctx), 436 + .cra_alignmask = 0, 437 + .cra_type = &crypto_blkcipher_type, 438 + .cra_module = THIS_MODULE, 439 + .cra_u = { 440 + .blkcipher = { 441 + .min_keysize = BF_MIN_KEY_SIZE, 442 + .max_keysize = BF_MAX_KEY_SIZE, 443 + .setkey = blowfish_setkey, 444 + .encrypt = ecb_encrypt, 445 + .decrypt = 
ecb_decrypt, 446 + }, 447 + }, 448 + }, { 449 + .cra_name = "__cbc-blowfish-avx2", 450 + .cra_driver_name = "__driver-cbc-blowfish-avx2", 451 + .cra_priority = 0, 452 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 453 + .cra_blocksize = BF_BLOCK_SIZE, 454 + .cra_ctxsize = sizeof(struct bf_ctx), 455 + .cra_alignmask = 0, 456 + .cra_type = &crypto_blkcipher_type, 457 + .cra_module = THIS_MODULE, 458 + .cra_u = { 459 + .blkcipher = { 460 + .min_keysize = BF_MIN_KEY_SIZE, 461 + .max_keysize = BF_MAX_KEY_SIZE, 462 + .setkey = blowfish_setkey, 463 + .encrypt = cbc_encrypt, 464 + .decrypt = cbc_decrypt, 465 + }, 466 + }, 467 + }, { 468 + .cra_name = "__ctr-blowfish-avx2", 469 + .cra_driver_name = "__driver-ctr-blowfish-avx2", 470 + .cra_priority = 0, 471 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 472 + .cra_blocksize = 1, 473 + .cra_ctxsize = sizeof(struct bf_ctx), 474 + .cra_alignmask = 0, 475 + .cra_type = &crypto_blkcipher_type, 476 + .cra_module = THIS_MODULE, 477 + .cra_u = { 478 + .blkcipher = { 479 + .min_keysize = BF_MIN_KEY_SIZE, 480 + .max_keysize = BF_MAX_KEY_SIZE, 481 + .ivsize = BF_BLOCK_SIZE, 482 + .setkey = blowfish_setkey, 483 + .encrypt = ctr_crypt, 484 + .decrypt = ctr_crypt, 485 + }, 486 + }, 487 + }, { 488 + .cra_name = "ecb(blowfish)", 489 + .cra_driver_name = "ecb-blowfish-avx2", 490 + .cra_priority = 400, 491 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 492 + .cra_blocksize = BF_BLOCK_SIZE, 493 + .cra_ctxsize = sizeof(struct async_helper_ctx), 494 + .cra_alignmask = 0, 495 + .cra_type = &crypto_ablkcipher_type, 496 + .cra_module = THIS_MODULE, 497 + .cra_init = ablk_init, 498 + .cra_exit = ablk_exit, 499 + .cra_u = { 500 + .ablkcipher = { 501 + .min_keysize = BF_MIN_KEY_SIZE, 502 + .max_keysize = BF_MAX_KEY_SIZE, 503 + .setkey = ablk_set_key, 504 + .encrypt = ablk_encrypt, 505 + .decrypt = ablk_decrypt, 506 + }, 507 + }, 508 + }, { 509 + .cra_name = "cbc(blowfish)", 510 + .cra_driver_name = "cbc-blowfish-avx2", 511 + .cra_priority = 400, 512 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 513 + .cra_blocksize = BF_BLOCK_SIZE, 514 + .cra_ctxsize = sizeof(struct async_helper_ctx), 515 + .cra_alignmask = 0, 516 + .cra_type = &crypto_ablkcipher_type, 517 + .cra_module = THIS_MODULE, 518 + .cra_init = ablk_init, 519 + .cra_exit = ablk_exit, 520 + .cra_u = { 521 + .ablkcipher = { 522 + .min_keysize = BF_MIN_KEY_SIZE, 523 + .max_keysize = BF_MAX_KEY_SIZE, 524 + .ivsize = BF_BLOCK_SIZE, 525 + .setkey = ablk_set_key, 526 + .encrypt = __ablk_encrypt, 527 + .decrypt = ablk_decrypt, 528 + }, 529 + }, 530 + }, { 531 + .cra_name = "ctr(blowfish)", 532 + .cra_driver_name = "ctr-blowfish-avx2", 533 + .cra_priority = 400, 534 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 535 + .cra_blocksize = 1, 536 + .cra_ctxsize = sizeof(struct async_helper_ctx), 537 + .cra_alignmask = 0, 538 + .cra_type = &crypto_ablkcipher_type, 539 + .cra_module = THIS_MODULE, 540 + .cra_init = ablk_init, 541 + .cra_exit = ablk_exit, 542 + .cra_u = { 543 + .ablkcipher = { 544 + .min_keysize = BF_MIN_KEY_SIZE, 545 + .max_keysize = BF_MAX_KEY_SIZE, 546 + .ivsize = BF_BLOCK_SIZE, 547 + .setkey = ablk_set_key, 548 + .encrypt = ablk_encrypt, 549 + .decrypt = ablk_encrypt, 550 + .geniv = "chainiv", 551 + }, 552 + }, 553 + } }; 554 + 555 + 556 + static int __init init(void) 557 + { 558 + u64 xcr0; 559 + 560 + if (!cpu_has_avx2 || !cpu_has_osxsave) { 561 + pr_info("AVX2 instructions are not detected.\n"); 562 + return -ENODEV; 563 + } 564 + 565 + xcr0 = 
xgetbv(XCR_XFEATURE_ENABLED_MASK); 566 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { 567 + pr_info("AVX detected but unusable.\n"); 568 + return -ENODEV; 569 + } 570 + 571 + return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); 572 + } 573 + 574 + static void __exit fini(void) 575 + { 576 + crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); 577 + } 578 + 579 + module_init(init); 580 + module_exit(fini); 581 + 582 + MODULE_LICENSE("GPL"); 583 + MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized"); 584 + MODULE_ALIAS("blowfish"); 585 + MODULE_ALIAS("blowfish-asm");
+8 -24
arch/x86/crypto/blowfish_glue.c
···
  /*
   * Glue Code for assembler optimized version of Blowfish
   *
-  * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+  * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
   *
   * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
   * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
···
  #include <linux/module.h>
  #include <linux/types.h>
  #include <crypto/algapi.h>
+ #include <asm/crypto/blowfish.h>

  /* regular block cipher functions */
  asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
  				   bool xor);
+ EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
+
  asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+ EXPORT_SYMBOL_GPL(blowfish_dec_blk);

  /* 4-way parallel cipher functions */
  asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
  					const u8 *src, bool xor);
+ EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
+
  asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
  				      const u8 *src);
-
- static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
- {
- 	__blowfish_enc_blk(ctx, dst, src, false);
- }
-
- static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
- 					const u8 *src)
- {
- 	__blowfish_enc_blk(ctx, dst, src, true);
- }
-
- static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
- 					 const u8 *src)
- {
- 	__blowfish_enc_blk_4way(ctx, dst, src, false);
- }
-
- static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
- 					     const u8 *src)
- {
- 	__blowfish_enc_blk_4way(ctx, dst, src, true);
- }
+ EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);

  static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
  {
+179 -1
arch/x86/crypto/camellia-aesni-avx-asm_64.S
··· 1 1 /* 2 2 * x86_64/AVX/AES-NI assembler implementation of Camellia 3 3 * 4 - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License as published by ··· 589 589 .Lbswap128_mask: 590 590 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 591 591 592 + /* For XTS mode IV generation */ 593 + .Lxts_gf128mul_and_shl1_mask: 594 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 595 + 592 596 /* 593 597 * pre-SubByte transform 594 598 * ··· 1094 1090 1095 1091 ret; 1096 1092 ENDPROC(camellia_ctr_16way) 1093 + 1094 + #define gf128mul_x_ble(iv, mask, tmp) \ 1095 + vpsrad $31, iv, tmp; \ 1096 + vpaddq iv, iv, iv; \ 1097 + vpshufd $0x13, tmp, tmp; \ 1098 + vpand mask, tmp, tmp; \ 1099 + vpxor tmp, iv, iv; 1100 + 1101 + .align 8 1102 + camellia_xts_crypt_16way: 1103 + /* input: 1104 + * %rdi: ctx, CTX 1105 + * %rsi: dst (16 blocks) 1106 + * %rdx: src (16 blocks) 1107 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1108 + * %r8: index for input whitening key 1109 + * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16 1110 + */ 1111 + 1112 + subq $(16 * 16), %rsp; 1113 + movq %rsp, %rax; 1114 + 1115 + vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14; 1116 + 1117 + /* load IV */ 1118 + vmovdqu (%rcx), %xmm0; 1119 + vpxor 0 * 16(%rdx), %xmm0, %xmm15; 1120 + vmovdqu %xmm15, 15 * 16(%rax); 1121 + vmovdqu %xmm0, 0 * 16(%rsi); 1122 + 1123 + /* construct IVs */ 1124 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1125 + vpxor 1 * 16(%rdx), %xmm0, %xmm15; 1126 + vmovdqu %xmm15, 14 * 16(%rax); 1127 + vmovdqu %xmm0, 1 * 16(%rsi); 1128 + 1129 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1130 + vpxor 2 * 16(%rdx), %xmm0, %xmm13; 1131 + vmovdqu %xmm0, 2 * 16(%rsi); 1132 + 1133 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1134 + vpxor 3 * 16(%rdx), %xmm0, %xmm12; 1135 + vmovdqu %xmm0, 3 * 16(%rsi); 1136 + 1137 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1138 + vpxor 4 * 16(%rdx), %xmm0, %xmm11; 1139 + vmovdqu %xmm0, 4 * 16(%rsi); 1140 + 1141 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1142 + vpxor 5 * 16(%rdx), %xmm0, %xmm10; 1143 + vmovdqu %xmm0, 5 * 16(%rsi); 1144 + 1145 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1146 + vpxor 6 * 16(%rdx), %xmm0, %xmm9; 1147 + vmovdqu %xmm0, 6 * 16(%rsi); 1148 + 1149 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1150 + vpxor 7 * 16(%rdx), %xmm0, %xmm8; 1151 + vmovdqu %xmm0, 7 * 16(%rsi); 1152 + 1153 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1154 + vpxor 8 * 16(%rdx), %xmm0, %xmm7; 1155 + vmovdqu %xmm0, 8 * 16(%rsi); 1156 + 1157 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1158 + vpxor 9 * 16(%rdx), %xmm0, %xmm6; 1159 + vmovdqu %xmm0, 9 * 16(%rsi); 1160 + 1161 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1162 + vpxor 10 * 16(%rdx), %xmm0, %xmm5; 1163 + vmovdqu %xmm0, 10 * 16(%rsi); 1164 + 1165 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1166 + vpxor 11 * 16(%rdx), %xmm0, %xmm4; 1167 + vmovdqu %xmm0, 11 * 16(%rsi); 1168 + 1169 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1170 + vpxor 12 * 16(%rdx), %xmm0, %xmm3; 1171 + vmovdqu %xmm0, 12 * 16(%rsi); 1172 + 1173 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1174 + vpxor 13 * 16(%rdx), %xmm0, %xmm2; 1175 + vmovdqu %xmm0, 13 * 16(%rsi); 1176 + 1177 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1178 + vpxor 14 * 16(%rdx), %xmm0, %xmm1; 1179 + vmovdqu %xmm0, 14 * 16(%rsi); 1180 + 1181 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1182 + vpxor 15 * 16(%rdx), %xmm0, %xmm15; 
1183 + vmovdqu %xmm15, 0 * 16(%rax); 1184 + vmovdqu %xmm0, 15 * 16(%rsi); 1185 + 1186 + gf128mul_x_ble(%xmm0, %xmm14, %xmm15); 1187 + vmovdqu %xmm0, (%rcx); 1188 + 1189 + /* inpack16_pre: */ 1190 + vmovq (key_table)(CTX, %r8, 8), %xmm15; 1191 + vpshufb .Lpack_bswap, %xmm15, %xmm15; 1192 + vpxor 0 * 16(%rax), %xmm15, %xmm0; 1193 + vpxor %xmm1, %xmm15, %xmm1; 1194 + vpxor %xmm2, %xmm15, %xmm2; 1195 + vpxor %xmm3, %xmm15, %xmm3; 1196 + vpxor %xmm4, %xmm15, %xmm4; 1197 + vpxor %xmm5, %xmm15, %xmm5; 1198 + vpxor %xmm6, %xmm15, %xmm6; 1199 + vpxor %xmm7, %xmm15, %xmm7; 1200 + vpxor %xmm8, %xmm15, %xmm8; 1201 + vpxor %xmm9, %xmm15, %xmm9; 1202 + vpxor %xmm10, %xmm15, %xmm10; 1203 + vpxor %xmm11, %xmm15, %xmm11; 1204 + vpxor %xmm12, %xmm15, %xmm12; 1205 + vpxor %xmm13, %xmm15, %xmm13; 1206 + vpxor 14 * 16(%rax), %xmm15, %xmm14; 1207 + vpxor 15 * 16(%rax), %xmm15, %xmm15; 1208 + 1209 + call *%r9; 1210 + 1211 + addq $(16 * 16), %rsp; 1212 + 1213 + vpxor 0 * 16(%rsi), %xmm7, %xmm7; 1214 + vpxor 1 * 16(%rsi), %xmm6, %xmm6; 1215 + vpxor 2 * 16(%rsi), %xmm5, %xmm5; 1216 + vpxor 3 * 16(%rsi), %xmm4, %xmm4; 1217 + vpxor 4 * 16(%rsi), %xmm3, %xmm3; 1218 + vpxor 5 * 16(%rsi), %xmm2, %xmm2; 1219 + vpxor 6 * 16(%rsi), %xmm1, %xmm1; 1220 + vpxor 7 * 16(%rsi), %xmm0, %xmm0; 1221 + vpxor 8 * 16(%rsi), %xmm15, %xmm15; 1222 + vpxor 9 * 16(%rsi), %xmm14, %xmm14; 1223 + vpxor 10 * 16(%rsi), %xmm13, %xmm13; 1224 + vpxor 11 * 16(%rsi), %xmm12, %xmm12; 1225 + vpxor 12 * 16(%rsi), %xmm11, %xmm11; 1226 + vpxor 13 * 16(%rsi), %xmm10, %xmm10; 1227 + vpxor 14 * 16(%rsi), %xmm9, %xmm9; 1228 + vpxor 15 * 16(%rsi), %xmm8, %xmm8; 1229 + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, 1230 + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, 1231 + %xmm8, %rsi); 1232 + 1233 + ret; 1234 + ENDPROC(camellia_xts_crypt_16way) 1235 + 1236 + ENTRY(camellia_xts_enc_16way) 1237 + /* input: 1238 + * %rdi: ctx, CTX 1239 + * %rsi: dst (16 blocks) 1240 + * %rdx: src (16 blocks) 1241 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1242 + */ 1243 + xorl %r8d, %r8d; /* input whitening key, 0 for enc */ 1244 + 1245 + leaq __camellia_enc_blk16, %r9; 1246 + 1247 + jmp camellia_xts_crypt_16way; 1248 + ENDPROC(camellia_xts_enc_16way) 1249 + 1250 + ENTRY(camellia_xts_dec_16way) 1251 + /* input: 1252 + * %rdi: ctx, CTX 1253 + * %rsi: dst (16 blocks) 1254 + * %rdx: src (16 blocks) 1255 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1256 + */ 1257 + 1258 + cmpl $16, key_length(CTX); 1259 + movl $32, %r8d; 1260 + movl $24, %eax; 1261 + cmovel %eax, %r8d; /* input whitening key, last for dec */ 1262 + 1263 + leaq __camellia_dec_blk16, %r9; 1264 + 1265 + jmp camellia_xts_crypt_16way; 1266 + ENDPROC(camellia_xts_dec_16way)
+1368
arch/x86/crypto/camellia-aesni-avx2-asm_64.S
··· 1 + /* 2 + * x86_64/AVX2/AES-NI assembler implementation of Camellia 3 + * 4 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + 15 + #define CAMELLIA_TABLE_BYTE_LEN 272 16 + 17 + /* struct camellia_ctx: */ 18 + #define key_table 0 19 + #define key_length CAMELLIA_TABLE_BYTE_LEN 20 + 21 + /* register macros */ 22 + #define CTX %rdi 23 + #define RIO %r8 24 + 25 + /********************************************************************** 26 + helper macros 27 + **********************************************************************/ 28 + #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ 29 + vpand x, mask4bit, tmp0; \ 30 + vpandn x, mask4bit, x; \ 31 + vpsrld $4, x, x; \ 32 + \ 33 + vpshufb tmp0, lo_t, tmp0; \ 34 + vpshufb x, hi_t, x; \ 35 + vpxor tmp0, x, x; 36 + 37 + #define ymm0_x xmm0 38 + #define ymm1_x xmm1 39 + #define ymm2_x xmm2 40 + #define ymm3_x xmm3 41 + #define ymm4_x xmm4 42 + #define ymm5_x xmm5 43 + #define ymm6_x xmm6 44 + #define ymm7_x xmm7 45 + #define ymm8_x xmm8 46 + #define ymm9_x xmm9 47 + #define ymm10_x xmm10 48 + #define ymm11_x xmm11 49 + #define ymm12_x xmm12 50 + #define ymm13_x xmm13 51 + #define ymm14_x xmm14 52 + #define ymm15_x xmm15 53 + 54 + /* 55 + * AES-NI instructions do not support ymmX registers, so we need splitting and 56 + * merging. 57 + */ 58 + #define vaesenclast256(zero, yreg, tmp) \ 59 + vextracti128 $1, yreg, tmp##_x; \ 60 + vaesenclast zero##_x, yreg##_x, yreg##_x; \ 61 + vaesenclast zero##_x, tmp##_x, tmp##_x; \ 62 + vinserti128 $1, tmp##_x, yreg, yreg; 63 + 64 + /********************************************************************** 65 + 32-way camellia 66 + **********************************************************************/ 67 + 68 + /* 69 + * IN: 70 + * x0..x7: byte-sliced AB state 71 + * mem_cd: register pointer storing CD state 72 + * key: index for key material 73 + * OUT: 74 + * x0..x7: new byte-sliced CD state 75 + */ 76 + #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ 77 + t7, mem_cd, key) \ 78 + /* \ 79 + * S-function with AES subbytes \ 80 + */ \ 81 + vbroadcasti128 .Linv_shift_row, t4; \ 82 + vpbroadcastb .L0f0f0f0f, t7; \ 83 + vbroadcasti128 .Lpre_tf_lo_s1, t0; \ 84 + vbroadcasti128 .Lpre_tf_hi_s1, t1; \ 85 + \ 86 + /* AES inverse shift rows */ \ 87 + vpshufb t4, x0, x0; \ 88 + vpshufb t4, x7, x7; \ 89 + vpshufb t4, x1, x1; \ 90 + vpshufb t4, x4, x4; \ 91 + vpshufb t4, x2, x2; \ 92 + vpshufb t4, x5, x5; \ 93 + vpshufb t4, x3, x3; \ 94 + vpshufb t4, x6, x6; \ 95 + \ 96 + /* prefilter sboxes 1, 2 and 3 */ \ 97 + vbroadcasti128 .Lpre_tf_lo_s4, t2; \ 98 + vbroadcasti128 .Lpre_tf_hi_s4, t3; \ 99 + filter_8bit(x0, t0, t1, t7, t6); \ 100 + filter_8bit(x7, t0, t1, t7, t6); \ 101 + filter_8bit(x1, t0, t1, t7, t6); \ 102 + filter_8bit(x4, t0, t1, t7, t6); \ 103 + filter_8bit(x2, t0, t1, t7, t6); \ 104 + filter_8bit(x5, t0, t1, t7, t6); \ 105 + \ 106 + /* prefilter sbox 4 */ \ 107 + vpxor t4##_x, t4##_x, t4##_x; \ 108 + filter_8bit(x3, t2, t3, t7, t6); \ 109 + filter_8bit(x6, t2, t3, t7, t6); \ 110 + \ 111 + /* AES subbytes + AES shift rows */ \ 112 + vbroadcasti128 .Lpost_tf_lo_s1, t0; \ 113 + vbroadcasti128 .Lpost_tf_hi_s1, t1; \ 114 + vaesenclast256(t4, x0, t5); \ 
115 + vaesenclast256(t4, x7, t5); \ 116 + vaesenclast256(t4, x1, t5); \ 117 + vaesenclast256(t4, x4, t5); \ 118 + vaesenclast256(t4, x2, t5); \ 119 + vaesenclast256(t4, x5, t5); \ 120 + vaesenclast256(t4, x3, t5); \ 121 + vaesenclast256(t4, x6, t5); \ 122 + \ 123 + /* postfilter sboxes 1 and 4 */ \ 124 + vbroadcasti128 .Lpost_tf_lo_s3, t2; \ 125 + vbroadcasti128 .Lpost_tf_hi_s3, t3; \ 126 + filter_8bit(x0, t0, t1, t7, t6); \ 127 + filter_8bit(x7, t0, t1, t7, t6); \ 128 + filter_8bit(x3, t0, t1, t7, t6); \ 129 + filter_8bit(x6, t0, t1, t7, t6); \ 130 + \ 131 + /* postfilter sbox 3 */ \ 132 + vbroadcasti128 .Lpost_tf_lo_s2, t4; \ 133 + vbroadcasti128 .Lpost_tf_hi_s2, t5; \ 134 + filter_8bit(x2, t2, t3, t7, t6); \ 135 + filter_8bit(x5, t2, t3, t7, t6); \ 136 + \ 137 + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ 138 + \ 139 + /* postfilter sbox 2 */ \ 140 + filter_8bit(x1, t4, t5, t7, t2); \ 141 + filter_8bit(x4, t4, t5, t7, t2); \ 142 + \ 143 + vpsrldq $1, t0, t1; \ 144 + vpsrldq $2, t0, t2; \ 145 + vpsrldq $3, t0, t3; \ 146 + vpsrldq $4, t0, t4; \ 147 + vpsrldq $5, t0, t5; \ 148 + vpsrldq $6, t0, t6; \ 149 + vpsrldq $7, t0, t7; \ 150 + vpbroadcastb t0##_x, t0; \ 151 + vpbroadcastb t1##_x, t1; \ 152 + vpbroadcastb t2##_x, t2; \ 153 + vpbroadcastb t3##_x, t3; \ 154 + vpbroadcastb t4##_x, t4; \ 155 + vpbroadcastb t6##_x, t6; \ 156 + vpbroadcastb t5##_x, t5; \ 157 + vpbroadcastb t7##_x, t7; \ 158 + \ 159 + /* P-function */ \ 160 + vpxor x5, x0, x0; \ 161 + vpxor x6, x1, x1; \ 162 + vpxor x7, x2, x2; \ 163 + vpxor x4, x3, x3; \ 164 + \ 165 + vpxor x2, x4, x4; \ 166 + vpxor x3, x5, x5; \ 167 + vpxor x0, x6, x6; \ 168 + vpxor x1, x7, x7; \ 169 + \ 170 + vpxor x7, x0, x0; \ 171 + vpxor x4, x1, x1; \ 172 + vpxor x5, x2, x2; \ 173 + vpxor x6, x3, x3; \ 174 + \ 175 + vpxor x3, x4, x4; \ 176 + vpxor x0, x5, x5; \ 177 + vpxor x1, x6, x6; \ 178 + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ 179 + \ 180 + /* Add key material and result to CD (x becomes new CD) */ \ 181 + \ 182 + vpxor t7, x0, x0; \ 183 + vpxor 4 * 32(mem_cd), x0, x0; \ 184 + \ 185 + vpxor t6, x1, x1; \ 186 + vpxor 5 * 32(mem_cd), x1, x1; \ 187 + \ 188 + vpxor t5, x2, x2; \ 189 + vpxor 6 * 32(mem_cd), x2, x2; \ 190 + \ 191 + vpxor t4, x3, x3; \ 192 + vpxor 7 * 32(mem_cd), x3, x3; \ 193 + \ 194 + vpxor t3, x4, x4; \ 195 + vpxor 0 * 32(mem_cd), x4, x4; \ 196 + \ 197 + vpxor t2, x5, x5; \ 198 + vpxor 1 * 32(mem_cd), x5, x5; \ 199 + \ 200 + vpxor t1, x6, x6; \ 201 + vpxor 2 * 32(mem_cd), x6, x6; \ 202 + \ 203 + vpxor t0, x7, x7; \ 204 + vpxor 3 * 32(mem_cd), x7, x7; 205 + 206 + /* 207 + * Size optimization... with roundsm32 inlined, the binary would be over 208 + * 5 times larger and only marginally faster.
209 + */ 210 + .align 8 211 + roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: 212 + roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 213 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 214 + %rcx, (%r9)); 215 + ret; 216 + ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) 217 + 218 + .align 8 219 + roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: 220 + roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, 221 + %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, 222 + %rax, (%r9)); 223 + ret; 224 + ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) 225 + 226 + /* 227 + * IN/OUT: 228 + * x0..x7: byte-sliced AB state preloaded 229 + * mem_ab: byte-sliced AB state in memory 230 + * mem_cd: byte-sliced CD state in memory 231 + */ 232 + #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 233 + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ 234 + leaq (key_table + (i) * 8)(CTX), %r9; \ 235 + call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ 236 + \ 237 + vmovdqu x0, 4 * 32(mem_cd); \ 238 + vmovdqu x1, 5 * 32(mem_cd); \ 239 + vmovdqu x2, 6 * 32(mem_cd); \ 240 + vmovdqu x3, 7 * 32(mem_cd); \ 241 + vmovdqu x4, 0 * 32(mem_cd); \ 242 + vmovdqu x5, 1 * 32(mem_cd); \ 243 + vmovdqu x6, 2 * 32(mem_cd); \ 244 + vmovdqu x7, 3 * 32(mem_cd); \ 245 + \ 246 + leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ 247 + call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ 248 + \ 249 + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); 250 + 251 + #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ 252 + 253 + #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ 254 + /* Store new AB state */ \ 255 + vmovdqu x4, 4 * 32(mem_ab); \ 256 + vmovdqu x5, 5 * 32(mem_ab); \ 257 + vmovdqu x6, 6 * 32(mem_ab); \ 258 + vmovdqu x7, 7 * 32(mem_ab); \ 259 + vmovdqu x0, 0 * 32(mem_ab); \ 260 + vmovdqu x1, 1 * 32(mem_ab); \ 261 + vmovdqu x2, 2 * 32(mem_ab); \ 262 + vmovdqu x3, 3 * 32(mem_ab); 263 + 264 + #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 265 + y6, y7, mem_ab, mem_cd, i) \ 266 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 267 + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ 268 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 269 + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ 270 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 271 + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); 272 + 273 + #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 274 + y6, y7, mem_ab, mem_cd, i) \ 275 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 276 + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ 277 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 278 + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ 279 + two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 280 + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); 281 + 282 + /* 283 + * IN: 284 + * v0..3: byte-sliced 32-bit integers 285 + * OUT: 286 + * v0..3: (IN <<< 1) 287 + */ 288 + #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ 289 + vpcmpgtb v0, zero, t0; \ 290 + vpaddb v0, v0, v0; \ 291 + vpabsb t0, t0; \ 292 + \ 293 + vpcmpgtb v1, zero, t1; \ 294 + vpaddb v1, v1, v1; \ 295 + vpabsb t1, t1; \ 296 + \ 297 + vpcmpgtb v2,
zero, t2; \ 298 + vpaddb v2, v2, v2; \ 299 + vpabsb t2, t2; \ 300 + \ 301 + vpor t0, v1, v1; \ 302 + \ 303 + vpcmpgtb v3, zero, t0; \ 304 + vpaddb v3, v3, v3; \ 305 + vpabsb t0, t0; \ 306 + \ 307 + vpor t1, v2, v2; \ 308 + vpor t2, v3, v3; \ 309 + vpor t0, v0, v0; 310 + 311 + /* 312 + * IN: 313 + * l: byte-sliced AB state in memory 314 + * r: byte-sliced CD state in memory 315 + * OUT: 316 + * x0..x7: new byte-sliced CD state 317 + */ 318 + #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ 319 + tt1, tt2, tt3, kll, klr, krl, krr) \ 320 + /* \ 321 + * t0 = kll; \ 322 + * t0 &= ll; \ 323 + * lr ^= rol32(t0, 1); \ 324 + */ \ 325 + vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ 326 + vpxor tt0, tt0, tt0; \ 327 + vpbroadcastb t0##_x, t3; \ 328 + vpsrldq $1, t0, t0; \ 329 + vpbroadcastb t0##_x, t2; \ 330 + vpsrldq $1, t0, t0; \ 331 + vpbroadcastb t0##_x, t1; \ 332 + vpsrldq $1, t0, t0; \ 333 + vpbroadcastb t0##_x, t0; \ 334 + \ 335 + vpand l0, t0, t0; \ 336 + vpand l1, t1, t1; \ 337 + vpand l2, t2, t2; \ 338 + vpand l3, t3, t3; \ 339 + \ 340 + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ 341 + \ 342 + vpxor l4, t0, l4; \ 343 + vmovdqu l4, 4 * 32(l); \ 344 + vpxor l5, t1, l5; \ 345 + vmovdqu l5, 5 * 32(l); \ 346 + vpxor l6, t2, l6; \ 347 + vmovdqu l6, 6 * 32(l); \ 348 + vpxor l7, t3, l7; \ 349 + vmovdqu l7, 7 * 32(l); \ 350 + \ 351 + /* \ 352 + * t2 = krr; \ 353 + * t2 |= rr; \ 354 + * rl ^= t2; \ 355 + */ \ 356 + \ 357 + vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ 358 + vpbroadcastb t0##_x, t3; \ 359 + vpsrldq $1, t0, t0; \ 360 + vpbroadcastb t0##_x, t2; \ 361 + vpsrldq $1, t0, t0; \ 362 + vpbroadcastb t0##_x, t1; \ 363 + vpsrldq $1, t0, t0; \ 364 + vpbroadcastb t0##_x, t0; \ 365 + \ 366 + vpor 4 * 32(r), t0, t0; \ 367 + vpor 5 * 32(r), t1, t1; \ 368 + vpor 6 * 32(r), t2, t2; \ 369 + vpor 7 * 32(r), t3, t3; \ 370 + \ 371 + vpxor 0 * 32(r), t0, t0; \ 372 + vpxor 1 * 32(r), t1, t1; \ 373 + vpxor 2 * 32(r), t2, t2; \ 374 + vpxor 3 * 32(r), t3, t3; \ 375 + vmovdqu t0, 0 * 32(r); \ 376 + vmovdqu t1, 1 * 32(r); \ 377 + vmovdqu t2, 2 * 32(r); \ 378 + vmovdqu t3, 3 * 32(r); \ 379 + \ 380 + /* \ 381 + * t2 = krl; \ 382 + * t2 &= rl; \ 383 + * rr ^= rol32(t2, 1); \ 384 + */ \ 385 + vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ 386 + vpbroadcastb t0##_x, t3; \ 387 + vpsrldq $1, t0, t0; \ 388 + vpbroadcastb t0##_x, t2; \ 389 + vpsrldq $1, t0, t0; \ 390 + vpbroadcastb t0##_x, t1; \ 391 + vpsrldq $1, t0, t0; \ 392 + vpbroadcastb t0##_x, t0; \ 393 + \ 394 + vpand 0 * 32(r), t0, t0; \ 395 + vpand 1 * 32(r), t1, t1; \ 396 + vpand 2 * 32(r), t2, t2; \ 397 + vpand 3 * 32(r), t3, t3; \ 398 + \ 399 + rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ 400 + \ 401 + vpxor 4 * 32(r), t0, t0; \ 402 + vpxor 5 * 32(r), t1, t1; \ 403 + vpxor 6 * 32(r), t2, t2; \ 404 + vpxor 7 * 32(r), t3, t3; \ 405 + vmovdqu t0, 4 * 32(r); \ 406 + vmovdqu t1, 5 * 32(r); \ 407 + vmovdqu t2, 6 * 32(r); \ 408 + vmovdqu t3, 7 * 32(r); \ 409 + \ 410 + /* \ 411 + * t0 = klr; \ 412 + * t0 |= lr; \ 413 + * ll ^= t0; \ 414 + */ \ 415 + \ 416 + vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ 417 + vpbroadcastb t0##_x, t3; \ 418 + vpsrldq $1, t0, t0; \ 419 + vpbroadcastb t0##_x, t2; \ 420 + vpsrldq $1, t0, t0; \ 421 + vpbroadcastb t0##_x, t1; \ 422 + vpsrldq $1, t0, t0; \ 423 + vpbroadcastb t0##_x, t0; \ 424 + \ 425 + vpor l4, t0, t0; \ 426 + vpor l5, t1, t1; \ 427 + vpor l6, t2, t2; \ 428 + vpor l7, t3, t3; \ 429 + \ 430 + vpxor l0, t0, l0; \ 431 + vmovdqu l0, 0 * 32(l); \ 432 + vpxor l1,
t1, l1; \ 433 + vmovdqu l1, 1 * 32(l); \ 434 + vpxor l2, t2, l2; \ 435 + vmovdqu l2, 2 * 32(l); \ 436 + vpxor l3, t3, l3; \ 437 + vmovdqu l3, 3 * 32(l); 438 + 439 + #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ 440 + vpunpckhdq x1, x0, t2; \ 441 + vpunpckldq x1, x0, x0; \ 442 + \ 443 + vpunpckldq x3, x2, t1; \ 444 + vpunpckhdq x3, x2, x2; \ 445 + \ 446 + vpunpckhqdq t1, x0, x1; \ 447 + vpunpcklqdq t1, x0, x0; \ 448 + \ 449 + vpunpckhqdq x2, t2, x3; \ 450 + vpunpcklqdq x2, t2, x2; 451 + 452 + #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ 453 + a3, b3, c3, d3, st0, st1) \ 454 + vmovdqu d2, st0; \ 455 + vmovdqu d3, st1; \ 456 + transpose_4x4(a0, a1, a2, a3, d2, d3); \ 457 + transpose_4x4(b0, b1, b2, b3, d2, d3); \ 458 + vmovdqu st0, d2; \ 459 + vmovdqu st1, d3; \ 460 + \ 461 + vmovdqu a0, st0; \ 462 + vmovdqu a1, st1; \ 463 + transpose_4x4(c0, c1, c2, c3, a0, a1); \ 464 + transpose_4x4(d0, d1, d2, d3, a0, a1); \ 465 + \ 466 + vbroadcasti128 .Lshufb_16x16b, a0; \ 467 + vmovdqu st1, a1; \ 468 + vpshufb a0, a2, a2; \ 469 + vpshufb a0, a3, a3; \ 470 + vpshufb a0, b0, b0; \ 471 + vpshufb a0, b1, b1; \ 472 + vpshufb a0, b2, b2; \ 473 + vpshufb a0, b3, b3; \ 474 + vpshufb a0, a1, a1; \ 475 + vpshufb a0, c0, c0; \ 476 + vpshufb a0, c1, c1; \ 477 + vpshufb a0, c2, c2; \ 478 + vpshufb a0, c3, c3; \ 479 + vpshufb a0, d0, d0; \ 480 + vpshufb a0, d1, d1; \ 481 + vpshufb a0, d2, d2; \ 482 + vpshufb a0, d3, d3; \ 483 + vmovdqu d3, st1; \ 484 + vmovdqu st0, d3; \ 485 + vpshufb a0, d3, a0; \ 486 + vmovdqu d2, st0; \ 487 + \ 488 + transpose_4x4(a0, b0, c0, d0, d2, d3); \ 489 + transpose_4x4(a1, b1, c1, d1, d2, d3); \ 490 + vmovdqu st0, d2; \ 491 + vmovdqu st1, d3; \ 492 + \ 493 + vmovdqu b0, st0; \ 494 + vmovdqu b1, st1; \ 495 + transpose_4x4(a2, b2, c2, d2, b0, b1); \ 496 + transpose_4x4(a3, b3, c3, d3, b0, b1); \ 497 + vmovdqu st0, b0; \ 498 + vmovdqu st1, b1; \ 499 + /* does not adjust output bytes inside vectors */ 500 + 501 + /* load blocks to registers and apply pre-whitening */ 502 + #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 503 + y6, y7, rio, key) \ 504 + vpbroadcastq key, x0; \ 505 + vpshufb .Lpack_bswap, x0, x0; \ 506 + \ 507 + vpxor 0 * 32(rio), x0, y7; \ 508 + vpxor 1 * 32(rio), x0, y6; \ 509 + vpxor 2 * 32(rio), x0, y5; \ 510 + vpxor 3 * 32(rio), x0, y4; \ 511 + vpxor 4 * 32(rio), x0, y3; \ 512 + vpxor 5 * 32(rio), x0, y2; \ 513 + vpxor 6 * 32(rio), x0, y1; \ 514 + vpxor 7 * 32(rio), x0, y0; \ 515 + vpxor 8 * 32(rio), x0, x7; \ 516 + vpxor 9 * 32(rio), x0, x6; \ 517 + vpxor 10 * 32(rio), x0, x5; \ 518 + vpxor 11 * 32(rio), x0, x4; \ 519 + vpxor 12 * 32(rio), x0, x3; \ 520 + vpxor 13 * 32(rio), x0, x2; \ 521 + vpxor 14 * 32(rio), x0, x1; \ 522 + vpxor 15 * 32(rio), x0, x0; 523 + 524 + /* byteslice pre-whitened blocks and store to temporary memory */ 525 + #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 526 + y6, y7, mem_ab, mem_cd) \ 527 + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ 528 + y4, y5, y6, y7, (mem_ab), (mem_cd)); \ 529 + \ 530 + vmovdqu x0, 0 * 32(mem_ab); \ 531 + vmovdqu x1, 1 * 32(mem_ab); \ 532 + vmovdqu x2, 2 * 32(mem_ab); \ 533 + vmovdqu x3, 3 * 32(mem_ab); \ 534 + vmovdqu x4, 4 * 32(mem_ab); \ 535 + vmovdqu x5, 5 * 32(mem_ab); \ 536 + vmovdqu x6, 6 * 32(mem_ab); \ 537 + vmovdqu x7, 7 * 32(mem_ab); \ 538 + vmovdqu y0, 0 * 32(mem_cd); \ 539 + vmovdqu y1, 1 * 32(mem_cd); \ 540 + vmovdqu y2, 2 * 32(mem_cd); \ 541 + vmovdqu y3, 3 * 32(mem_cd); \ 542 + 
vmovdqu y4, 4 * 32(mem_cd); \ 543 + vmovdqu y5, 5 * 32(mem_cd); \ 544 + vmovdqu y6, 6 * 32(mem_cd); \ 545 + vmovdqu y7, 7 * 32(mem_cd); 546 + 547 + /* de-byteslice, apply post-whitening and store blocks */ 548 + #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ 549 + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ 550 + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ 551 + y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ 552 + \ 553 + vmovdqu x0, stack_tmp0; \ 554 + \ 555 + vpbroadcastq key, x0; \ 556 + vpshufb .Lpack_bswap, x0, x0; \ 557 + \ 558 + vpxor x0, y7, y7; \ 559 + vpxor x0, y6, y6; \ 560 + vpxor x0, y5, y5; \ 561 + vpxor x0, y4, y4; \ 562 + vpxor x0, y3, y3; \ 563 + vpxor x0, y2, y2; \ 564 + vpxor x0, y1, y1; \ 565 + vpxor x0, y0, y0; \ 566 + vpxor x0, x7, x7; \ 567 + vpxor x0, x6, x6; \ 568 + vpxor x0, x5, x5; \ 569 + vpxor x0, x4, x4; \ 570 + vpxor x0, x3, x3; \ 571 + vpxor x0, x2, x2; \ 572 + vpxor x0, x1, x1; \ 573 + vpxor stack_tmp0, x0, x0; 574 + 575 + #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 576 + y6, y7, rio) \ 577 + vmovdqu x0, 0 * 32(rio); \ 578 + vmovdqu x1, 1 * 32(rio); \ 579 + vmovdqu x2, 2 * 32(rio); \ 580 + vmovdqu x3, 3 * 32(rio); \ 581 + vmovdqu x4, 4 * 32(rio); \ 582 + vmovdqu x5, 5 * 32(rio); \ 583 + vmovdqu x6, 6 * 32(rio); \ 584 + vmovdqu x7, 7 * 32(rio); \ 585 + vmovdqu y0, 8 * 32(rio); \ 586 + vmovdqu y1, 9 * 32(rio); \ 587 + vmovdqu y2, 10 * 32(rio); \ 588 + vmovdqu y3, 11 * 32(rio); \ 589 + vmovdqu y4, 12 * 32(rio); \ 590 + vmovdqu y5, 13 * 32(rio); \ 591 + vmovdqu y6, 14 * 32(rio); \ 592 + vmovdqu y7, 15 * 32(rio); 593 + 594 + .data 595 + .align 32 596 + 597 + #define SHUFB_BYTES(idx) \ 598 + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) 599 + 600 + .Lshufb_16x16b: 601 + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) 602 + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) 603 + 604 + .Lpack_bswap: 605 + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 606 + .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 607 + 608 + /* For CTR-mode IV byteswap */ 609 + .Lbswap128_mask: 610 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 611 + 612 + /* For XTS mode */ 613 + .Lxts_gf128mul_and_shl1_mask_0: 614 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 615 + .Lxts_gf128mul_and_shl1_mask_1: 616 + .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 617 + 618 + /* 619 + * pre-SubByte transform 620 + * 621 + * pre-lookup for sbox1, sbox2, sbox3: 622 + * swap_bitendianness( 623 + * isom_map_camellia_to_aes( 624 + * camellia_f( 625 + * swap_bitendianness(in) 626 + * ) 627 + * ) 628 + * ) 629 + * 630 + * (note: '⊕ 0xc5' inside camellia_f()) 631 + */ 632 + .Lpre_tf_lo_s1: 633 + .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 634 + .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 635 + .Lpre_tf_hi_s1: 636 + .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a 637 + .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 638 + 639 + /* 640 + * pre-SubByte transform 641 + * 642 + * pre-lookup for sbox4: 643 + * swap_bitendianness( 644 + * isom_map_camellia_to_aes( 645 + * camellia_f( 646 + * swap_bitendianness(in <<< 1) 647 + * ) 648 + * ) 649 + * ) 650 + * 651 + * (note: '⊕ 0xc5' inside camellia_f()) 652 + */ 653 + .Lpre_tf_lo_s4: 654 + .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 655 + .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 656 + .Lpre_tf_hi_s4: 657 + .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
658 + .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf 659 + 660 + /* 661 + * post-SubByte transform 662 + * 663 + * post-lookup for sbox1, sbox4: 664 + * swap_bitendianness( 665 + * camellia_h( 666 + * isom_map_aes_to_camellia( 667 + * swap_bitendianness( 668 + * aes_inverse_affine_transform(in) 669 + * ) 670 + * ) 671 + * ) 672 + * ) 673 + * 674 + * (note: '⊕ 0x6e' inside camellia_h()) 675 + */ 676 + .Lpost_tf_lo_s1: 677 + .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 678 + .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 679 + .Lpost_tf_hi_s1: 680 + .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 681 + .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c 682 + 683 + /* 684 + * post-SubByte transform 685 + * 686 + * post-lookup for sbox2: 687 + * swap_bitendianness( 688 + * camellia_h( 689 + * isom_map_aes_to_camellia( 690 + * swap_bitendianness( 691 + * aes_inverse_affine_transform(in) 692 + * ) 693 + * ) 694 + * ) 695 + * ) <<< 1 696 + * 697 + * (note: '⊕ 0x6e' inside camellia_h()) 698 + */ 699 + .Lpost_tf_lo_s2: 700 + .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 701 + .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 702 + .Lpost_tf_hi_s2: 703 + .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 704 + .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 705 + 706 + /* 707 + * post-SubByte transform 708 + * 709 + * post-lookup for sbox3: 710 + * swap_bitendianness( 711 + * camellia_h( 712 + * isom_map_aes_to_camellia( 713 + * swap_bitendianness( 714 + * aes_inverse_affine_transform(in) 715 + * ) 716 + * ) 717 + * ) 718 + * ) >>> 1 719 + * 720 + * (note: '⊕ 0x6e' inside camellia_h()) 721 + */ 722 + .Lpost_tf_lo_s3: 723 + .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 724 + .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 725 + .Lpost_tf_hi_s3: 726 + .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 727 + .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 728 + 729 + /* For isolating SubBytes from AESENCLAST, inverse shift row */ 730 + .Linv_shift_row: 731 + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 732 + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 733 + 734 + .align 4 735 + /* 4-bit mask */ 736 + .L0f0f0f0f: 737 + .long 0x0f0f0f0f 738 + 739 + .text 740 + 741 + .align 8 742 + __camellia_enc_blk32: 743 + /* input: 744 + * %rdi: ctx, CTX 745 + * %rax: temporary storage, 512 bytes 746 + * %ymm0..%ymm15: 32 plaintext blocks 747 + * output: 748 + * %ymm0..%ymm15: 32 encrypted blocks, order swapped: 749 + * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 750 + */ 751 + 752 + leaq 8 * 32(%rax), %rcx; 753 + 754 + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 755 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 756 + %ymm15, %rax, %rcx); 757 + 758 + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 759 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 760 + %ymm15, %rax, %rcx, 0); 761 + 762 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 763 + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 764 + %ymm15, 765 + ((key_table + (8) * 8) + 0)(CTX), 766 + ((key_table + (8) * 8) + 4)(CTX), 767 + ((key_table + (8) * 8) + 8)(CTX), 768 + ((key_table + (8) * 8) + 12)(CTX)); 769 + 770 + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 771 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 772 + %ymm15, %rax, %rcx, 8); 773 + 774 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 775 + %rcx, %ymm8,
%ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 776 + %ymm15, 777 + ((key_table + (16) * 8) + 0)(CTX), 778 + ((key_table + (16) * 8) + 4)(CTX), 779 + ((key_table + (16) * 8) + 8)(CTX), 780 + ((key_table + (16) * 8) + 12)(CTX)); 781 + 782 + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 783 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 784 + %ymm15, %rax, %rcx, 16); 785 + 786 + movl $24, %r8d; 787 + cmpl $16, key_length(CTX); 788 + jne .Lenc_max32; 789 + 790 + .Lenc_done: 791 + /* load CD for output */ 792 + vmovdqu 0 * 32(%rcx), %ymm8; 793 + vmovdqu 1 * 32(%rcx), %ymm9; 794 + vmovdqu 2 * 32(%rcx), %ymm10; 795 + vmovdqu 3 * 32(%rcx), %ymm11; 796 + vmovdqu 4 * 32(%rcx), %ymm12; 797 + vmovdqu 5 * 32(%rcx), %ymm13; 798 + vmovdqu 6 * 32(%rcx), %ymm14; 799 + vmovdqu 7 * 32(%rcx), %ymm15; 800 + 801 + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 802 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 803 + %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); 804 + 805 + ret; 806 + 807 + .align 8 808 + .Lenc_max32: 809 + movl $32, %r8d; 810 + 811 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 812 + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 813 + %ymm15, 814 + ((key_table + (24) * 8) + 0)(CTX), 815 + ((key_table + (24) * 8) + 4)(CTX), 816 + ((key_table + (24) * 8) + 8)(CTX), 817 + ((key_table + (24) * 8) + 12)(CTX)); 818 + 819 + enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 820 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 821 + %ymm15, %rax, %rcx, 24); 822 + 823 + jmp .Lenc_done; 824 + ENDPROC(__camellia_enc_blk32) 825 + 826 + .align 8 827 + __camellia_dec_blk32: 828 + /* input: 829 + * %rdi: ctx, CTX 830 + * %rax: temporary storage, 512 bytes 831 + * %r8d: 24 for 16 byte key, 32 for larger 832 + * %ymm0..%ymm15: 32 encrypted blocks 833 + * output: 834 + * %ymm0..%ymm15: 32 plaintext blocks, order swapped: 835 + * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 836 + */ 837 + 838 + leaq 8 * 32(%rax), %rcx; 839 + 840 + inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 841 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 842 + %ymm15, %rax, %rcx); 843 + 844 + cmpl $32, %r8d; 845 + je .Ldec_max32; 846 + 847 + .Ldec_max24: 848 + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 849 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 850 + %ymm15, %rax, %rcx, 16); 851 + 852 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 853 + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 854 + %ymm15, 855 + ((key_table + (16) * 8) + 8)(CTX), 856 + ((key_table + (16) * 8) + 12)(CTX), 857 + ((key_table + (16) * 8) + 0)(CTX), 858 + ((key_table + (16) * 8) + 4)(CTX)); 859 + 860 + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 861 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 862 + %ymm15, %rax, %rcx, 8); 863 + 864 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 865 + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 866 + %ymm15, 867 + ((key_table + (8) * 8) + 8)(CTX), 868 + ((key_table + (8) * 8) + 12)(CTX), 869 + ((key_table + (8) * 8) + 0)(CTX), 870 + ((key_table + (8) * 8) + 4)(CTX)); 871 + 872 + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 873 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 874 + %ymm15, %rax, %rcx, 0); 875 + 876 + /* load CD for output */ 877 + vmovdqu 0 * 32(%rcx), %ymm8; 878 +
vmovdqu 1 * 32(%rcx), %ymm9; 879 + vmovdqu 2 * 32(%rcx), %ymm10; 880 + vmovdqu 3 * 32(%rcx), %ymm11; 881 + vmovdqu 4 * 32(%rcx), %ymm12; 882 + vmovdqu 5 * 32(%rcx), %ymm13; 883 + vmovdqu 6 * 32(%rcx), %ymm14; 884 + vmovdqu 7 * 32(%rcx), %ymm15; 885 + 886 + outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 887 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 888 + %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); 889 + 890 + ret; 891 + 892 + .align 8 893 + .Ldec_max32: 894 + dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 895 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 896 + %ymm15, %rax, %rcx, 24); 897 + 898 + fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 899 + %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 900 + %ymm15, 901 + ((key_table + (24) * 8) + 8)(CTX), 902 + ((key_table + (24) * 8) + 12)(CTX), 903 + ((key_table + (24) * 8) + 0)(CTX), 904 + ((key_table + (24) * 8) + 4)(CTX)); 905 + 906 + jmp .Ldec_max24; 907 + ENDPROC(__camellia_dec_blk32) 908 + 909 + ENTRY(camellia_ecb_enc_32way) 910 + /* input: 911 + * %rdi: ctx, CTX 912 + * %rsi: dst (32 blocks) 913 + * %rdx: src (32 blocks) 914 + */ 915 + 916 + vzeroupper; 917 + 918 + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 919 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 920 + %ymm15, %rdx, (key_table)(CTX)); 921 + 922 + /* now dst can be used as temporary buffer (even in src == dst case) */ 923 + movq %rsi, %rax; 924 + 925 + call __camellia_enc_blk32; 926 + 927 + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 928 + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 929 + %ymm8, %rsi); 930 + 931 + vzeroupper; 932 + 933 + ret; 934 + ENDPROC(camellia_ecb_enc_32way) 935 + 936 + ENTRY(camellia_ecb_dec_32way) 937 + /* input: 938 + * %rdi: ctx, CTX 939 + * %rsi: dst (32 blocks) 940 + * %rdx: src (32 blocks) 941 + */ 942 + 943 + vzeroupper; 944 + 945 + cmpl $16, key_length(CTX); 946 + movl $32, %r8d; 947 + movl $24, %eax; 948 + cmovel %eax, %r8d; /* max */ 949 + 950 + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 951 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 952 + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); 953 + 954 + /* now dst can be used as temporary buffer (even in src == dst case) */ 955 + movq %rsi, %rax; 956 + 957 + call __camellia_dec_blk32; 958 + 959 + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 960 + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 961 + %ymm8, %rsi); 962 + 963 + vzeroupper; 964 + 965 + ret; 966 + ENDPROC(camellia_ecb_dec_32way) 967 + 968 + ENTRY(camellia_cbc_dec_32way) 969 + /* input: 970 + * %rdi: ctx, CTX 971 + * %rsi: dst (32 blocks) 972 + * %rdx: src (32 blocks) 973 + */ 974 + 975 + vzeroupper; 976 + 977 + cmpl $16, key_length(CTX); 978 + movl $32, %r8d; 979 + movl $24, %eax; 980 + cmovel %eax, %r8d; /* max */ 981 + 982 + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 983 + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, 984 + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); 985 + 986 + movq %rsp, %r10; 987 + cmpq %rsi, %rdx; 988 + je .Lcbc_dec_use_stack; 989 + 990 + /* dst can be used as temporary storage, src is not overwritten. */ 991 + movq %rsi, %rax; 992 + jmp .Lcbc_dec_continue; 993 + 994 + .Lcbc_dec_use_stack: 995 + /* 996 + * dst still in-use (because dst == src), so use stack for temporary 997 + * storage. 
998 + */ 999 + subq $(16 * 32), %rsp; 1000 + movq %rsp, %rax; 1001 + 1002 + .Lcbc_dec_continue: 1003 + call __camellia_dec_blk32; 1004 + 1005 + vmovdqu %ymm7, (%rax); 1006 + vpxor %ymm7, %ymm7, %ymm7; 1007 + vinserti128 $1, (%rdx), %ymm7, %ymm7; 1008 + vpxor (%rax), %ymm7, %ymm7; 1009 + movq %r10, %rsp; 1010 + vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; 1011 + vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; 1012 + vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; 1013 + vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; 1014 + vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; 1015 + vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; 1016 + vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; 1017 + vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; 1018 + vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; 1019 + vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; 1020 + vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; 1021 + vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; 1022 + vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; 1023 + vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; 1024 + vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; 1025 + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 1026 + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 1027 + %ymm8, %rsi); 1028 + 1029 + vzeroupper; 1030 + 1031 + ret; 1032 + ENDPROC(camellia_cbc_dec_32way) 1033 + 1034 + #define inc_le128(x, minus_one, tmp) \ 1035 + vpcmpeqq minus_one, x, tmp; \ 1036 + vpsubq minus_one, x, x; \ 1037 + vpslldq $8, tmp, tmp; \ 1038 + vpsubq tmp, x, x; 1039 + 1040 + #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ 1041 + vpcmpeqq minus_one, x, tmp1; \ 1042 + vpcmpeqq minus_two, x, tmp2; \ 1043 + vpsubq minus_two, x, x; \ 1044 + vpor tmp2, tmp1, tmp1; \ 1045 + vpslldq $8, tmp1, tmp1; \ 1046 + vpsubq tmp1, x, x; 1047 + 1048 + ENTRY(camellia_ctr_32way) 1049 + /* input: 1050 + * %rdi: ctx, CTX 1051 + * %rsi: dst (32 blocks) 1052 + * %rdx: src (32 blocks) 1053 + * %rcx: iv (little endian, 128bit) 1054 + */ 1055 + 1056 + vzeroupper; 1057 + 1058 + movq %rsp, %r10; 1059 + cmpq %rsi, %rdx; 1060 + je .Lctr_use_stack; 1061 + 1062 + /* dst can be used as temporary storage, src is not overwritten. 
*/ 1063 + movq %rsi, %rax; 1064 + jmp .Lctr_continue; 1065 + 1066 + .Lctr_use_stack: 1067 + subq $(16 * 32), %rsp; 1068 + movq %rsp, %rax; 1069 + 1070 + .Lctr_continue: 1071 + vpcmpeqd %ymm15, %ymm15, %ymm15; 1072 + vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ 1073 + vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ 1074 + 1075 + /* load IV and byteswap */ 1076 + vmovdqu (%rcx), %xmm0; 1077 + vmovdqa %xmm0, %xmm1; 1078 + inc_le128(%xmm0, %xmm15, %xmm14); 1079 + vbroadcasti128 .Lbswap128_mask, %ymm14; 1080 + vinserti128 $1, %xmm0, %ymm1, %ymm0; 1081 + vpshufb %ymm14, %ymm0, %ymm13; 1082 + vmovdqu %ymm13, 15 * 32(%rax); 1083 + 1084 + /* construct IVs */ 1085 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ 1086 + vpshufb %ymm14, %ymm0, %ymm13; 1087 + vmovdqu %ymm13, 14 * 32(%rax); 1088 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1089 + vpshufb %ymm14, %ymm0, %ymm13; 1090 + vmovdqu %ymm13, 13 * 32(%rax); 1091 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1092 + vpshufb %ymm14, %ymm0, %ymm13; 1093 + vmovdqu %ymm13, 12 * 32(%rax); 1094 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1095 + vpshufb %ymm14, %ymm0, %ymm13; 1096 + vmovdqu %ymm13, 11 * 32(%rax); 1097 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1098 + vpshufb %ymm14, %ymm0, %ymm10; 1099 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1100 + vpshufb %ymm14, %ymm0, %ymm9; 1101 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1102 + vpshufb %ymm14, %ymm0, %ymm8; 1103 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1104 + vpshufb %ymm14, %ymm0, %ymm7; 1105 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1106 + vpshufb %ymm14, %ymm0, %ymm6; 1107 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1108 + vpshufb %ymm14, %ymm0, %ymm5; 1109 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1110 + vpshufb %ymm14, %ymm0, %ymm4; 1111 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1112 + vpshufb %ymm14, %ymm0, %ymm3; 1113 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1114 + vpshufb %ymm14, %ymm0, %ymm2; 1115 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1116 + vpshufb %ymm14, %ymm0, %ymm1; 1117 + add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); 1118 + vextracti128 $1, %ymm0, %xmm13; 1119 + vpshufb %ymm14, %ymm0, %ymm0; 1120 + inc_le128(%xmm13, %xmm15, %xmm14); 1121 + vmovdqu %xmm13, (%rcx); 1122 + 1123 + /* inpack32_pre: */ 1124 + vpbroadcastq (key_table)(CTX), %ymm15; 1125 + vpshufb .Lpack_bswap, %ymm15, %ymm15; 1126 + vpxor %ymm0, %ymm15, %ymm0; 1127 + vpxor %ymm1, %ymm15, %ymm1; 1128 + vpxor %ymm2, %ymm15, %ymm2; 1129 + vpxor %ymm3, %ymm15, %ymm3; 1130 + vpxor %ymm4, %ymm15, %ymm4; 1131 + vpxor %ymm5, %ymm15, %ymm5; 1132 + vpxor %ymm6, %ymm15, %ymm6; 1133 + vpxor %ymm7, %ymm15, %ymm7; 1134 + vpxor %ymm8, %ymm15, %ymm8; 1135 + vpxor %ymm9, %ymm15, %ymm9; 1136 + vpxor %ymm10, %ymm15, %ymm10; 1137 + vpxor 11 * 32(%rax), %ymm15, %ymm11; 1138 + vpxor 12 * 32(%rax), %ymm15, %ymm12; 1139 + vpxor 13 * 32(%rax), %ymm15, %ymm13; 1140 + vpxor 14 * 32(%rax), %ymm15, %ymm14; 1141 + vpxor 15 * 32(%rax), %ymm15, %ymm15; 1142 + 1143 + call __camellia_enc_blk32; 1144 + 1145 + movq %r10, %rsp; 1146 + 1147 + vpxor 0 * 32(%rdx), %ymm7, %ymm7; 1148 + vpxor 1 * 32(%rdx), %ymm6, %ymm6; 1149 + vpxor 2 * 32(%rdx), %ymm5, %ymm5; 1150 + vpxor 3 * 32(%rdx), %ymm4, %ymm4; 1151 + vpxor 4 * 32(%rdx), %ymm3, %ymm3; 1152 + vpxor 5 * 32(%rdx), %ymm2, %ymm2; 1153 + vpxor 6 * 32(%rdx), %ymm1, %ymm1; 1154 + vpxor 7 * 32(%rdx), %ymm0, %ymm0; 1155 + vpxor 8 * 
32(%rdx), %ymm15, %ymm15; 1156 + vpxor 9 * 32(%rdx), %ymm14, %ymm14; 1157 + vpxor 10 * 32(%rdx), %ymm13, %ymm13; 1158 + vpxor 11 * 32(%rdx), %ymm12, %ymm12; 1159 + vpxor 12 * 32(%rdx), %ymm11, %ymm11; 1160 + vpxor 13 * 32(%rdx), %ymm10, %ymm10; 1161 + vpxor 14 * 32(%rdx), %ymm9, %ymm9; 1162 + vpxor 15 * 32(%rdx), %ymm8, %ymm8; 1163 + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 1164 + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 1165 + %ymm8, %rsi); 1166 + 1167 + vzeroupper; 1168 + 1169 + ret; 1170 + ENDPROC(camellia_ctr_32way) 1171 + 1172 + #define gf128mul_x_ble(iv, mask, tmp) \ 1173 + vpsrad $31, iv, tmp; \ 1174 + vpaddq iv, iv, iv; \ 1175 + vpshufd $0x13, tmp, tmp; \ 1176 + vpand mask, tmp, tmp; \ 1177 + vpxor tmp, iv, iv; 1178 + 1179 + #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ 1180 + vpsrad $31, iv, tmp0; \ 1181 + vpaddq iv, iv, tmp1; \ 1182 + vpsllq $2, iv, iv; \ 1183 + vpshufd $0x13, tmp0, tmp0; \ 1184 + vpsrad $31, tmp1, tmp1; \ 1185 + vpand mask2, tmp0, tmp0; \ 1186 + vpshufd $0x13, tmp1, tmp1; \ 1187 + vpxor tmp0, iv, iv; \ 1188 + vpand mask1, tmp1, tmp1; \ 1189 + vpxor tmp1, iv, iv; 1190 + 1191 + .align 8 1192 + camellia_xts_crypt_32way: 1193 + /* input: 1194 + * %rdi: ctx, CTX 1195 + * %rsi: dst (32 blocks) 1196 + * %rdx: src (32 blocks) 1197 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1198 + * %r8: index for input whitening key 1199 + * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 1200 + */ 1201 + 1202 + vzeroupper; 1203 + 1204 + subq $(16 * 32), %rsp; 1205 + movq %rsp, %rax; 1206 + 1207 + vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; 1208 + 1209 + /* load IV and construct second IV */ 1210 + vmovdqu (%rcx), %xmm0; 1211 + vmovdqa %xmm0, %xmm15; 1212 + gf128mul_x_ble(%xmm0, %xmm12, %xmm13); 1213 + vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; 1214 + vinserti128 $1, %xmm0, %ymm15, %ymm0; 1215 + vpxor 0 * 32(%rdx), %ymm0, %ymm15; 1216 + vmovdqu %ymm15, 15 * 32(%rax); 1217 + vmovdqu %ymm0, 0 * 32(%rsi); 1218 + 1219 + /* construct IVs */ 1220 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1221 + vpxor 1 * 32(%rdx), %ymm0, %ymm15; 1222 + vmovdqu %ymm15, 14 * 32(%rax); 1223 + vmovdqu %ymm0, 1 * 32(%rsi); 1224 + 1225 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1226 + vpxor 2 * 32(%rdx), %ymm0, %ymm15; 1227 + vmovdqu %ymm15, 13 * 32(%rax); 1228 + vmovdqu %ymm0, 2 * 32(%rsi); 1229 + 1230 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1231 + vpxor 3 * 32(%rdx), %ymm0, %ymm15; 1232 + vmovdqu %ymm15, 12 * 32(%rax); 1233 + vmovdqu %ymm0, 3 * 32(%rsi); 1234 + 1235 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1236 + vpxor 4 * 32(%rdx), %ymm0, %ymm11; 1237 + vmovdqu %ymm0, 4 * 32(%rsi); 1238 + 1239 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1240 + vpxor 5 * 32(%rdx), %ymm0, %ymm10; 1241 + vmovdqu %ymm0, 5 * 32(%rsi); 1242 + 1243 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1244 + vpxor 6 * 32(%rdx), %ymm0, %ymm9; 1245 + vmovdqu %ymm0, 6 * 32(%rsi); 1246 + 1247 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1248 + vpxor 7 * 32(%rdx), %ymm0, %ymm8; 1249 + vmovdqu %ymm0, 7 * 32(%rsi); 1250 + 1251 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1252 + vpxor 8 * 32(%rdx), %ymm0, %ymm7; 1253 + vmovdqu %ymm0, 8 * 32(%rsi); 1254 + 1255 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1256 + vpxor 9 * 32(%rdx), %ymm0, %ymm6; 1257 + vmovdqu %ymm0, 9 * 32(%rsi); 1258 + 1259 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, 
%ymm15); 1260 + vpxor 10 * 32(%rdx), %ymm0, %ymm5; 1261 + vmovdqu %ymm0, 10 * 32(%rsi); 1262 + 1263 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1264 + vpxor 11 * 32(%rdx), %ymm0, %ymm4; 1265 + vmovdqu %ymm0, 11 * 32(%rsi); 1266 + 1267 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1268 + vpxor 12 * 32(%rdx), %ymm0, %ymm3; 1269 + vmovdqu %ymm0, 12 * 32(%rsi); 1270 + 1271 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1272 + vpxor 13 * 32(%rdx), %ymm0, %ymm2; 1273 + vmovdqu %ymm0, 13 * 32(%rsi); 1274 + 1275 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1276 + vpxor 14 * 32(%rdx), %ymm0, %ymm1; 1277 + vmovdqu %ymm0, 14 * 32(%rsi); 1278 + 1279 + gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); 1280 + vpxor 15 * 32(%rdx), %ymm0, %ymm15; 1281 + vmovdqu %ymm15, 0 * 32(%rax); 1282 + vmovdqu %ymm0, 15 * 32(%rsi); 1283 + 1284 + vextracti128 $1, %ymm0, %xmm0; 1285 + gf128mul_x_ble(%xmm0, %xmm12, %xmm15); 1286 + vmovdqu %xmm0, (%rcx); 1287 + 1288 + /* inpack32_pre: */ 1289 + vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; 1290 + vpshufb .Lpack_bswap, %ymm15, %ymm15; 1291 + vpxor 0 * 32(%rax), %ymm15, %ymm0; 1292 + vpxor %ymm1, %ymm15, %ymm1; 1293 + vpxor %ymm2, %ymm15, %ymm2; 1294 + vpxor %ymm3, %ymm15, %ymm3; 1295 + vpxor %ymm4, %ymm15, %ymm4; 1296 + vpxor %ymm5, %ymm15, %ymm5; 1297 + vpxor %ymm6, %ymm15, %ymm6; 1298 + vpxor %ymm7, %ymm15, %ymm7; 1299 + vpxor %ymm8, %ymm15, %ymm8; 1300 + vpxor %ymm9, %ymm15, %ymm9; 1301 + vpxor %ymm10, %ymm15, %ymm10; 1302 + vpxor %ymm11, %ymm15, %ymm11; 1303 + vpxor 12 * 32(%rax), %ymm15, %ymm12; 1304 + vpxor 13 * 32(%rax), %ymm15, %ymm13; 1305 + vpxor 14 * 32(%rax), %ymm15, %ymm14; 1306 + vpxor 15 * 32(%rax), %ymm15, %ymm15; 1307 + 1308 + call *%r9; 1309 + 1310 + addq $(16 * 32), %rsp; 1311 + 1312 + vpxor 0 * 32(%rsi), %ymm7, %ymm7; 1313 + vpxor 1 * 32(%rsi), %ymm6, %ymm6; 1314 + vpxor 2 * 32(%rsi), %ymm5, %ymm5; 1315 + vpxor 3 * 32(%rsi), %ymm4, %ymm4; 1316 + vpxor 4 * 32(%rsi), %ymm3, %ymm3; 1317 + vpxor 5 * 32(%rsi), %ymm2, %ymm2; 1318 + vpxor 6 * 32(%rsi), %ymm1, %ymm1; 1319 + vpxor 7 * 32(%rsi), %ymm0, %ymm0; 1320 + vpxor 8 * 32(%rsi), %ymm15, %ymm15; 1321 + vpxor 9 * 32(%rsi), %ymm14, %ymm14; 1322 + vpxor 10 * 32(%rsi), %ymm13, %ymm13; 1323 + vpxor 11 * 32(%rsi), %ymm12, %ymm12; 1324 + vpxor 12 * 32(%rsi), %ymm11, %ymm11; 1325 + vpxor 13 * 32(%rsi), %ymm10, %ymm10; 1326 + vpxor 14 * 32(%rsi), %ymm9, %ymm9; 1327 + vpxor 15 * 32(%rsi), %ymm8, %ymm8; 1328 + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, 1329 + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, 1330 + %ymm8, %rsi); 1331 + 1332 + vzeroupper; 1333 + 1334 + ret; 1335 + ENDPROC(camellia_xts_crypt_32way) 1336 + 1337 + ENTRY(camellia_xts_enc_32way) 1338 + /* input: 1339 + * %rdi: ctx, CTX 1340 + * %rsi: dst (32 blocks) 1341 + * %rdx: src (32 blocks) 1342 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1343 + */ 1344 + 1345 + xorl %r8d, %r8d; /* input whitening key, 0 for enc */ 1346 + 1347 + leaq __camellia_enc_blk32, %r9; 1348 + 1349 + jmp camellia_xts_crypt_32way; 1350 + ENDPROC(camellia_xts_enc_32way) 1351 + 1352 + ENTRY(camellia_xts_dec_32way) 1353 + /* input: 1354 + * %rdi: ctx, CTX 1355 + * %rsi: dst (32 blocks) 1356 + * %rdx: src (32 blocks) 1357 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 1358 + */ 1359 + 1360 + cmpl $16, key_length(CTX); 1361 + movl $32, %r8d; 1362 + movl $24, %eax; 1363 + cmovel %eax, %r8d; /* input whitening key, last for dec */ 1364 + 1365 + leaq __camellia_dec_blk32, %r9; 1366 + 1367 + jmp camellia_xts_crypt_32way; 1368 
+ ENDPROC(camellia_xts_dec_32way)
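The .Lxts_gf128mul_and_shl1_mask_0/_1 constants and the gf128mul_x_ble/gf128mul_x2_ble macros above implement the XTS tweak update: multiplication by α in GF(2¹²⁸), i.e. a one-bit (or two-bit) left shift of the 128-bit tweak with the reduction polynomial folded back in as 0x87 whenever a bit falls off the top. A minimal scalar C sketch of the single-step operation, assuming an illustrative little-endian two-qword layout rather than the kernel's le128 type:

#include <stdint.h>

/* 128-bit XTS tweak, stored little-endian as two 64-bit halves
 * (illustrative layout for this sketch, not the kernel's le128). */
struct xts_tweak {
	uint64_t lo;
	uint64_t hi;
};

/*
 * Multiply the tweak by x (alpha) in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1: shift the 128-bit value left by one
 * bit and, if a bit fell off the top, fold it back in as 0x87.
 */
static void gf128mul_x_sketch(struct xts_tweak *t)
{
	uint64_t carry_lo = t->lo >> 63;	/* bit moving into the high half */
	uint64_t carry_hi = t->hi >> 63;	/* bit falling off the top */

	t->hi = (t->hi << 1) | carry_lo;
	t->lo = (t->lo << 1) ^ (carry_hi * 0x87);
}

The vectorized gf128mul_x2_ble performs two such doublings per step (vpsllq $2 with folds for the top two bits), which is why it takes both mask constants while the single-step xmm variant only needs mask_0.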
+586
arch/x86/crypto/camellia_aesni_avx2_glue.c
··· 1 + /* 2 + * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia 3 + * 4 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/module.h> 14 + #include <linux/types.h> 15 + #include <linux/crypto.h> 16 + #include <linux/err.h> 17 + #include <crypto/algapi.h> 18 + #include <crypto/ctr.h> 19 + #include <crypto/lrw.h> 20 + #include <crypto/xts.h> 21 + #include <asm/xcr.h> 22 + #include <asm/xsave.h> 23 + #include <asm/crypto/camellia.h> 24 + #include <asm/crypto/ablk_helper.h> 25 + #include <asm/crypto/glue_helper.h> 26 + 27 + #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 28 + #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 29 + 30 + /* 32-way AVX2/AES-NI parallel cipher functions */ 31 + asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, 32 + const u8 *src); 33 + asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, 34 + const u8 *src); 35 + 36 + asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, 37 + const u8 *src); 38 + asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, 39 + const u8 *src, le128 *iv); 40 + 41 + asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, 42 + const u8 *src, le128 *iv); 43 + asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, 44 + const u8 *src, le128 *iv); 45 + 46 + static const struct common_glue_ctx camellia_enc = { 47 + .num_funcs = 4, 48 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 49 + 50 + .funcs = { { 51 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 52 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } 53 + }, { 54 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 55 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } 56 + }, { 57 + .num_blocks = 2, 58 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } 59 + }, { 60 + .num_blocks = 1, 61 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } 62 + } } 63 + }; 64 + 65 + static const struct common_glue_ctx camellia_ctr = { 66 + .num_funcs = 4, 67 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 68 + 69 + .funcs = { { 70 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 71 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } 72 + }, { 73 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 74 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } 75 + }, { 76 + .num_blocks = 2, 77 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } 78 + }, { 79 + .num_blocks = 1, 80 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } 81 + } } 82 + }; 83 + 84 + static const struct common_glue_ctx camellia_enc_xts = { 85 + .num_funcs = 3, 86 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 87 + 88 + .funcs = { { 89 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 90 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } 91 + }, { 92 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 93 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } 94 + }, { 95 + .num_blocks = 1, 96 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } 97 + } } 98 + }; 99 + 100 + static const struct common_glue_ctx camellia_dec = { 101 + .num_funcs = 4, 102 + .fpu_blocks_limit = 
CAMELLIA_AESNI_PARALLEL_BLOCKS, 103 + 104 + .funcs = { { 105 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 106 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } 107 + }, { 108 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 109 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } 110 + }, { 111 + .num_blocks = 2, 112 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } 113 + }, { 114 + .num_blocks = 1, 115 + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } 116 + } } 117 + }; 118 + 119 + static const struct common_glue_ctx camellia_dec_cbc = { 120 + .num_funcs = 4, 121 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 122 + 123 + .funcs = { { 124 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 125 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } 126 + }, { 127 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 128 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } 129 + }, { 130 + .num_blocks = 2, 131 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } 132 + }, { 133 + .num_blocks = 1, 134 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } 135 + } } 136 + }; 137 + 138 + static const struct common_glue_ctx camellia_dec_xts = { 139 + .num_funcs = 3, 140 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 141 + 142 + .funcs = { { 143 + .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, 144 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } 145 + }, { 146 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 147 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } 148 + }, { 149 + .num_blocks = 1, 150 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } 151 + } } 152 + }; 153 + 154 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 155 + struct scatterlist *src, unsigned int nbytes) 156 + { 157 + return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); 158 + } 159 + 160 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 161 + struct scatterlist *src, unsigned int nbytes) 162 + { 163 + return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); 164 + } 165 + 166 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 167 + struct scatterlist *src, unsigned int nbytes) 168 + { 169 + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, 170 + dst, src, nbytes); 171 + } 172 + 173 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 174 + struct scatterlist *src, unsigned int nbytes) 175 + { 176 + return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, 177 + nbytes); 178 + } 179 + 180 + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 181 + struct scatterlist *src, unsigned int nbytes) 182 + { 183 + return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); 184 + } 185 + 186 + static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) 187 + { 188 + return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, 189 + CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, 190 + nbytes); 191 + } 192 + 193 + static inline void camellia_fpu_end(bool fpu_enabled) 194 + { 195 + glue_fpu_end(fpu_enabled); 196 + } 197 + 198 + static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, 199 + unsigned int key_len) 200 + { 201 + return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, 202 + &tfm->crt_flags); 203 + } 204 + 205 + struct crypt_priv { 206 + struct camellia_ctx *ctx; 207 + bool fpu_enabled; 
208 + }; 209 + 210 + static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 211 + { 212 + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; 213 + struct crypt_priv *ctx = priv; 214 + int i; 215 + 216 + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); 217 + 218 + if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { 219 + camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); 220 + srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; 221 + nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; 222 + } 223 + 224 + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { 225 + camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); 226 + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; 227 + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; 228 + } 229 + 230 + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { 231 + camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); 232 + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; 233 + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; 234 + } 235 + 236 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 237 + camellia_enc_blk(ctx->ctx, srcdst, srcdst); 238 + } 239 + 240 + static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 241 + { 242 + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; 243 + struct crypt_priv *ctx = priv; 244 + int i; 245 + 246 + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); 247 + 248 + if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { 249 + camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); 250 + srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; 251 + nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; 252 + } 253 + 254 + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { 255 + camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); 256 + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; 257 + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; 258 + } 259 + 260 + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { 261 + camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); 262 + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; 263 + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; 264 + } 265 + 266 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 267 + camellia_dec_blk(ctx->ctx, srcdst, srcdst); 268 + } 269 + 270 + static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 271 + struct scatterlist *src, unsigned int nbytes) 272 + { 273 + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 274 + be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; 275 + struct crypt_priv crypt_ctx = { 276 + .ctx = &ctx->camellia_ctx, 277 + .fpu_enabled = false, 278 + }; 279 + struct lrw_crypt_req req = { 280 + .tbuf = buf, 281 + .tbuflen = sizeof(buf), 282 + 283 + .table_ctx = &ctx->lrw_table, 284 + .crypt_ctx = &crypt_ctx, 285 + .crypt_fn = encrypt_callback, 286 + }; 287 + int ret; 288 + 289 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 290 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 291 + camellia_fpu_end(crypt_ctx.fpu_enabled); 292 + 293 + return ret; 294 + } 295 + 296 + static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 297 + struct scatterlist *src, unsigned int nbytes) 298 + { 299 + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 300 + be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; 301 + struct crypt_priv crypt_ctx = { 302 + .ctx = &ctx->camellia_ctx, 303 + .fpu_enabled = false, 304 + }; 305 + struct lrw_crypt_req req = { 306 + .tbuf = buf, 307 + .tbuflen = sizeof(buf), 308 + 309 + .table_ctx = 
&ctx->lrw_table, 310 + .crypt_ctx = &crypt_ctx, 311 + .crypt_fn = decrypt_callback, 312 + }; 313 + int ret; 314 + 315 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 316 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 317 + camellia_fpu_end(crypt_ctx.fpu_enabled); 318 + 319 + return ret; 320 + } 321 + 322 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 323 + struct scatterlist *src, unsigned int nbytes) 324 + { 325 + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 326 + 327 + return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, 328 + XTS_TWEAK_CAST(camellia_enc_blk), 329 + &ctx->tweak_ctx, &ctx->crypt_ctx); 330 + } 331 + 332 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 333 + struct scatterlist *src, unsigned int nbytes) 334 + { 335 + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 336 + 337 + return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, 338 + XTS_TWEAK_CAST(camellia_enc_blk), 339 + &ctx->tweak_ctx, &ctx->crypt_ctx); 340 + } 341 + 342 + static struct crypto_alg cmll_algs[10] = { { 343 + .cra_name = "__ecb-camellia-aesni-avx2", 344 + .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", 345 + .cra_priority = 0, 346 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 347 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 348 + .cra_ctxsize = sizeof(struct camellia_ctx), 349 + .cra_alignmask = 0, 350 + .cra_type = &crypto_blkcipher_type, 351 + .cra_module = THIS_MODULE, 352 + .cra_u = { 353 + .blkcipher = { 354 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 355 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 356 + .setkey = camellia_setkey, 357 + .encrypt = ecb_encrypt, 358 + .decrypt = ecb_decrypt, 359 + }, 360 + }, 361 + }, { 362 + .cra_name = "__cbc-camellia-aesni-avx2", 363 + .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", 364 + .cra_priority = 0, 365 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 366 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 367 + .cra_ctxsize = sizeof(struct camellia_ctx), 368 + .cra_alignmask = 0, 369 + .cra_type = &crypto_blkcipher_type, 370 + .cra_module = THIS_MODULE, 371 + .cra_u = { 372 + .blkcipher = { 373 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 374 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 375 + .setkey = camellia_setkey, 376 + .encrypt = cbc_encrypt, 377 + .decrypt = cbc_decrypt, 378 + }, 379 + }, 380 + }, { 381 + .cra_name = "__ctr-camellia-aesni-avx2", 382 + .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", 383 + .cra_priority = 0, 384 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 385 + .cra_blocksize = 1, 386 + .cra_ctxsize = sizeof(struct camellia_ctx), 387 + .cra_alignmask = 0, 388 + .cra_type = &crypto_blkcipher_type, 389 + .cra_module = THIS_MODULE, 390 + .cra_u = { 391 + .blkcipher = { 392 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 393 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 394 + .ivsize = CAMELLIA_BLOCK_SIZE, 395 + .setkey = camellia_setkey, 396 + .encrypt = ctr_crypt, 397 + .decrypt = ctr_crypt, 398 + }, 399 + }, 400 + }, { 401 + .cra_name = "__lrw-camellia-aesni-avx2", 402 + .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", 403 + .cra_priority = 0, 404 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 405 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 406 + .cra_ctxsize = sizeof(struct camellia_lrw_ctx), 407 + .cra_alignmask = 0, 408 + .cra_type = &crypto_blkcipher_type, 409 + .cra_module = THIS_MODULE, 410 + .cra_exit = lrw_camellia_exit_tfm, 411 + .cra_u = { 412 + .blkcipher = { 413 + .min_keysize = CAMELLIA_MIN_KEY_SIZE + 414 + CAMELLIA_BLOCK_SIZE, 415 + 
.max_keysize = CAMELLIA_MAX_KEY_SIZE + 416 + CAMELLIA_BLOCK_SIZE, 417 + .ivsize = CAMELLIA_BLOCK_SIZE, 418 + .setkey = lrw_camellia_setkey, 419 + .encrypt = lrw_encrypt, 420 + .decrypt = lrw_decrypt, 421 + }, 422 + }, 423 + }, { 424 + .cra_name = "__xts-camellia-aesni-avx2", 425 + .cra_driver_name = "__driver-xts-camellia-aesni-avx2", 426 + .cra_priority = 0, 427 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 428 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 429 + .cra_ctxsize = sizeof(struct camellia_xts_ctx), 430 + .cra_alignmask = 0, 431 + .cra_type = &crypto_blkcipher_type, 432 + .cra_module = THIS_MODULE, 433 + .cra_u = { 434 + .blkcipher = { 435 + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, 436 + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, 437 + .ivsize = CAMELLIA_BLOCK_SIZE, 438 + .setkey = xts_camellia_setkey, 439 + .encrypt = xts_encrypt, 440 + .decrypt = xts_decrypt, 441 + }, 442 + }, 443 + }, { 444 + .cra_name = "ecb(camellia)", 445 + .cra_driver_name = "ecb-camellia-aesni-avx2", 446 + .cra_priority = 500, 447 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 448 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 449 + .cra_ctxsize = sizeof(struct async_helper_ctx), 450 + .cra_alignmask = 0, 451 + .cra_type = &crypto_ablkcipher_type, 452 + .cra_module = THIS_MODULE, 453 + .cra_init = ablk_init, 454 + .cra_exit = ablk_exit, 455 + .cra_u = { 456 + .ablkcipher = { 457 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 458 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 459 + .setkey = ablk_set_key, 460 + .encrypt = ablk_encrypt, 461 + .decrypt = ablk_decrypt, 462 + }, 463 + }, 464 + }, { 465 + .cra_name = "cbc(camellia)", 466 + .cra_driver_name = "cbc-camellia-aesni-avx2", 467 + .cra_priority = 500, 468 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 469 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 470 + .cra_ctxsize = sizeof(struct async_helper_ctx), 471 + .cra_alignmask = 0, 472 + .cra_type = &crypto_ablkcipher_type, 473 + .cra_module = THIS_MODULE, 474 + .cra_init = ablk_init, 475 + .cra_exit = ablk_exit, 476 + .cra_u = { 477 + .ablkcipher = { 478 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 479 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 480 + .ivsize = CAMELLIA_BLOCK_SIZE, 481 + .setkey = ablk_set_key, 482 + .encrypt = __ablk_encrypt, 483 + .decrypt = ablk_decrypt, 484 + }, 485 + }, 486 + }, { 487 + .cra_name = "ctr(camellia)", 488 + .cra_driver_name = "ctr-camellia-aesni-avx2", 489 + .cra_priority = 500, 490 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 491 + .cra_blocksize = 1, 492 + .cra_ctxsize = sizeof(struct async_helper_ctx), 493 + .cra_alignmask = 0, 494 + .cra_type = &crypto_ablkcipher_type, 495 + .cra_module = THIS_MODULE, 496 + .cra_init = ablk_init, 497 + .cra_exit = ablk_exit, 498 + .cra_u = { 499 + .ablkcipher = { 500 + .min_keysize = CAMELLIA_MIN_KEY_SIZE, 501 + .max_keysize = CAMELLIA_MAX_KEY_SIZE, 502 + .ivsize = CAMELLIA_BLOCK_SIZE, 503 + .setkey = ablk_set_key, 504 + .encrypt = ablk_encrypt, 505 + .decrypt = ablk_encrypt, 506 + .geniv = "chainiv", 507 + }, 508 + }, 509 + }, { 510 + .cra_name = "lrw(camellia)", 511 + .cra_driver_name = "lrw-camellia-aesni-avx2", 512 + .cra_priority = 500, 513 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 514 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 515 + .cra_ctxsize = sizeof(struct async_helper_ctx), 516 + .cra_alignmask = 0, 517 + .cra_type = &crypto_ablkcipher_type, 518 + .cra_module = THIS_MODULE, 519 + .cra_init = ablk_init, 520 + .cra_exit = ablk_exit, 521 + .cra_u = { 522 + .ablkcipher = { 523 + .min_keysize = 
CAMELLIA_MIN_KEY_SIZE + 524 + CAMELLIA_BLOCK_SIZE, 525 + .max_keysize = CAMELLIA_MAX_KEY_SIZE + 526 + CAMELLIA_BLOCK_SIZE, 527 + .ivsize = CAMELLIA_BLOCK_SIZE, 528 + .setkey = ablk_set_key, 529 + .encrypt = ablk_encrypt, 530 + .decrypt = ablk_decrypt, 531 + }, 532 + }, 533 + }, { 534 + .cra_name = "xts(camellia)", 535 + .cra_driver_name = "xts-camellia-aesni-avx2", 536 + .cra_priority = 500, 537 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 538 + .cra_blocksize = CAMELLIA_BLOCK_SIZE, 539 + .cra_ctxsize = sizeof(struct async_helper_ctx), 540 + .cra_alignmask = 0, 541 + .cra_type = &crypto_ablkcipher_type, 542 + .cra_module = THIS_MODULE, 543 + .cra_init = ablk_init, 544 + .cra_exit = ablk_exit, 545 + .cra_u = { 546 + .ablkcipher = { 547 + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, 548 + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, 549 + .ivsize = CAMELLIA_BLOCK_SIZE, 550 + .setkey = ablk_set_key, 551 + .encrypt = ablk_encrypt, 552 + .decrypt = ablk_decrypt, 553 + }, 554 + }, 555 + } }; 556 + 557 + static int __init camellia_aesni_init(void) 558 + { 559 + u64 xcr0; 560 + 561 + if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { 562 + pr_info("AVX2 or AES-NI instructions are not detected.\n"); 563 + return -ENODEV; 564 + } 565 + 566 + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 567 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { 568 + pr_info("AVX2 detected but unusable.\n"); 569 + return -ENODEV; 570 + } 571 + 572 + return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); 573 + } 574 + 575 + static void __exit camellia_aesni_fini(void) 576 + { 577 + crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); 578 + } 579 + 580 + module_init(camellia_aesni_init); 581 + module_exit(camellia_aesni_fini); 582 + 583 + MODULE_LICENSE("GPL"); 584 + MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); 585 + MODULE_ALIAS("camellia"); 586 + MODULE_ALIAS("camellia-asm");
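Annotation: camellia_aesni_init() gates registration on two separate conditions: the CPUID feature bits (AVX2, AVX, AES-NI, OSXSAVE) and the XCR0 register, which tells whether the OS actually enabled the SSE and YMM state components so YMM registers survive context switches. A minimal user-space sketch of the same gate, assuming a compiler that provides _xgetbv and __builtin_cpu_supports (recent GCC/Clang, built with -mxsave); the kernel's cpu_has_* / xgetbv() helpers differ, and the real init additionally checks AES-NI and AVX:

        #include <stdio.h>
        #include <stdint.h>
        #include <immintrin.h>

        static int ymm_enabled_by_os(void)
        {
                /* reads XCR0; only safe once OSXSAVE is known to be set,
                 * which __builtin_cpu_supports("avx2") implies in practice */
                uint64_t xcr0 = _xgetbv(0);     /* XCR_XFEATURE_ENABLED_MASK */

                return (xcr0 & 0x6) == 0x6;     /* XSTATE_SSE | XSTATE_YMM */
        }

        int main(void)
        {
                if (!__builtin_cpu_supports("avx2") || !ymm_enabled_by_os()) {
                        puts("AVX2 unusable; the module would return -ENODEV");
                        return 1;
                }
                puts("AVX2 usable; the module would register cmll_algs");
                return 0;
        }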
+62 -42
arch/x86/crypto/camellia_aesni_avx_glue.c
··· 1 1 /* 2 2 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia 3 3 * 4 - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License as published by ··· 26 26 27 27 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 28 28 29 - /* 16-way AES-NI parallel cipher functions */ 29 + /* 16-way parallel cipher functions (avx/aes-ni) */ 30 30 asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, 31 31 const u8 *src); 32 + EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); 33 + 32 34 asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, 33 35 const u8 *src); 36 + EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); 34 37 35 38 asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, 36 39 const u8 *src); 40 + EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); 41 + 37 42 asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, 38 43 const u8 *src, le128 *iv); 44 + EXPORT_SYMBOL_GPL(camellia_ctr_16way); 45 + 46 + asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, 47 + const u8 *src, le128 *iv); 48 + EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); 49 + 50 + asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, 51 + const u8 *src, le128 *iv); 52 + EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); 53 + 54 + void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) 55 + { 56 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 57 + GLUE_FUNC_CAST(camellia_enc_blk)); 58 + } 59 + EXPORT_SYMBOL_GPL(camellia_xts_enc); 60 + 61 + void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) 62 + { 63 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 64 + GLUE_FUNC_CAST(camellia_dec_blk)); 65 + } 66 + EXPORT_SYMBOL_GPL(camellia_xts_dec); 39 67 40 68 static const struct common_glue_ctx camellia_enc = { 41 69 .num_funcs = 3, ··· 97 69 } } 98 70 }; 99 71 72 + static const struct common_glue_ctx camellia_enc_xts = { 73 + .num_funcs = 2, 74 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 75 + 76 + .funcs = { { 77 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 78 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } 79 + }, { 80 + .num_blocks = 1, 81 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } 82 + } } 83 + }; 84 + 100 85 static const struct common_glue_ctx camellia_dec = { 101 86 .num_funcs = 3, 102 87 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, ··· 139 98 }, { 140 99 .num_blocks = 1, 141 100 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } 101 + } } 102 + }; 103 + 104 + static const struct common_glue_ctx camellia_dec_xts = { 105 + .num_funcs = 2, 106 + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, 107 + 108 + .funcs = { { 109 + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, 110 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } 111 + }, { 112 + .num_blocks = 1, 113 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } 142 114 } } 143 115 }; 144 116 ··· 315 261 struct scatterlist *src, unsigned int nbytes) 316 262 { 317 263 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 318 - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; 319 - struct crypt_priv crypt_ctx = { 320 - .ctx = &ctx->crypt_ctx, 321 - .fpu_enabled = false, 322 - }; 323 - struct xts_crypt_req req = { 324 - .tbuf = buf, 325 - .tbuflen = sizeof(buf), 326 264 327 - 
.tweak_ctx = &ctx->tweak_ctx, 328 - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), 329 - .crypt_ctx = &crypt_ctx, 330 - .crypt_fn = encrypt_callback, 331 - }; 332 - int ret; 333 - 334 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 335 - ret = xts_crypt(desc, dst, src, nbytes, &req); 336 - camellia_fpu_end(crypt_ctx.fpu_enabled); 337 - 338 - return ret; 265 + return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, 266 + XTS_TWEAK_CAST(camellia_enc_blk), 267 + &ctx->tweak_ctx, &ctx->crypt_ctx); 339 268 } 340 269 341 270 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 342 271 struct scatterlist *src, unsigned int nbytes) 343 272 { 344 273 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 345 - be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; 346 - struct crypt_priv crypt_ctx = { 347 - .ctx = &ctx->crypt_ctx, 348 - .fpu_enabled = false, 349 - }; 350 - struct xts_crypt_req req = { 351 - .tbuf = buf, 352 - .tbuflen = sizeof(buf), 353 274 354 - .tweak_ctx = &ctx->tweak_ctx, 355 - .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), 356 - .crypt_ctx = &crypt_ctx, 357 - .crypt_fn = decrypt_callback, 358 - }; 359 - int ret; 360 - 361 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 362 - ret = xts_crypt(desc, dst, src, nbytes, &req); 363 - camellia_fpu_end(crypt_ctx.fpu_enabled); 364 - 365 - return ret; 275 + return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, 276 + XTS_TWEAK_CAST(camellia_enc_blk), 277 + &ctx->tweak_ctx, &ctx->crypt_ctx); 366 278 } 367 279 368 280 static struct crypto_alg cmll_algs[10] = { {
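Annotation: note that both xts_encrypt() and xts_decrypt() above pass XTS_TWEAK_CAST(camellia_enc_blk) as the tweak function. That is not a typo: XTS defines the initial tweak as T0 = E(Key2, IV) with the forward cipher regardless of the data direction, so there is no "tweak decrypt" path. A hypothetical helper making that explicit (names are illustrative, not kernel API):

        #include <stdint.h>

        typedef void (*block_enc_t)(void *ctx, uint8_t *dst, const uint8_t *src);

        void xts_first_tweak(void *tweak_ctx, block_enc_t enc, uint8_t iv[16])
        {
                /* always the encryption direction, even when the payload is
                 * being decrypted; only the key (Key2, tweak_ctx) differs
                 * from the data key */
                enc(tweak_ctx, iv, iv);
        }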
+47 -1
arch/x86/crypto/cast6-avx-x86_64-asm_64.S
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 7 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 8 8 * 9 9 * This program is free software; you can redistribute it and/or modify 10 10 * it under the terms of the GNU General Public License as published by ··· 227 227 .data 228 228 229 229 .align 16 230 + .Lxts_gf128mul_and_shl1_mask: 231 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 230 232 .Lbswap_mask: 231 233 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 232 234 .Lbswap128_mask: ··· 426 424 427 425 ret; 428 426 ENDPROC(cast6_ctr_8way) 427 + 428 + ENTRY(cast6_xts_enc_8way) 429 + /* input: 430 + * %rdi: ctx, CTX 431 + * %rsi: dst 432 + * %rdx: src 433 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 434 + */ 435 + 436 + movq %rsi, %r11; 437 + 438 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 439 + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, 440 + RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); 441 + 442 + call __cast6_enc_blk8; 443 + 444 + /* dst <= regs xor IVs(in dst) */ 445 + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 446 + 447 + ret; 448 + ENDPROC(cast6_xts_enc_8way) 449 + 450 + ENTRY(cast6_xts_dec_8way) 451 + /* input: 452 + * %rdi: ctx, CTX 453 + * %rsi: dst 454 + * %rdx: src 455 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 456 + */ 457 + 458 + movq %rsi, %r11; 459 + 460 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 461 + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, 462 + RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask); 463 + 464 + call __cast6_dec_blk8; 465 + 466 + /* dst <= regs xor IVs(in dst) */ 467 + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 468 + 469 + ret; 470 + ENDPROC(cast6_xts_dec_8way)
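Annotation: the "regs <= src, dst <= IVs" comments describe a two-pass scheme: load_xts_8way parks the eight derived tweaks in the destination buffer while xoring them into the working registers, and store_xts_8way then reuses dst for the second xor, so no scratch memory is needed. A scalar sketch of the same flow, assuming the eight tweaks are already derived; cipher8 stands in for __cast6_enc_blk8 / __cast6_dec_blk8:

        #include <stdint.h>

        void xts_8way(void *ctx,
                      void (*cipher8)(void *ctx, uint8_t blocks[8][16]),
                      uint8_t dst[8][16], const uint8_t src[8][16],
                      const uint8_t tweaks[8][16])
        {
                uint8_t regs[8][16];
                int i, j;

                for (i = 0; i < 8; i++)         /* dst <= IVs, regs <= src xor IVs */
                        for (j = 0; j < 16; j++) {
                                dst[i][j] = tweaks[i][j];
                                regs[i][j] = src[i][j] ^ tweaks[i][j];
                        }

                cipher8(ctx, regs);             /* eight blocks in parallel */

                for (i = 0; i < 8; i++)         /* dst <= regs xor IVs(in dst) */
                        for (j = 0; j < 16; j++)
                                dst[i][j] ^= regs[i][j];
        }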
+51 -40
arch/x86/crypto/cast6_avx_glue.c
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 8 + * 7 9 * This program is free software; you can redistribute it and/or modify 8 10 * it under the terms of the GNU General Public License as published by 9 11 * the Free Software Foundation; either version 2 of the License, or ··· 52 50 asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, 53 51 le128 *iv); 54 52 53 + asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst, 54 + const u8 *src, le128 *iv); 55 + asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst, 56 + const u8 *src, le128 *iv); 57 + 58 + static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) 59 + { 60 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 61 + GLUE_FUNC_CAST(__cast6_encrypt)); 62 + } 63 + 64 + static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) 65 + { 66 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 67 + GLUE_FUNC_CAST(__cast6_decrypt)); 68 + } 69 + 55 70 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) 56 71 { 57 72 be128 ctrblk; ··· 106 87 } } 107 88 }; 108 89 90 + static const struct common_glue_ctx cast6_enc_xts = { 91 + .num_funcs = 2, 92 + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, 93 + 94 + .funcs = { { 95 + .num_blocks = CAST6_PARALLEL_BLOCKS, 96 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) } 97 + }, { 98 + .num_blocks = 1, 99 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) } 100 + } } 101 + }; 102 + 109 103 static const struct common_glue_ctx cast6_dec = { 110 104 .num_funcs = 2, 111 105 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, ··· 142 110 }, { 143 111 .num_blocks = 1, 144 112 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } 113 + } } 114 + }; 115 + 116 + static const struct common_glue_ctx cast6_dec_xts = { 117 + .num_funcs = 2, 118 + .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, 119 + 120 + .funcs = { { 121 + .num_blocks = CAST6_PARALLEL_BLOCKS, 122 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) } 123 + }, { 124 + .num_blocks = 1, 125 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) } 145 126 } } 146 127 }; 147 128 ··· 352 307 struct scatterlist *src, unsigned int nbytes) 353 308 { 354 309 struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 355 - be128 buf[CAST6_PARALLEL_BLOCKS]; 356 - struct crypt_priv crypt_ctx = { 357 - .ctx = &ctx->crypt_ctx, 358 - .fpu_enabled = false, 359 - }; 360 - struct xts_crypt_req req = { 361 - .tbuf = buf, 362 - .tbuflen = sizeof(buf), 363 310 364 - .tweak_ctx = &ctx->tweak_ctx, 365 - .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), 366 - .crypt_ctx = &crypt_ctx, 367 - .crypt_fn = encrypt_callback, 368 - }; 369 - int ret; 370 - 371 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 372 - ret = xts_crypt(desc, dst, src, nbytes, &req); 373 - cast6_fpu_end(crypt_ctx.fpu_enabled); 374 - 375 - return ret; 311 + return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes, 312 + XTS_TWEAK_CAST(__cast6_encrypt), 313 + &ctx->tweak_ctx, &ctx->crypt_ctx); 376 314 } 377 315 378 316 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 379 317 struct scatterlist *src, unsigned int nbytes) 380 318 { 381 319 struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 382 - be128 buf[CAST6_PARALLEL_BLOCKS]; 383 - struct crypt_priv crypt_ctx = { 384 - .ctx = &ctx->crypt_ctx, 385 - .fpu_enabled = false, 386 - }; 387 - struct 
xts_crypt_req req = { 388 - .tbuf = buf, 389 - .tbuflen = sizeof(buf), 390 320 391 - .tweak_ctx = &ctx->tweak_ctx, 392 - .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt), 393 - .crypt_ctx = &crypt_ctx, 394 - .crypt_fn = decrypt_callback, 395 - }; 396 - int ret; 397 - 398 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 399 - ret = xts_crypt(desc, dst, src, nbytes, &req); 400 - cast6_fpu_end(crypt_ctx.fpu_enabled); 401 - 402 - return ret; 321 + return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes, 322 + XTS_TWEAK_CAST(__cast6_encrypt), 323 + &ctx->tweak_ctx, &ctx->crypt_ctx); 403 324 } 404 325 405 326 static struct crypto_alg cast6_algs[10] = { {
+3 -3
arch/x86/crypto/crc32-pclmul_asm.S
··· 101 101 * uint crc32_pclmul_le_16(unsigned char const *buffer, 102 102 * size_t len, uint crc32) 103 103 */ 104 - .globl crc32_pclmul_le_16 105 - .align 4, 0x90 106 - crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ 104 + 105 + ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ 107 106 movdqa (BUF), %xmm1 108 107 movdqa 0x10(BUF), %xmm2 109 108 movdqa 0x20(BUF), %xmm3 ··· 243 244 pextrd $0x01, %xmm1, %eax 244 245 245 246 ret 247 + ENDPROC(crc32_pclmul_le_16)
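Annotation: a mechanical but useful cleanup. ENTRY() supplies the .globl and alignment that were previously open-coded, and ENDPROC() marks the symbol as a sized function so debuggers and tooling can see its extent. For reference, roughly what the <linux/linkage.h> helpers expand to on x86 (a sketch; the real macros go through ASM_NL/__ALIGN indirection and vary across kernel versions):

        #define ENTRY(name)             \
                .globl name;            \
                .align 4, 0x90;         \
                name:

        #define ENDPROC(name)           \
                .type name, @function;  \
                .size name, . - name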
+6 -4
arch/x86/crypto/crc32c-pcl-intel-asm_64.S
··· 1 1 /* 2 2 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) 3 3 * 4 - * The white paper on CRC32C calculations with PCLMULQDQ instruction can be 4 + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be 5 5 * downloaded from: 6 - * http://download.intel.com/design/intarch/papers/323405.pdf 6 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf 7 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf 7 8 * 8 9 * Copyright (C) 2012 Intel Corporation. 9 10 * ··· 43 42 * SOFTWARE. 44 43 */ 45 44 45 + #include <asm/inst.h> 46 46 #include <linux/linkage.h> 47 47 48 48 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction ··· 227 225 movdqa (bufp), %xmm0 # 2 consts: K1:K2 228 226 229 227 movq crc_init, %xmm1 # CRC for block 1 230 - pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 228 + PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 231 229 232 230 movq crc1, %xmm2 # CRC for block 2 233 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 231 + PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 234 232 235 233 pxor %xmm2,%xmm1 236 234 movq %xmm1, %rax
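Annotation: this routine splits the buffer into three streams, runs the crc32 instruction on each in parallel, and folds the partial CRCs together with carry-less multiplies (the K1/K2 PCLMULQDQ steps above); the PCLMULQDQ macro from <asm/inst.h> emits the raw opcode bytes so old binutils can still assemble it. A serial user-space sketch of the baseline being accelerated, assuming SSE4.2 intrinsics (-msse4.2); the three-stream folding is deliberately omitted:

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>
        #include <nmmintrin.h>  /* _mm_crc32_u64 / _mm_crc32_u8 */

        uint32_t crc32c_serial(uint32_t crc, const uint8_t *buf, size_t len)
        {
                uint64_t c = crc;

                for (; len >= 8; buf += 8, len -= 8) {
                        uint64_t v;

                        memcpy(&v, buf, 8);     /* avoid unaligned access UB */
                        c = _mm_crc32_u64(c, v);
                }
                while (len--)
                        c = _mm_crc32_u8((uint32_t)c, *buf++);
                return (uint32_t)c;
        }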
+60 -1
arch/x86/crypto/glue_helper-asm-avx.S
··· 1 1 /* 2 2 * Shared glue code for 128bit block ciphers, AVX assembler macros 3 3 * 4 - * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License as published by ··· 88 88 vpxor (5*16)(src), x5, x5; \ 89 89 vpxor (6*16)(src), x6, x6; \ 90 90 vpxor (7*16)(src), x7, x7; \ 91 + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); 92 + 93 + #define gf128mul_x_ble(iv, mask, tmp) \ 94 + vpsrad $31, iv, tmp; \ 95 + vpaddq iv, iv, iv; \ 96 + vpshufd $0x13, tmp, tmp; \ 97 + vpand mask, tmp, tmp; \ 98 + vpxor tmp, iv, iv; 99 + 100 + #define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ 101 + t1, xts_gf128mul_and_shl1_mask) \ 102 + vmovdqa xts_gf128mul_and_shl1_mask, t0; \ 103 + \ 104 + /* load IV */ \ 105 + vmovdqu (iv), tiv; \ 106 + vpxor (0*16)(src), tiv, x0; \ 107 + vmovdqu tiv, (0*16)(dst); \ 108 + \ 109 + /* construct and store IVs, also xor with source */ \ 110 + gf128mul_x_ble(tiv, t0, t1); \ 111 + vpxor (1*16)(src), tiv, x1; \ 112 + vmovdqu tiv, (1*16)(dst); \ 113 + \ 114 + gf128mul_x_ble(tiv, t0, t1); \ 115 + vpxor (2*16)(src), tiv, x2; \ 116 + vmovdqu tiv, (2*16)(dst); \ 117 + \ 118 + gf128mul_x_ble(tiv, t0, t1); \ 119 + vpxor (3*16)(src), tiv, x3; \ 120 + vmovdqu tiv, (3*16)(dst); \ 121 + \ 122 + gf128mul_x_ble(tiv, t0, t1); \ 123 + vpxor (4*16)(src), tiv, x4; \ 124 + vmovdqu tiv, (4*16)(dst); \ 125 + \ 126 + gf128mul_x_ble(tiv, t0, t1); \ 127 + vpxor (5*16)(src), tiv, x5; \ 128 + vmovdqu tiv, (5*16)(dst); \ 129 + \ 130 + gf128mul_x_ble(tiv, t0, t1); \ 131 + vpxor (6*16)(src), tiv, x6; \ 132 + vmovdqu tiv, (6*16)(dst); \ 133 + \ 134 + gf128mul_x_ble(tiv, t0, t1); \ 135 + vpxor (7*16)(src), tiv, x7; \ 136 + vmovdqu tiv, (7*16)(dst); \ 137 + \ 138 + gf128mul_x_ble(tiv, t0, t1); \ 139 + vmovdqu tiv, (iv); 140 + 141 + #define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ 142 + vpxor (0*16)(dst), x0, x0; \ 143 + vpxor (1*16)(dst), x1, x1; \ 144 + vpxor (2*16)(dst), x2, x2; \ 145 + vpxor (3*16)(dst), x3, x3; \ 146 + vpxor (4*16)(dst), x4, x4; \ 147 + vpxor (5*16)(dst), x5, x5; \ 148 + vpxor (6*16)(dst), x6, x6; \ 149 + vpxor (7*16)(dst), x7, x7; \ 91 150 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
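Annotation: gf128mul_x_ble is the XTS tweak update, multiplication by x in GF(2^128) under the little-endian ("ble") convention: shift the 128-bit value left by one and, if a bit fell off the top, fold it back in as 0x87 (since x^128 = x^7 + x^2 + x + 1 mod the XTS polynomial; hence .Lxts_gf128mul_and_shl1_mask). The vpsrad $31 / vpshufd / vpand sequence builds exactly that conditional fold without a branch. A scalar sketch with a sanity check:

        #include <assert.h>
        #include <stdint.h>

        /* tweak held as two little-endian 64-bit halves, t[1] is the high half */
        static void gf128mul_x_ble(uint64_t t[2])
        {
                uint64_t carry = t[1] >> 63;    /* bit 127, about to fall off */

                t[1] = (t[1] << 1) | (t[0] >> 63);
                t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
        }

        int main(void)
        {
                uint64_t t[2] = { 0, 1ULL << 63 };      /* only bit 127 set */

                gf128mul_x_ble(t);
                assert(t[0] == 0x87 && t[1] == 0);      /* folded back as 0x87 */
                return 0;
        }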
+180
arch/x86/crypto/glue_helper-asm-avx2.S
··· 1 + /* 2 + * Shared glue code for 128bit block ciphers, AVX2 assembler macros 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ 14 + vmovdqu (0*32)(src), x0; \ 15 + vmovdqu (1*32)(src), x1; \ 16 + vmovdqu (2*32)(src), x2; \ 17 + vmovdqu (3*32)(src), x3; \ 18 + vmovdqu (4*32)(src), x4; \ 19 + vmovdqu (5*32)(src), x5; \ 20 + vmovdqu (6*32)(src), x6; \ 21 + vmovdqu (7*32)(src), x7; 22 + 23 + #define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ 24 + vmovdqu x0, (0*32)(dst); \ 25 + vmovdqu x1, (1*32)(dst); \ 26 + vmovdqu x2, (2*32)(dst); \ 27 + vmovdqu x3, (3*32)(dst); \ 28 + vmovdqu x4, (4*32)(dst); \ 29 + vmovdqu x5, (5*32)(dst); \ 30 + vmovdqu x6, (6*32)(dst); \ 31 + vmovdqu x7, (7*32)(dst); 32 + 33 + #define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \ 34 + vpxor t0, t0, t0; \ 35 + vinserti128 $1, (src), t0, t0; \ 36 + vpxor t0, x0, x0; \ 37 + vpxor (0*32+16)(src), x1, x1; \ 38 + vpxor (1*32+16)(src), x2, x2; \ 39 + vpxor (2*32+16)(src), x3, x3; \ 40 + vpxor (3*32+16)(src), x4, x4; \ 41 + vpxor (4*32+16)(src), x5, x5; \ 42 + vpxor (5*32+16)(src), x6, x6; \ 43 + vpxor (6*32+16)(src), x7, x7; \ 44 + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); 45 + 46 + #define inc_le128(x, minus_one, tmp) \ 47 + vpcmpeqq minus_one, x, tmp; \ 48 + vpsubq minus_one, x, x; \ 49 + vpslldq $8, tmp, tmp; \ 50 + vpsubq tmp, x, x; 51 + 52 + #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ 53 + vpcmpeqq minus_one, x, tmp1; \ 54 + vpcmpeqq minus_two, x, tmp2; \ 55 + vpsubq minus_two, x, x; \ 56 + vpor tmp2, tmp1, tmp1; \ 57 + vpslldq $8, tmp1, tmp1; \ 58 + vpsubq tmp1, x, x; 59 + 60 + #define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \ 61 + t1x, t2, t2x, t3, t3x, t4, t5) \ 62 + vpcmpeqd t0, t0, t0; \ 63 + vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \ 64 + vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\ 65 + \ 66 + /* load IV and byteswap */ \ 67 + vmovdqu (iv), t2x; \ 68 + vmovdqa t2x, t3x; \ 69 + inc_le128(t2x, t0x, t1x); \ 70 + vbroadcasti128 bswap, t1; \ 71 + vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \ 72 + vpshufb t1, t2, x0; \ 73 + \ 74 + /* construct IVs */ \ 75 + add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \ 76 + vpshufb t1, t2, x1; \ 77 + add2_le128(t2, t0, t4, t3, t5); \ 78 + vpshufb t1, t2, x2; \ 79 + add2_le128(t2, t0, t4, t3, t5); \ 80 + vpshufb t1, t2, x3; \ 81 + add2_le128(t2, t0, t4, t3, t5); \ 82 + vpshufb t1, t2, x4; \ 83 + add2_le128(t2, t0, t4, t3, t5); \ 84 + vpshufb t1, t2, x5; \ 85 + add2_le128(t2, t0, t4, t3, t5); \ 86 + vpshufb t1, t2, x6; \ 87 + add2_le128(t2, t0, t4, t3, t5); \ 88 + vpshufb t1, t2, x7; \ 89 + vextracti128 $1, t2, t2x; \ 90 + inc_le128(t2x, t0x, t3x); \ 91 + vmovdqu t2x, (iv); 92 + 93 + #define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ 94 + vpxor (0*32)(src), x0, x0; \ 95 + vpxor (1*32)(src), x1, x1; \ 96 + vpxor (2*32)(src), x2, x2; \ 97 + vpxor (3*32)(src), x3, x3; \ 98 + vpxor (4*32)(src), x4, x4; \ 99 + vpxor (5*32)(src), x5, x5; \ 100 + vpxor (6*32)(src), x6, x6; \ 101 + vpxor (7*32)(src), x7, x7; \ 102 + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7); 103 + 104 + #define 
gf128mul_x_ble(iv, mask, tmp) \ 105 + vpsrad $31, iv, tmp; \ 106 + vpaddq iv, iv, iv; \ 107 + vpshufd $0x13, tmp, tmp; \ 108 + vpand mask, tmp, tmp; \ 109 + vpxor tmp, iv, iv; 110 + 111 + #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ 112 + vpsrad $31, iv, tmp0; \ 113 + vpaddq iv, iv, tmp1; \ 114 + vpsllq $2, iv, iv; \ 115 + vpshufd $0x13, tmp0, tmp0; \ 116 + vpsrad $31, tmp1, tmp1; \ 117 + vpand mask2, tmp0, tmp0; \ 118 + vpshufd $0x13, tmp1, tmp1; \ 119 + vpxor tmp0, iv, iv; \ 120 + vpand mask1, tmp1, tmp1; \ 121 + vpxor tmp1, iv, iv; 122 + 123 + #define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \ 124 + tivx, t0, t0x, t1, t1x, t2, t2x, t3, \ 125 + xts_gf128mul_and_shl1_mask_0, \ 126 + xts_gf128mul_and_shl1_mask_1) \ 127 + vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \ 128 + \ 129 + /* load IV and construct second IV */ \ 130 + vmovdqu (iv), tivx; \ 131 + vmovdqa tivx, t0x; \ 132 + gf128mul_x_ble(tivx, t1x, t2x); \ 133 + vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \ 134 + vinserti128 $1, tivx, t0, tiv; \ 135 + vpxor (0*32)(src), tiv, x0; \ 136 + vmovdqu tiv, (0*32)(dst); \ 137 + \ 138 + /* construct and store IVs, also xor with source */ \ 139 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 140 + vpxor (1*32)(src), tiv, x1; \ 141 + vmovdqu tiv, (1*32)(dst); \ 142 + \ 143 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 144 + vpxor (2*32)(src), tiv, x2; \ 145 + vmovdqu tiv, (2*32)(dst); \ 146 + \ 147 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 148 + vpxor (3*32)(src), tiv, x3; \ 149 + vmovdqu tiv, (3*32)(dst); \ 150 + \ 151 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 152 + vpxor (4*32)(src), tiv, x4; \ 153 + vmovdqu tiv, (4*32)(dst); \ 154 + \ 155 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 156 + vpxor (5*32)(src), tiv, x5; \ 157 + vmovdqu tiv, (5*32)(dst); \ 158 + \ 159 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 160 + vpxor (6*32)(src), tiv, x6; \ 161 + vmovdqu tiv, (6*32)(dst); \ 162 + \ 163 + gf128mul_x2_ble(tiv, t1, t2, t0, t3); \ 164 + vpxor (7*32)(src), tiv, x7; \ 165 + vmovdqu tiv, (7*32)(dst); \ 166 + \ 167 + vextracti128 $1, tiv, tivx; \ 168 + gf128mul_x_ble(tivx, t1x, t2x); \ 169 + vmovdqu tivx, (iv); 170 + 171 + #define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ 172 + vpxor (0*32)(dst), x0, x0; \ 173 + vpxor (1*32)(dst), x1, x1; \ 174 + vpxor (2*32)(dst), x2, x2; \ 175 + vpxor (3*32)(dst), x3, x3; \ 176 + vpxor (4*32)(dst), x4, x4; \ 177 + vpxor (5*32)(dst), x5, x5; \ 178 + vpxor (6*32)(dst), x6, x6; \ 179 + vpxor (7*32)(dst), x7, x7; \ 180 + store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
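Annotation: the AVX2 variant keeps two consecutive 128-bit blocks per YMM register, so neighbouring tweaks within a register differ by a factor of x^2; gf128mul_x2_ble performs that double-doubling in one step. The two bits shifted out of the top fold back as 0x87 and 0x87 << 1, which is why a second mask constant (.Lxts_gf128mul_and_shl1_mask_1) is needed. A scalar sketch demonstrating the equivalence with two single doublings:

        #include <assert.h>
        #include <stdint.h>

        static void gf128mul_x_ble(uint64_t t[2])       /* single step, as before */
        {
                uint64_t carry = t[1] >> 63;

                t[1] = (t[1] << 1) | (t[0] >> 63);
                t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
        }

        static void gf128mul_x2_ble(uint64_t t[2])      /* multiply by x^2 at once */
        {
                uint64_t carries = t[1] >> 62;  /* bit1 = old bit 127, bit0 = old bit 126 */

                t[1] = (t[1] << 2) | (t[0] >> 62);
                t[0] <<= 2;
                if (carries & 2)
                        t[0] ^= 0x87 << 1;      /* fold from the first doubling, shifted once */
                if (carries & 1)
                        t[0] ^= 0x87;           /* fold from the second doubling */
        }

        int main(void)
        {
                uint64_t a[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
                uint64_t b[2] = { a[0], a[1] };

                gf128mul_x_ble(a);
                gf128mul_x_ble(a);              /* two single steps ... */
                gf128mul_x2_ble(b);             /* ... match one x^2 step */
                assert(a[0] == b[0] && a[1] == b[1]);
                return 0;
        }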
+96 -1
arch/x86/crypto/glue_helper.c
··· 1 1 /* 2 2 * Shared glue code for 128bit block ciphers 3 3 * 4 - * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 5 * 6 6 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: 7 7 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> ··· 303 303 return err; 304 304 } 305 305 EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); 306 + 307 + static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, 308 + void *ctx, 309 + struct blkcipher_desc *desc, 310 + struct blkcipher_walk *walk) 311 + { 312 + const unsigned int bsize = 128 / 8; 313 + unsigned int nbytes = walk->nbytes; 314 + u128 *src = (u128 *)walk->src.virt.addr; 315 + u128 *dst = (u128 *)walk->dst.virt.addr; 316 + unsigned int num_blocks, func_bytes; 317 + unsigned int i; 318 + 319 + /* Process multi-block batch */ 320 + for (i = 0; i < gctx->num_funcs; i++) { 321 + num_blocks = gctx->funcs[i].num_blocks; 322 + func_bytes = bsize * num_blocks; 323 + 324 + if (nbytes >= func_bytes) { 325 + do { 326 + gctx->funcs[i].fn_u.xts(ctx, dst, src, 327 + (le128 *)walk->iv); 328 + 329 + src += num_blocks; 330 + dst += num_blocks; 331 + nbytes -= func_bytes; 332 + } while (nbytes >= func_bytes); 333 + 334 + if (nbytes < bsize) 335 + goto done; 336 + } 337 + } 338 + 339 + done: 340 + return nbytes; 341 + } 342 + 343 + /* for implementations implementing faster XTS IV generator */ 344 + int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, 345 + struct blkcipher_desc *desc, struct scatterlist *dst, 346 + struct scatterlist *src, unsigned int nbytes, 347 + void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), 348 + void *tweak_ctx, void *crypt_ctx) 349 + { 350 + const unsigned int bsize = 128 / 8; 351 + bool fpu_enabled = false; 352 + struct blkcipher_walk walk; 353 + int err; 354 + 355 + blkcipher_walk_init(&walk, dst, src, nbytes); 356 + 357 + err = blkcipher_walk_virt(desc, &walk); 358 + nbytes = walk.nbytes; 359 + if (!nbytes) 360 + return err; 361 + 362 + /* set minimum length to bsize, for tweak_fn */ 363 + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, 364 + desc, fpu_enabled, 365 + nbytes < bsize ? bsize : nbytes); 366 + 367 + /* calculate first value of T */ 368 + tweak_fn(tweak_ctx, walk.iv, walk.iv); 369 + 370 + while (nbytes) { 371 + nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); 372 + 373 + err = blkcipher_walk_done(desc, &walk, nbytes); 374 + nbytes = walk.nbytes; 375 + } 376 + 377 + glue_fpu_end(fpu_enabled); 378 + 379 + return err; 380 + } 381 + EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); 382 + 383 + void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, 384 + common_glue_func_t fn) 385 + { 386 + le128 ivblk = *iv; 387 + 388 + /* generate next IV */ 389 + le128_gf128mul_x_ble(iv, &ivblk); 390 + 391 + /* CC <- T xor C */ 392 + u128_xor(dst, src, (u128 *)&ivblk); 393 + 394 + /* PP <- D(Key2,CC) */ 395 + fn(ctx, (u8 *)dst, (u8 *)dst); 396 + 397 + /* P <- T xor PP */ 398 + u128_xor(dst, dst, (u128 *)&ivblk); 399 + } 400 + EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); 306 401 307 402 MODULE_LICENSE("GPL");
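Annotation: glue_xts_crypt_128bit() computes T0 once via tweak_fn, then __glue_xts_crypt_128bit() walks a table of batch widths, widest first (16-way AVX2, then 8-way AVX, then one block), and each fn_u.xts handler both processes its batch and advances walk->iv as a side effect. A sketch of just that tiered dispatch, with the scatterlist walk and FPU bracketing stripped away; struct xts_func is an illustrative stand-in for common_glue_ctx.funcs:

        #include <stddef.h>
        #include <stdint.h>

        struct xts_func {
                unsigned int num_blocks;
                void (*fn)(void *ctx, uint8_t *dst, const uint8_t *src,
                           uint64_t iv[2]);     /* handles num_blocks, bumps iv */
        };

        size_t xts_dispatch(const struct xts_func *funcs, unsigned int nfuncs,
                            void *ctx, uint8_t *dst, const uint8_t *src,
                            size_t nbytes, uint64_t iv[2])
        {
                const size_t bsize = 16;        /* 128-bit blocks */
                unsigned int i;

                for (i = 0; i < nfuncs; i++) {
                        size_t func_bytes = bsize * funcs[i].num_blocks;

                        while (nbytes >= func_bytes) {
                                funcs[i].fn(ctx, dst, src, iv);
                                src += func_bytes;
                                dst += func_bytes;
                                nbytes -= func_bytes;
                        }
                }
                return nbytes;                  /* anything below bsize remains */
        }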
+43 -2
arch/x86/crypto/serpent-avx-x86_64-asm_64.S
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 - * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by 8 - * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 7 + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 9 8 * 10 9 * This program is free software; you can redistribute it and/or modify 11 10 * it under the terms of the GNU General Public License as published by ··· 33 34 34 35 .Lbswap128_mask: 35 36 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 37 + .Lxts_gf128mul_and_shl1_mask: 38 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 36 39 37 40 .text 38 41 ··· 740 739 741 740 ret; 742 741 ENDPROC(serpent_ctr_8way_avx) 742 + 743 + ENTRY(serpent_xts_enc_8way_avx) 744 + /* input: 745 + * %rdi: ctx, CTX 746 + * %rsi: dst 747 + * %rdx: src 748 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 749 + */ 750 + 751 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 752 + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, 753 + RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); 754 + 755 + call __serpent_enc_blk8_avx; 756 + 757 + /* dst <= regs xor IVs(in dst) */ 758 + store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 759 + 760 + ret; 761 + ENDPROC(serpent_xts_enc_8way_avx) 762 + 763 + ENTRY(serpent_xts_dec_8way_avx) 764 + /* input: 765 + * %rdi: ctx, CTX 766 + * %rsi: dst 767 + * %rdx: src 768 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 769 + */ 770 + 771 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 772 + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, 773 + RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask); 774 + 775 + call __serpent_dec_blk8_avx; 776 + 777 + /* dst <= regs xor IVs(in dst) */ 778 + store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); 779 + 780 + ret; 781 + ENDPROC(serpent_xts_dec_8way_avx)
+800
arch/x86/crypto/serpent-avx2-asm_64.S
··· 1 + /* 2 + * x86_64/AVX2 assembler optimized version of Serpent 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * Based on AVX assembler implementation of Serpent by: 7 + * Copyright © 2012 Johannes Goetzfried 8 + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 9 + * 10 + * This program is free software; you can redistribute it and/or modify 11 + * it under the terms of the GNU General Public License as published by 12 + * the Free Software Foundation; either version 2 of the License, or 13 + * (at your option) any later version. 14 + * 15 + */ 16 + 17 + #include <linux/linkage.h> 18 + #include "glue_helper-asm-avx2.S" 19 + 20 + .file "serpent-avx2-asm_64.S" 21 + 22 + .data 23 + .align 16 24 + 25 + .Lbswap128_mask: 26 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 27 + .Lxts_gf128mul_and_shl1_mask_0: 28 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 29 + .Lxts_gf128mul_and_shl1_mask_1: 30 + .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 31 + 32 + .text 33 + 34 + #define CTX %rdi 35 + 36 + #define RNOT %ymm0 37 + #define tp %ymm1 38 + 39 + #define RA1 %ymm2 40 + #define RA2 %ymm3 41 + #define RB1 %ymm4 42 + #define RB2 %ymm5 43 + #define RC1 %ymm6 44 + #define RC2 %ymm7 45 + #define RD1 %ymm8 46 + #define RD2 %ymm9 47 + #define RE1 %ymm10 48 + #define RE2 %ymm11 49 + 50 + #define RK0 %ymm12 51 + #define RK1 %ymm13 52 + #define RK2 %ymm14 53 + #define RK3 %ymm15 54 + 55 + #define RK0x %xmm12 56 + #define RK1x %xmm13 57 + #define RK2x %xmm14 58 + #define RK3x %xmm15 59 + 60 + #define S0_1(x0, x1, x2, x3, x4) \ 61 + vpor x0, x3, tp; \ 62 + vpxor x3, x0, x0; \ 63 + vpxor x2, x3, x4; \ 64 + vpxor RNOT, x4, x4; \ 65 + vpxor x1, tp, x3; \ 66 + vpand x0, x1, x1; \ 67 + vpxor x4, x1, x1; \ 68 + vpxor x0, x2, x2; 69 + #define S0_2(x0, x1, x2, x3, x4) \ 70 + vpxor x3, x0, x0; \ 71 + vpor x0, x4, x4; \ 72 + vpxor x2, x0, x0; \ 73 + vpand x1, x2, x2; \ 74 + vpxor x2, x3, x3; \ 75 + vpxor RNOT, x1, x1; \ 76 + vpxor x4, x2, x2; \ 77 + vpxor x2, x1, x1; 78 + 79 + #define S1_1(x0, x1, x2, x3, x4) \ 80 + vpxor x0, x1, tp; \ 81 + vpxor x3, x0, x0; \ 82 + vpxor RNOT, x3, x3; \ 83 + vpand tp, x1, x4; \ 84 + vpor tp, x0, x0; \ 85 + vpxor x2, x3, x3; \ 86 + vpxor x3, x0, x0; \ 87 + vpxor x3, tp, x1; 88 + #define S1_2(x0, x1, x2, x3, x4) \ 89 + vpxor x4, x3, x3; \ 90 + vpor x4, x1, x1; \ 91 + vpxor x2, x4, x4; \ 92 + vpand x0, x2, x2; \ 93 + vpxor x1, x2, x2; \ 94 + vpor x0, x1, x1; \ 95 + vpxor RNOT, x0, x0; \ 96 + vpxor x2, x0, x0; \ 97 + vpxor x1, x4, x4; 98 + 99 + #define S2_1(x0, x1, x2, x3, x4) \ 100 + vpxor RNOT, x3, x3; \ 101 + vpxor x0, x1, x1; \ 102 + vpand x2, x0, tp; \ 103 + vpxor x3, tp, tp; \ 104 + vpor x0, x3, x3; \ 105 + vpxor x1, x2, x2; \ 106 + vpxor x1, x3, x3; \ 107 + vpand tp, x1, x1; 108 + #define S2_2(x0, x1, x2, x3, x4) \ 109 + vpxor x2, tp, tp; \ 110 + vpand x3, x2, x2; \ 111 + vpor x1, x3, x3; \ 112 + vpxor RNOT, tp, tp; \ 113 + vpxor tp, x3, x3; \ 114 + vpxor tp, x0, x4; \ 115 + vpxor x2, tp, x0; \ 116 + vpor x2, x1, x1; 117 + 118 + #define S3_1(x0, x1, x2, x3, x4) \ 119 + vpxor x3, x1, tp; \ 120 + vpor x0, x3, x3; \ 121 + vpand x0, x1, x4; \ 122 + vpxor x2, x0, x0; \ 123 + vpxor tp, x2, x2; \ 124 + vpand x3, tp, x1; \ 125 + vpxor x3, x2, x2; \ 126 + vpor x4, x0, x0; \ 127 + vpxor x3, x4, x4; 128 + #define S3_2(x0, x1, x2, x3, x4) \ 129 + vpxor x0, x1, x1; \ 130 + vpand x3, x0, x0; \ 131 + vpand x4, x3, x3; \ 132 + vpxor x2, x3, x3; \ 133 + vpor x1, x4, x4; \ 134 + vpand x1, x2, x2; \ 135 + vpxor x3, x4, 
x4; \ 136 + vpxor x3, x0, x0; \ 137 + vpxor x2, x3, x3; 138 + 139 + #define S4_1(x0, x1, x2, x3, x4) \ 140 + vpand x0, x3, tp; \ 141 + vpxor x3, x0, x0; \ 142 + vpxor x2, tp, tp; \ 143 + vpor x3, x2, x2; \ 144 + vpxor x1, x0, x0; \ 145 + vpxor tp, x3, x4; \ 146 + vpor x0, x2, x2; \ 147 + vpxor x1, x2, x2; 148 + #define S4_2(x0, x1, x2, x3, x4) \ 149 + vpand x0, x1, x1; \ 150 + vpxor x4, x1, x1; \ 151 + vpand x2, x4, x4; \ 152 + vpxor tp, x2, x2; \ 153 + vpxor x0, x4, x4; \ 154 + vpor x1, tp, x3; \ 155 + vpxor RNOT, x1, x1; \ 156 + vpxor x0, x3, x3; 157 + 158 + #define S5_1(x0, x1, x2, x3, x4) \ 159 + vpor x0, x1, tp; \ 160 + vpxor tp, x2, x2; \ 161 + vpxor RNOT, x3, x3; \ 162 + vpxor x0, x1, x4; \ 163 + vpxor x2, x0, x0; \ 164 + vpand x4, tp, x1; \ 165 + vpor x3, x4, x4; \ 166 + vpxor x0, x4, x4; 167 + #define S5_2(x0, x1, x2, x3, x4) \ 168 + vpand x3, x0, x0; \ 169 + vpxor x3, x1, x1; \ 170 + vpxor x2, x3, x3; \ 171 + vpxor x1, x0, x0; \ 172 + vpand x4, x2, x2; \ 173 + vpxor x2, x1, x1; \ 174 + vpand x0, x2, x2; \ 175 + vpxor x2, x3, x3; 176 + 177 + #define S6_1(x0, x1, x2, x3, x4) \ 178 + vpxor x0, x3, x3; \ 179 + vpxor x2, x1, tp; \ 180 + vpxor x0, x2, x2; \ 181 + vpand x3, x0, x0; \ 182 + vpor x3, tp, tp; \ 183 + vpxor RNOT, x1, x4; \ 184 + vpxor tp, x0, x0; \ 185 + vpxor x2, tp, x1; 186 + #define S6_2(x0, x1, x2, x3, x4) \ 187 + vpxor x4, x3, x3; \ 188 + vpxor x0, x4, x4; \ 189 + vpand x0, x2, x2; \ 190 + vpxor x1, x4, x4; \ 191 + vpxor x3, x2, x2; \ 192 + vpand x1, x3, x3; \ 193 + vpxor x0, x3, x3; \ 194 + vpxor x2, x1, x1; 195 + 196 + #define S7_1(x0, x1, x2, x3, x4) \ 197 + vpxor RNOT, x1, tp; \ 198 + vpxor RNOT, x0, x0; \ 199 + vpand x2, tp, x1; \ 200 + vpxor x3, x1, x1; \ 201 + vpor tp, x3, x3; \ 202 + vpxor x2, tp, x4; \ 203 + vpxor x3, x2, x2; \ 204 + vpxor x0, x3, x3; \ 205 + vpor x1, x0, x0; 206 + #define S7_2(x0, x1, x2, x3, x4) \ 207 + vpand x0, x2, x2; \ 208 + vpxor x4, x0, x0; \ 209 + vpxor x3, x4, x4; \ 210 + vpand x0, x3, x3; \ 211 + vpxor x1, x4, x4; \ 212 + vpxor x4, x2, x2; \ 213 + vpxor x1, x3, x3; \ 214 + vpor x0, x4, x4; \ 215 + vpxor x1, x4, x4; 216 + 217 + #define SI0_1(x0, x1, x2, x3, x4) \ 218 + vpxor x0, x1, x1; \ 219 + vpor x1, x3, tp; \ 220 + vpxor x1, x3, x4; \ 221 + vpxor RNOT, x0, x0; \ 222 + vpxor tp, x2, x2; \ 223 + vpxor x0, tp, x3; \ 224 + vpand x1, x0, x0; \ 225 + vpxor x2, x0, x0; 226 + #define SI0_2(x0, x1, x2, x3, x4) \ 227 + vpand x3, x2, x2; \ 228 + vpxor x4, x3, x3; \ 229 + vpxor x3, x2, x2; \ 230 + vpxor x3, x1, x1; \ 231 + vpand x0, x3, x3; \ 232 + vpxor x0, x1, x1; \ 233 + vpxor x2, x0, x0; \ 234 + vpxor x3, x4, x4; 235 + 236 + #define SI1_1(x0, x1, x2, x3, x4) \ 237 + vpxor x3, x1, x1; \ 238 + vpxor x2, x0, tp; \ 239 + vpxor RNOT, x2, x2; \ 240 + vpor x1, x0, x4; \ 241 + vpxor x3, x4, x4; \ 242 + vpand x1, x3, x3; \ 243 + vpxor x2, x1, x1; \ 244 + vpand x4, x2, x2; 245 + #define SI1_2(x0, x1, x2, x3, x4) \ 246 + vpxor x1, x4, x4; \ 247 + vpor x3, x1, x1; \ 248 + vpxor tp, x3, x3; \ 249 + vpxor tp, x2, x2; \ 250 + vpor x4, tp, x0; \ 251 + vpxor x4, x2, x2; \ 252 + vpxor x0, x1, x1; \ 253 + vpxor x1, x4, x4; 254 + 255 + #define SI2_1(x0, x1, x2, x3, x4) \ 256 + vpxor x1, x2, x2; \ 257 + vpxor RNOT, x3, tp; \ 258 + vpor x2, tp, tp; \ 259 + vpxor x3, x2, x2; \ 260 + vpxor x0, x3, x4; \ 261 + vpxor x1, tp, x3; \ 262 + vpor x2, x1, x1; \ 263 + vpxor x0, x2, x2; 264 + #define SI2_2(x0, x1, x2, x3, x4) \ 265 + vpxor x4, x1, x1; \ 266 + vpor x3, x4, x4; \ 267 + vpxor x3, x2, x2; \ 268 + vpxor x2, x4, x4; \ 269 + vpand x1, x2, x2; \ 270 + vpxor x3, 
x2, x2; \ 271 + vpxor x4, x3, x3; \ 272 + vpxor x0, x4, x4; 273 + 274 + #define SI3_1(x0, x1, x2, x3, x4) \ 275 + vpxor x1, x2, x2; \ 276 + vpand x2, x1, tp; \ 277 + vpxor x0, tp, tp; \ 278 + vpor x1, x0, x0; \ 279 + vpxor x3, x1, x4; \ 280 + vpxor x3, x0, x0; \ 281 + vpor tp, x3, x3; \ 282 + vpxor x2, tp, x1; 283 + #define SI3_2(x0, x1, x2, x3, x4) \ 284 + vpxor x3, x1, x1; \ 285 + vpxor x2, x0, x0; \ 286 + vpxor x3, x2, x2; \ 287 + vpand x1, x3, x3; \ 288 + vpxor x0, x1, x1; \ 289 + vpand x2, x0, x0; \ 290 + vpxor x3, x4, x4; \ 291 + vpxor x0, x3, x3; \ 292 + vpxor x1, x0, x0; 293 + 294 + #define SI4_1(x0, x1, x2, x3, x4) \ 295 + vpxor x3, x2, x2; \ 296 + vpand x1, x0, tp; \ 297 + vpxor x2, tp, tp; \ 298 + vpor x3, x2, x2; \ 299 + vpxor RNOT, x0, x4; \ 300 + vpxor tp, x1, x1; \ 301 + vpxor x2, tp, x0; \ 302 + vpand x4, x2, x2; 303 + #define SI4_2(x0, x1, x2, x3, x4) \ 304 + vpxor x0, x2, x2; \ 305 + vpor x4, x0, x0; \ 306 + vpxor x3, x0, x0; \ 307 + vpand x2, x3, x3; \ 308 + vpxor x3, x4, x4; \ 309 + vpxor x1, x3, x3; \ 310 + vpand x0, x1, x1; \ 311 + vpxor x1, x4, x4; \ 312 + vpxor x3, x0, x0; 313 + 314 + #define SI5_1(x0, x1, x2, x3, x4) \ 315 + vpor x2, x1, tp; \ 316 + vpxor x1, x2, x2; \ 317 + vpxor x3, tp, tp; \ 318 + vpand x1, x3, x3; \ 319 + vpxor x3, x2, x2; \ 320 + vpor x0, x3, x3; \ 321 + vpxor RNOT, x0, x0; \ 322 + vpxor x2, x3, x3; \ 323 + vpor x0, x2, x2; 324 + #define SI5_2(x0, x1, x2, x3, x4) \ 325 + vpxor tp, x1, x4; \ 326 + vpxor x4, x2, x2; \ 327 + vpand x0, x4, x4; \ 328 + vpxor tp, x0, x0; \ 329 + vpxor x3, tp, x1; \ 330 + vpand x2, x0, x0; \ 331 + vpxor x3, x2, x2; \ 332 + vpxor x2, x0, x0; \ 333 + vpxor x4, x2, x2; \ 334 + vpxor x3, x4, x4; 335 + 336 + #define SI6_1(x0, x1, x2, x3, x4) \ 337 + vpxor x2, x0, x0; \ 338 + vpand x3, x0, tp; \ 339 + vpxor x3, x2, x2; \ 340 + vpxor x2, tp, tp; \ 341 + vpxor x1, x3, x3; \ 342 + vpor x0, x2, x2; \ 343 + vpxor x3, x2, x2; \ 344 + vpand tp, x3, x3; 345 + #define SI6_2(x0, x1, x2, x3, x4) \ 346 + vpxor RNOT, tp, tp; \ 347 + vpxor x1, x3, x3; \ 348 + vpand x2, x1, x1; \ 349 + vpxor tp, x0, x4; \ 350 + vpxor x4, x3, x3; \ 351 + vpxor x2, x4, x4; \ 352 + vpxor x1, tp, x0; \ 353 + vpxor x0, x2, x2; 354 + 355 + #define SI7_1(x0, x1, x2, x3, x4) \ 356 + vpand x0, x3, tp; \ 357 + vpxor x2, x0, x0; \ 358 + vpor x3, x2, x2; \ 359 + vpxor x1, x3, x4; \ 360 + vpxor RNOT, x0, x0; \ 361 + vpor tp, x1, x1; \ 362 + vpxor x0, x4, x4; \ 363 + vpand x2, x0, x0; \ 364 + vpxor x1, x0, x0; 365 + #define SI7_2(x0, x1, x2, x3, x4) \ 366 + vpand x2, x1, x1; \ 367 + vpxor x2, tp, x3; \ 368 + vpxor x3, x4, x4; \ 369 + vpand x3, x2, x2; \ 370 + vpor x0, x3, x3; \ 371 + vpxor x4, x1, x1; \ 372 + vpxor x4, x3, x3; \ 373 + vpand x0, x4, x4; \ 374 + vpxor x2, x4, x4; 375 + 376 + #define get_key(i,j,t) \ 377 + vpbroadcastd (4*(i)+(j))*4(CTX), t; 378 + 379 + #define K2(x0, x1, x2, x3, x4, i) \ 380 + get_key(i, 0, RK0); \ 381 + get_key(i, 1, RK1); \ 382 + get_key(i, 2, RK2); \ 383 + get_key(i, 3, RK3); \ 384 + vpxor RK0, x0 ## 1, x0 ## 1; \ 385 + vpxor RK1, x1 ## 1, x1 ## 1; \ 386 + vpxor RK2, x2 ## 1, x2 ## 1; \ 387 + vpxor RK3, x3 ## 1, x3 ## 1; \ 388 + vpxor RK0, x0 ## 2, x0 ## 2; \ 389 + vpxor RK1, x1 ## 2, x1 ## 2; \ 390 + vpxor RK2, x2 ## 2, x2 ## 2; \ 391 + vpxor RK3, x3 ## 2, x3 ## 2; 392 + 393 + #define LK2(x0, x1, x2, x3, x4, i) \ 394 + vpslld $13, x0 ## 1, x4 ## 1; \ 395 + vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ 396 + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ 397 + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ 398 + vpslld $3, x2 ## 1, x4 ## 1; \ 399 + vpsrld $(32 - 
3), x2 ## 1, x2 ## 1; \ 400 + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ 401 + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ 402 + vpslld $13, x0 ## 2, x4 ## 2; \ 403 + vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ 404 + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ 405 + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ 406 + vpslld $3, x2 ## 2, x4 ## 2; \ 407 + vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ 408 + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ 409 + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ 410 + vpslld $1, x1 ## 1, x4 ## 1; \ 411 + vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ 412 + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ 413 + vpslld $3, x0 ## 1, x4 ## 1; \ 414 + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ 415 + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ 416 + get_key(i, 1, RK1); \ 417 + vpslld $1, x1 ## 2, x4 ## 2; \ 418 + vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ 419 + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ 420 + vpslld $3, x0 ## 2, x4 ## 2; \ 421 + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ 422 + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ 423 + get_key(i, 3, RK3); \ 424 + vpslld $7, x3 ## 1, x4 ## 1; \ 425 + vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ 426 + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ 427 + vpslld $7, x1 ## 1, x4 ## 1; \ 428 + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ 429 + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ 430 + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ 431 + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ 432 + get_key(i, 0, RK0); \ 433 + vpslld $7, x3 ## 2, x4 ## 2; \ 434 + vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ 435 + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ 436 + vpslld $7, x1 ## 2, x4 ## 2; \ 437 + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ 438 + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ 439 + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ 440 + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ 441 + get_key(i, 2, RK2); \ 442 + vpxor RK1, x1 ## 1, x1 ## 1; \ 443 + vpxor RK3, x3 ## 1, x3 ## 1; \ 444 + vpslld $5, x0 ## 1, x4 ## 1; \ 445 + vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ 446 + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ 447 + vpslld $22, x2 ## 1, x4 ## 1; \ 448 + vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ 449 + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ 450 + vpxor RK0, x0 ## 1, x0 ## 1; \ 451 + vpxor RK2, x2 ## 1, x2 ## 1; \ 452 + vpxor RK1, x1 ## 2, x1 ## 2; \ 453 + vpxor RK3, x3 ## 2, x3 ## 2; \ 454 + vpslld $5, x0 ## 2, x4 ## 2; \ 455 + vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ 456 + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ 457 + vpslld $22, x2 ## 2, x4 ## 2; \ 458 + vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ 459 + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ 460 + vpxor RK0, x0 ## 2, x0 ## 2; \ 461 + vpxor RK2, x2 ## 2, x2 ## 2; 462 + 463 + #define KL2(x0, x1, x2, x3, x4, i) \ 464 + vpxor RK0, x0 ## 1, x0 ## 1; \ 465 + vpxor RK2, x2 ## 1, x2 ## 1; \ 466 + vpsrld $5, x0 ## 1, x4 ## 1; \ 467 + vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ 468 + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ 469 + vpxor RK3, x3 ## 1, x3 ## 1; \ 470 + vpxor RK1, x1 ## 1, x1 ## 1; \ 471 + vpsrld $22, x2 ## 1, x4 ## 1; \ 472 + vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ 473 + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ 474 + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ 475 + vpxor RK0, x0 ## 2, x0 ## 2; \ 476 + vpxor RK2, x2 ## 2, x2 ## 2; \ 477 + vpsrld $5, x0 ## 2, x4 ## 2; \ 478 + vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ 479 + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ 480 + vpxor RK3, x3 ## 2, x3 ## 2; \ 481 + vpxor RK1, x1 ## 2, x1 ## 2; \ 482 + vpsrld $22, x2 ## 2, x4 ## 2; \ 483 + vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ 484 + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ 485 + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ 486 + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ 487 + vpslld $7, x1 ## 1, x4 ## 1; \ 488 + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ 489 + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ 490 + vpsrld $1, x1 ## 1, x4 
## 1; \ 491 + vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ 492 + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ 493 + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ 494 + vpslld $7, x1 ## 2, x4 ## 2; \ 495 + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ 496 + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ 497 + vpsrld $1, x1 ## 2, x4 ## 2; \ 498 + vpslld $(32 - 1), x1 ## 2, x1 ## 2; \ 499 + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ 500 + vpsrld $7, x3 ## 1, x4 ## 1; \ 501 + vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ 502 + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ 503 + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ 504 + vpslld $3, x0 ## 1, x4 ## 1; \ 505 + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ 506 + vpsrld $7, x3 ## 2, x4 ## 2; \ 507 + vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ 508 + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ 509 + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ 510 + vpslld $3, x0 ## 2, x4 ## 2; \ 511 + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ 512 + vpsrld $13, x0 ## 1, x4 ## 1; \ 513 + vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ 514 + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ 515 + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ 516 + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ 517 + vpsrld $3, x2 ## 1, x4 ## 1; \ 518 + vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ 519 + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ 520 + vpsrld $13, x0 ## 2, x4 ## 2; \ 521 + vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ 522 + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ 523 + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ 524 + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ 525 + vpsrld $3, x2 ## 2, x4 ## 2; \ 526 + vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ 527 + vpor x4 ## 2, x2 ## 2, x2 ## 2; 528 + 529 + #define S(SBOX, x0, x1, x2, x3, x4) \ 530 + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ 531 + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ 532 + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ 533 + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); 534 + 535 + #define SP(SBOX, x0, x1, x2, x3, x4, i) \ 536 + get_key(i, 0, RK0); \ 537 + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ 538 + get_key(i, 2, RK2); \ 539 + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ 540 + get_key(i, 3, RK3); \ 541 + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ 542 + get_key(i, 1, RK1); \ 543 + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ 544 + 545 + #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 546 + vpunpckldq x1, x0, t0; \ 547 + vpunpckhdq x1, x0, t2; \ 548 + vpunpckldq x3, x2, t1; \ 549 + vpunpckhdq x3, x2, x3; \ 550 + \ 551 + vpunpcklqdq t1, t0, x0; \ 552 + vpunpckhqdq t1, t0, x1; \ 553 + vpunpcklqdq x3, t2, x2; \ 554 + vpunpckhqdq x3, t2, x3; 555 + 556 + #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ 557 + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 558 + 559 + #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ 560 + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 561 + 562 + .align 8 563 + __serpent_enc_blk16: 564 + /* input: 565 + * %rdi: ctx, CTX 566 + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext 567 + * output: 568 + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext 569 + */ 570 + 571 + vpcmpeqd RNOT, RNOT, RNOT; 572 + 573 + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); 574 + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); 575 + 576 + K2(RA, RB, RC, RD, RE, 0); 577 + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); 578 + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); 579 + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); 580 + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); 581 + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); 582 + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, 
RE, RB, 6); 583 + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); 584 + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); 585 + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); 586 + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); 587 + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); 588 + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); 589 + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); 590 + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); 591 + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); 592 + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); 593 + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); 594 + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); 595 + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); 596 + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); 597 + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); 598 + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); 599 + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); 600 + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); 601 + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); 602 + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); 603 + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); 604 + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); 605 + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); 606 + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); 607 + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); 608 + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); 609 + 610 + write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); 611 + write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); 612 + 613 + ret; 614 + ENDPROC(__serpent_enc_blk16) 615 + 616 + .align 8 617 + __serpent_dec_blk16: 618 + /* input: 619 + * %rdi: ctx, CTX 620 + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext 621 + * output: 622 + * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext 623 + */ 624 + 625 + vpcmpeqd RNOT, RNOT, RNOT; 626 + 627 + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); 628 + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); 629 + 630 + K2(RA, RB, RC, RD, RE, 32); 631 + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); 632 + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); 633 + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); 634 + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); 635 + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); 636 + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); 637 + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); 638 + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); 639 + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); 640 + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); 641 + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); 642 + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); 643 + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); 644 + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); 645 + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); 646 + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); 647 + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); 648 + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); 649 + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); 650 + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, 
RA, RB, RC, 12); 651 + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); 652 + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); 653 + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); 654 + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); 655 + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); 656 + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); 657 + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); 658 + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); 659 + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); 660 + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); 661 + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); 662 + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); 663 + 664 + write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); 665 + write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); 666 + 667 + ret; 668 + ENDPROC(__serpent_dec_blk16) 669 + 670 + ENTRY(serpent_ecb_enc_16way) 671 + /* input: 672 + * %rdi: ctx, CTX 673 + * %rsi: dst 674 + * %rdx: src 675 + */ 676 + 677 + vzeroupper; 678 + 679 + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 680 + 681 + call __serpent_enc_blk16; 682 + 683 + store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 684 + 685 + vzeroupper; 686 + 687 + ret; 688 + ENDPROC(serpent_ecb_enc_16way) 689 + 690 + ENTRY(serpent_ecb_dec_16way) 691 + /* input: 692 + * %rdi: ctx, CTX 693 + * %rsi: dst 694 + * %rdx: src 695 + */ 696 + 697 + vzeroupper; 698 + 699 + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 700 + 701 + call __serpent_dec_blk16; 702 + 703 + store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); 704 + 705 + vzeroupper; 706 + 707 + ret; 708 + ENDPROC(serpent_ecb_dec_16way) 709 + 710 + ENTRY(serpent_cbc_dec_16way) 711 + /* input: 712 + * %rdi: ctx, CTX 713 + * %rsi: dst 714 + * %rdx: src 715 + */ 716 + 717 + vzeroupper; 718 + 719 + load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 720 + 721 + call __serpent_dec_blk16; 722 + 723 + store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2, 724 + RK0); 725 + 726 + vzeroupper; 727 + 728 + ret; 729 + ENDPROC(serpent_cbc_dec_16way) 730 + 731 + ENTRY(serpent_ctr_16way) 732 + /* input: 733 + * %rdi: ctx, CTX 734 + * %rsi: dst (16 blocks) 735 + * %rdx: src (16 blocks) 736 + * %rcx: iv (little endian, 128bit) 737 + */ 738 + 739 + vzeroupper; 740 + 741 + load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, 742 + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, 743 + tp); 744 + 745 + call __serpent_enc_blk16; 746 + 747 + store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 748 + 749 + vzeroupper; 750 + 751 + ret; 752 + ENDPROC(serpent_ctr_16way) 753 + 754 + ENTRY(serpent_xts_enc_16way) 755 + /* input: 756 + * %rdi: ctx, CTX 757 + * %rsi: dst (16 blocks) 758 + * %rdx: src (16 blocks) 759 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 760 + */ 761 + 762 + vzeroupper; 763 + 764 + load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, 765 + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, 766 + .Lxts_gf128mul_and_shl1_mask_0, 767 + .Lxts_gf128mul_and_shl1_mask_1); 768 + 769 + call __serpent_enc_blk16; 770 + 771 + store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 772 + 773 + vzeroupper; 774 + 775 + ret; 776 + ENDPROC(serpent_xts_enc_16way) 777 + 778 + ENTRY(serpent_xts_dec_16way) 779 + /* input: 780 + * %rdi: ctx, CTX 781 + * %rsi: dst (16 blocks) 782 + * %rdx: src (16 blocks) 783 + * %rcx: iv (t 
⊕ αⁿ ∈ GF(2¹²⁸)) 784 + */ 785 + 786 + vzeroupper; 787 + 788 + load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, 789 + RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, 790 + .Lxts_gf128mul_and_shl1_mask_0, 791 + .Lxts_gf128mul_and_shl1_mask_1); 792 + 793 + call __serpent_dec_blk16; 794 + 795 + store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); 796 + 797 + vzeroupper; 798 + 799 + ret; 800 + ENDPROC(serpent_xts_dec_16way)
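Annotation: load_ctr_16way expands one little-endian 128-bit counter into sixteen byte-swapped counter blocks; inc_le128 and add2_le128 are branch-free 128-bit increments built from vpcmpeqq (which yields an all-ones mask exactly when the low qword is about to wrap) and vpsubq (subtracting -1 is adding 1). Scalar equivalents of the same arithmetic, as a sketch:

        #include <stdint.h>

        static void inc_le128(uint64_t ctr[2])
        {
                if (++ctr[0] == 0)      /* low qword wrapped: carry upward */
                        ++ctr[1];
        }

        static void add2_le128(uint64_t ctr[2])        /* the +2 stride between lanes */
        {
                uint64_t old = ctr[0];

                ctr[0] += 2;
                if (ctr[0] < old)       /* wrapped past 2^64 */
                        ++ctr[1];
        }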
+562
arch/x86/crypto/serpent_avx2_glue.c
··· 1 + /* 2 + * Glue Code for x86_64/AVX2 assembler optimized version of Serpent 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/module.h> 14 + #include <linux/types.h> 15 + #include <linux/crypto.h> 16 + #include <linux/err.h> 17 + #include <crypto/algapi.h> 18 + #include <crypto/ctr.h> 19 + #include <crypto/lrw.h> 20 + #include <crypto/xts.h> 21 + #include <crypto/serpent.h> 22 + #include <asm/xcr.h> 23 + #include <asm/xsave.h> 24 + #include <asm/crypto/serpent-avx.h> 25 + #include <asm/crypto/ablk_helper.h> 26 + #include <asm/crypto/glue_helper.h> 27 + 28 + #define SERPENT_AVX2_PARALLEL_BLOCKS 16 29 + 30 + /* 16-way AVX2 parallel cipher functions */ 31 + asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, 32 + const u8 *src); 33 + asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, 34 + const u8 *src); 35 + asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); 36 + 37 + asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, 38 + le128 *iv); 39 + asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, 40 + const u8 *src, le128 *iv); 41 + asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, 42 + const u8 *src, le128 *iv); 43 + 44 + static const struct common_glue_ctx serpent_enc = { 45 + .num_funcs = 3, 46 + .fpu_blocks_limit = 8, 47 + 48 + .funcs = { { 49 + .num_blocks = 16, 50 + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } 51 + }, { 52 + .num_blocks = 8, 53 + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } 54 + }, { 55 + .num_blocks = 1, 56 + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } 57 + } } 58 + }; 59 + 60 + static const struct common_glue_ctx serpent_ctr = { 61 + .num_funcs = 3, 62 + .fpu_blocks_limit = 8, 63 + 64 + .funcs = { { 65 + .num_blocks = 16, 66 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } 67 + }, { 68 + .num_blocks = 8, 69 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } 70 + }, { 71 + .num_blocks = 1, 72 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } 73 + } } 74 + }; 75 + 76 + static const struct common_glue_ctx serpent_enc_xts = { 77 + .num_funcs = 3, 78 + .fpu_blocks_limit = 8, 79 + 80 + .funcs = { { 81 + .num_blocks = 16, 82 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } 83 + }, { 84 + .num_blocks = 8, 85 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } 86 + }, { 87 + .num_blocks = 1, 88 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } 89 + } } 90 + }; 91 + 92 + static const struct common_glue_ctx serpent_dec = { 93 + .num_funcs = 3, 94 + .fpu_blocks_limit = 8, 95 + 96 + .funcs = { { 97 + .num_blocks = 16, 98 + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } 99 + }, { 100 + .num_blocks = 8, 101 + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } 102 + }, { 103 + .num_blocks = 1, 104 + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } 105 + } } 106 + }; 107 + 108 + static const struct common_glue_ctx serpent_dec_cbc = { 109 + .num_funcs = 3, 110 + .fpu_blocks_limit = 8, 111 + 112 + .funcs = { { 113 + .num_blocks = 16, 114 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } 
115 + }, { 116 + .num_blocks = 8, 117 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } 118 + }, { 119 + .num_blocks = 1, 120 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } 121 + } } 122 + }; 123 + 124 + static const struct common_glue_ctx serpent_dec_xts = { 125 + .num_funcs = 3, 126 + .fpu_blocks_limit = 8, 127 + 128 + .funcs = { { 129 + .num_blocks = 16, 130 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } 131 + }, { 132 + .num_blocks = 8, 133 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } 134 + }, { 135 + .num_blocks = 1, 136 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } 137 + } } 138 + }; 139 + 140 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 141 + struct scatterlist *src, unsigned int nbytes) 142 + { 143 + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); 144 + } 145 + 146 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 147 + struct scatterlist *src, unsigned int nbytes) 148 + { 149 + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); 150 + } 151 + 152 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 153 + struct scatterlist *src, unsigned int nbytes) 154 + { 155 + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, 156 + dst, src, nbytes); 157 + } 158 + 159 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 160 + struct scatterlist *src, unsigned int nbytes) 161 + { 162 + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, 163 + nbytes); 164 + } 165 + 166 + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 167 + struct scatterlist *src, unsigned int nbytes) 168 + { 169 + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); 170 + } 171 + 172 + static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) 173 + { 174 + /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ 175 + return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); 176 + } 177 + 178 + static inline void serpent_fpu_end(bool fpu_enabled) 179 + { 180 + glue_fpu_end(fpu_enabled); 181 + } 182 + 183 + struct crypt_priv { 184 + struct serpent_ctx *ctx; 185 + bool fpu_enabled; 186 + }; 187 + 188 + static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 189 + { 190 + const unsigned int bsize = SERPENT_BLOCK_SIZE; 191 + struct crypt_priv *ctx = priv; 192 + int i; 193 + 194 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 195 + 196 + if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { 197 + serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst); 198 + srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; 199 + nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; 200 + } 201 + 202 + while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { 203 + serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); 204 + srcdst += bsize * SERPENT_PARALLEL_BLOCKS; 205 + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; 206 + } 207 + 208 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 209 + __serpent_encrypt(ctx->ctx, srcdst, srcdst); 210 + } 211 + 212 + static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 213 + { 214 + const unsigned int bsize = SERPENT_BLOCK_SIZE; 215 + struct crypt_priv *ctx = priv; 216 + int i; 217 + 218 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 219 + 220 + if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) { 221 + 
serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst); 222 + srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS; 223 + nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS; 224 + } 225 + 226 + while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { 227 + serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); 228 + srcdst += bsize * SERPENT_PARALLEL_BLOCKS; 229 + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; 230 + } 231 + 232 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 233 + __serpent_decrypt(ctx->ctx, srcdst, srcdst); 234 + } 235 + 236 + static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 237 + struct scatterlist *src, unsigned int nbytes) 238 + { 239 + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 240 + be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; 241 + struct crypt_priv crypt_ctx = { 242 + .ctx = &ctx->serpent_ctx, 243 + .fpu_enabled = false, 244 + }; 245 + struct lrw_crypt_req req = { 246 + .tbuf = buf, 247 + .tbuflen = sizeof(buf), 248 + 249 + .table_ctx = &ctx->lrw_table, 250 + .crypt_ctx = &crypt_ctx, 251 + .crypt_fn = encrypt_callback, 252 + }; 253 + int ret; 254 + 255 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 256 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 257 + serpent_fpu_end(crypt_ctx.fpu_enabled); 258 + 259 + return ret; 260 + } 261 + 262 + static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 263 + struct scatterlist *src, unsigned int nbytes) 264 + { 265 + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 266 + be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS]; 267 + struct crypt_priv crypt_ctx = { 268 + .ctx = &ctx->serpent_ctx, 269 + .fpu_enabled = false, 270 + }; 271 + struct lrw_crypt_req req = { 272 + .tbuf = buf, 273 + .tbuflen = sizeof(buf), 274 + 275 + .table_ctx = &ctx->lrw_table, 276 + .crypt_ctx = &crypt_ctx, 277 + .crypt_fn = decrypt_callback, 278 + }; 279 + int ret; 280 + 281 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 282 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 283 + serpent_fpu_end(crypt_ctx.fpu_enabled); 284 + 285 + return ret; 286 + } 287 + 288 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 289 + struct scatterlist *src, unsigned int nbytes) 290 + { 291 + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 292 + 293 + return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, 294 + XTS_TWEAK_CAST(__serpent_encrypt), 295 + &ctx->tweak_ctx, &ctx->crypt_ctx); 296 + } 297 + 298 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 299 + struct scatterlist *src, unsigned int nbytes) 300 + { 301 + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 302 + 303 + return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, 304 + XTS_TWEAK_CAST(__serpent_encrypt), 305 + &ctx->tweak_ctx, &ctx->crypt_ctx); 306 + } 307 + 308 + static struct crypto_alg srp_algs[10] = { { 309 + .cra_name = "__ecb-serpent-avx2", 310 + .cra_driver_name = "__driver-ecb-serpent-avx2", 311 + .cra_priority = 0, 312 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 313 + .cra_blocksize = SERPENT_BLOCK_SIZE, 314 + .cra_ctxsize = sizeof(struct serpent_ctx), 315 + .cra_alignmask = 0, 316 + .cra_type = &crypto_blkcipher_type, 317 + .cra_module = THIS_MODULE, 318 + .cra_list = LIST_HEAD_INIT(srp_algs[0].cra_list), 319 + .cra_u = { 320 + .blkcipher = { 321 + .min_keysize = SERPENT_MIN_KEY_SIZE, 322 + .max_keysize = SERPENT_MAX_KEY_SIZE, 323 + .setkey = serpent_setkey, 324 + .encrypt = ecb_encrypt, 325 + .decrypt = ecb_decrypt, 326 + }, 327 + }, 
328 + }, { 329 + .cra_name = "__cbc-serpent-avx2", 330 + .cra_driver_name = "__driver-cbc-serpent-avx2", 331 + .cra_priority = 0, 332 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 333 + .cra_blocksize = SERPENT_BLOCK_SIZE, 334 + .cra_ctxsize = sizeof(struct serpent_ctx), 335 + .cra_alignmask = 0, 336 + .cra_type = &crypto_blkcipher_type, 337 + .cra_module = THIS_MODULE, 338 + .cra_list = LIST_HEAD_INIT(srp_algs[1].cra_list), 339 + .cra_u = { 340 + .blkcipher = { 341 + .min_keysize = SERPENT_MIN_KEY_SIZE, 342 + .max_keysize = SERPENT_MAX_KEY_SIZE, 343 + .setkey = serpent_setkey, 344 + .encrypt = cbc_encrypt, 345 + .decrypt = cbc_decrypt, 346 + }, 347 + }, 348 + }, { 349 + .cra_name = "__ctr-serpent-avx2", 350 + .cra_driver_name = "__driver-ctr-serpent-avx2", 351 + .cra_priority = 0, 352 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 353 + .cra_blocksize = 1, 354 + .cra_ctxsize = sizeof(struct serpent_ctx), 355 + .cra_alignmask = 0, 356 + .cra_type = &crypto_blkcipher_type, 357 + .cra_module = THIS_MODULE, 358 + .cra_list = LIST_HEAD_INIT(srp_algs[2].cra_list), 359 + .cra_u = { 360 + .blkcipher = { 361 + .min_keysize = SERPENT_MIN_KEY_SIZE, 362 + .max_keysize = SERPENT_MAX_KEY_SIZE, 363 + .ivsize = SERPENT_BLOCK_SIZE, 364 + .setkey = serpent_setkey, 365 + .encrypt = ctr_crypt, 366 + .decrypt = ctr_crypt, 367 + }, 368 + }, 369 + }, { 370 + .cra_name = "__lrw-serpent-avx2", 371 + .cra_driver_name = "__driver-lrw-serpent-avx2", 372 + .cra_priority = 0, 373 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 374 + .cra_blocksize = SERPENT_BLOCK_SIZE, 375 + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), 376 + .cra_alignmask = 0, 377 + .cra_type = &crypto_blkcipher_type, 378 + .cra_module = THIS_MODULE, 379 + .cra_list = LIST_HEAD_INIT(srp_algs[3].cra_list), 380 + .cra_exit = lrw_serpent_exit_tfm, 381 + .cra_u = { 382 + .blkcipher = { 383 + .min_keysize = SERPENT_MIN_KEY_SIZE + 384 + SERPENT_BLOCK_SIZE, 385 + .max_keysize = SERPENT_MAX_KEY_SIZE + 386 + SERPENT_BLOCK_SIZE, 387 + .ivsize = SERPENT_BLOCK_SIZE, 388 + .setkey = lrw_serpent_setkey, 389 + .encrypt = lrw_encrypt, 390 + .decrypt = lrw_decrypt, 391 + }, 392 + }, 393 + }, { 394 + .cra_name = "__xts-serpent-avx2", 395 + .cra_driver_name = "__driver-xts-serpent-avx2", 396 + .cra_priority = 0, 397 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 398 + .cra_blocksize = SERPENT_BLOCK_SIZE, 399 + .cra_ctxsize = sizeof(struct serpent_xts_ctx), 400 + .cra_alignmask = 0, 401 + .cra_type = &crypto_blkcipher_type, 402 + .cra_module = THIS_MODULE, 403 + .cra_list = LIST_HEAD_INIT(srp_algs[4].cra_list), 404 + .cra_u = { 405 + .blkcipher = { 406 + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, 407 + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, 408 + .ivsize = SERPENT_BLOCK_SIZE, 409 + .setkey = xts_serpent_setkey, 410 + .encrypt = xts_encrypt, 411 + .decrypt = xts_decrypt, 412 + }, 413 + }, 414 + }, { 415 + .cra_name = "ecb(serpent)", 416 + .cra_driver_name = "ecb-serpent-avx2", 417 + .cra_priority = 600, 418 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 419 + .cra_blocksize = SERPENT_BLOCK_SIZE, 420 + .cra_ctxsize = sizeof(struct async_helper_ctx), 421 + .cra_alignmask = 0, 422 + .cra_type = &crypto_ablkcipher_type, 423 + .cra_module = THIS_MODULE, 424 + .cra_list = LIST_HEAD_INIT(srp_algs[5].cra_list), 425 + .cra_init = ablk_init, 426 + .cra_exit = ablk_exit, 427 + .cra_u = { 428 + .ablkcipher = { 429 + .min_keysize = SERPENT_MIN_KEY_SIZE, 430 + .max_keysize = SERPENT_MAX_KEY_SIZE, 431 + .setkey = ablk_set_key, 432 + .encrypt = ablk_encrypt, 433 + .decrypt = ablk_decrypt, 
434 + }, 435 + }, 436 + }, { 437 + .cra_name = "cbc(serpent)", 438 + .cra_driver_name = "cbc-serpent-avx2", 439 + .cra_priority = 600, 440 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 441 + .cra_blocksize = SERPENT_BLOCK_SIZE, 442 + .cra_ctxsize = sizeof(struct async_helper_ctx), 443 + .cra_alignmask = 0, 444 + .cra_type = &crypto_ablkcipher_type, 445 + .cra_module = THIS_MODULE, 446 + .cra_list = LIST_HEAD_INIT(srp_algs[6].cra_list), 447 + .cra_init = ablk_init, 448 + .cra_exit = ablk_exit, 449 + .cra_u = { 450 + .ablkcipher = { 451 + .min_keysize = SERPENT_MIN_KEY_SIZE, 452 + .max_keysize = SERPENT_MAX_KEY_SIZE, 453 + .ivsize = SERPENT_BLOCK_SIZE, 454 + .setkey = ablk_set_key, 455 + .encrypt = __ablk_encrypt, 456 + .decrypt = ablk_decrypt, 457 + }, 458 + }, 459 + }, { 460 + .cra_name = "ctr(serpent)", 461 + .cra_driver_name = "ctr-serpent-avx2", 462 + .cra_priority = 600, 463 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 464 + .cra_blocksize = 1, 465 + .cra_ctxsize = sizeof(struct async_helper_ctx), 466 + .cra_alignmask = 0, 467 + .cra_type = &crypto_ablkcipher_type, 468 + .cra_module = THIS_MODULE, 469 + .cra_list = LIST_HEAD_INIT(srp_algs[7].cra_list), 470 + .cra_init = ablk_init, 471 + .cra_exit = ablk_exit, 472 + .cra_u = { 473 + .ablkcipher = { 474 + .min_keysize = SERPENT_MIN_KEY_SIZE, 475 + .max_keysize = SERPENT_MAX_KEY_SIZE, 476 + .ivsize = SERPENT_BLOCK_SIZE, 477 + .setkey = ablk_set_key, 478 + .encrypt = ablk_encrypt, 479 + .decrypt = ablk_encrypt, 480 + .geniv = "chainiv", 481 + }, 482 + }, 483 + }, { 484 + .cra_name = "lrw(serpent)", 485 + .cra_driver_name = "lrw-serpent-avx2", 486 + .cra_priority = 600, 487 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 488 + .cra_blocksize = SERPENT_BLOCK_SIZE, 489 + .cra_ctxsize = sizeof(struct async_helper_ctx), 490 + .cra_alignmask = 0, 491 + .cra_type = &crypto_ablkcipher_type, 492 + .cra_module = THIS_MODULE, 493 + .cra_list = LIST_HEAD_INIT(srp_algs[8].cra_list), 494 + .cra_init = ablk_init, 495 + .cra_exit = ablk_exit, 496 + .cra_u = { 497 + .ablkcipher = { 498 + .min_keysize = SERPENT_MIN_KEY_SIZE + 499 + SERPENT_BLOCK_SIZE, 500 + .max_keysize = SERPENT_MAX_KEY_SIZE + 501 + SERPENT_BLOCK_SIZE, 502 + .ivsize = SERPENT_BLOCK_SIZE, 503 + .setkey = ablk_set_key, 504 + .encrypt = ablk_encrypt, 505 + .decrypt = ablk_decrypt, 506 + }, 507 + }, 508 + }, { 509 + .cra_name = "xts(serpent)", 510 + .cra_driver_name = "xts-serpent-avx2", 511 + .cra_priority = 600, 512 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 513 + .cra_blocksize = SERPENT_BLOCK_SIZE, 514 + .cra_ctxsize = sizeof(struct async_helper_ctx), 515 + .cra_alignmask = 0, 516 + .cra_type = &crypto_ablkcipher_type, 517 + .cra_module = THIS_MODULE, 518 + .cra_list = LIST_HEAD_INIT(srp_algs[9].cra_list), 519 + .cra_init = ablk_init, 520 + .cra_exit = ablk_exit, 521 + .cra_u = { 522 + .ablkcipher = { 523 + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, 524 + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, 525 + .ivsize = SERPENT_BLOCK_SIZE, 526 + .setkey = ablk_set_key, 527 + .encrypt = ablk_encrypt, 528 + .decrypt = ablk_decrypt, 529 + }, 530 + }, 531 + } }; 532 + 533 + static int __init init(void) 534 + { 535 + u64 xcr0; 536 + 537 + if (!cpu_has_avx2 || !cpu_has_osxsave) { 538 + pr_info("AVX2 instructions are not detected.\n"); 539 + return -ENODEV; 540 + } 541 + 542 + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 543 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { 544 + pr_info("AVX detected but unusable.\n"); 
545 + return -ENODEV; 546 + } 547 + 548 + return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs)); 549 + } 550 + 551 + static void __exit fini(void) 552 + { 553 + crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs)); 554 + } 555 + 556 + module_init(init); 557 + module_exit(fini); 558 + 559 + MODULE_LICENSE("GPL"); 560 + MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); 561 + MODULE_ALIAS("serpent"); 562 + MODULE_ALIAS("serpent-asm");
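A note on the structure above: the common_glue_ctx tables are ordered widest-first, and the glue helper walks them in order — the 16-way AVX2 routine runs while at least 16 blocks remain, then the 8-way AVX routine, then the scalar cipher one block at a time; encrypt_callback()/decrypt_callback() open-code the same cascade for the LRW path. The fpu_blocks_limit of 8 keeps requests shorter than eight blocks on the scalar path, so the cost of kernel_fpu_begin() is only paid when a wide routine can actually run. A minimal user-space sketch of the dispatch idea (width_fn and dispatch_ecb are illustrative names, not the kernel's glue_helper API):

/*
 * Sketch of widest-first dispatch over an ECB function table.
 * Each entry handles exactly num_blocks cipher blocks per call.
 */
#include <stddef.h>

#define BLOCK_SIZE 16	/* SERPENT_BLOCK_SIZE */

struct width_fn {
	unsigned int num_blocks;	/* 16, 8 or 1, widest first */
	void (*fn)(void *ctx, unsigned char *dst, const unsigned char *src);
};

static void dispatch_ecb(const struct width_fn *funcs, size_t nfuncs,
			 void *ctx, unsigned char *dst,
			 const unsigned char *src, size_t nbytes)
{
	for (size_t i = 0; i < nfuncs; i++) {
		size_t chunk = (size_t)funcs[i].num_blocks * BLOCK_SIZE;

		/* consume as much as possible at this width, then narrow */
		while (nbytes >= chunk) {
			funcs[i].fn(ctx, dst, src);
			src += chunk;
			dst += chunk;
			nbytes -= chunk;
		}
	}
}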
+85 -60
arch/x86/crypto/serpent_avx_glue.c
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 - * Glue code based on serpent_sse2_glue.c by: 8 - * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 7 + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 9 8 * 10 9 * This program is free software; you can redistribute it and/or modify 11 10 * it under the terms of the GNU General Public License as published by ··· 41 42 #include <asm/crypto/ablk_helper.h> 42 43 #include <asm/crypto/glue_helper.h> 43 44 44 - static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) 45 + /* 8-way parallel cipher functions */ 46 + asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, 47 + const u8 *src); 48 + EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); 49 + 50 + asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, 51 + const u8 *src); 52 + EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); 53 + 54 + asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, 55 + const u8 *src); 56 + EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); 57 + 58 + asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, 59 + const u8 *src, le128 *iv); 60 + EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx); 61 + 62 + asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, 63 + const u8 *src, le128 *iv); 64 + EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx); 65 + 66 + asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, 67 + const u8 *src, le128 *iv); 68 + EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx); 69 + 70 + void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) 45 71 { 46 72 be128 ctrblk; 47 73 ··· 76 52 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 77 53 u128_xor(dst, src, (u128 *)&ctrblk); 78 54 } 55 + EXPORT_SYMBOL_GPL(__serpent_crypt_ctr); 56 + 57 + void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) 58 + { 59 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 60 + GLUE_FUNC_CAST(__serpent_encrypt)); 61 + } 62 + EXPORT_SYMBOL_GPL(serpent_xts_enc); 63 + 64 + void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) 65 + { 66 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 67 + GLUE_FUNC_CAST(__serpent_decrypt)); 68 + } 69 + EXPORT_SYMBOL_GPL(serpent_xts_dec); 70 + 79 71 80 72 static const struct common_glue_ctx serpent_enc = { 81 73 .num_funcs = 2, ··· 115 75 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } 116 76 }, { 117 77 .num_blocks = 1, 118 - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } 78 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } 79 + } } 80 + }; 81 + 82 + static const struct common_glue_ctx serpent_enc_xts = { 83 + .num_funcs = 2, 84 + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, 85 + 86 + .funcs = { { 87 + .num_blocks = SERPENT_PARALLEL_BLOCKS, 88 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } 89 + }, { 90 + .num_blocks = 1, 91 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } 119 92 } } 120 93 }; 121 94 ··· 155 102 }, { 156 103 .num_blocks = 1, 157 104 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } 105 + } } 106 + }; 107 + 108 + static const struct common_glue_ctx serpent_dec_xts = { 109 + .num_funcs = 2, 110 + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, 111 + 112 + .funcs = { { 113 + .num_blocks = SERPENT_PARALLEL_BLOCKS, 114 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } 115 + }, { 116 + .num_blocks = 1, 117 + 
.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } 158 118 } } 159 119 }; 160 120 ··· 253 187 __serpent_decrypt(ctx->ctx, srcdst, srcdst); 254 188 } 255 189 256 - struct serpent_lrw_ctx { 257 - struct lrw_table_ctx lrw_table; 258 - struct serpent_ctx serpent_ctx; 259 - }; 260 - 261 - static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 262 - unsigned int keylen) 190 + int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 191 + unsigned int keylen) 263 192 { 264 193 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 265 194 int err; ··· 267 206 return lrw_init_table(&ctx->lrw_table, key + keylen - 268 207 SERPENT_BLOCK_SIZE); 269 208 } 209 + EXPORT_SYMBOL_GPL(lrw_serpent_setkey); 270 210 271 211 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 272 212 struct scatterlist *src, unsigned int nbytes) ··· 321 259 return ret; 322 260 } 323 261 324 - static void lrw_exit_tfm(struct crypto_tfm *tfm) 262 + void lrw_serpent_exit_tfm(struct crypto_tfm *tfm) 325 263 { 326 264 struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 327 265 328 266 lrw_free_table(&ctx->lrw_table); 329 267 } 268 + EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm); 330 269 331 - struct serpent_xts_ctx { 332 - struct serpent_ctx tweak_ctx; 333 - struct serpent_ctx crypt_ctx; 334 - }; 335 - 336 - static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 337 - unsigned int keylen) 270 + int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 271 + unsigned int keylen) 338 272 { 339 273 struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); 340 274 u32 *flags = &tfm->crt_flags; ··· 352 294 /* second half of xts-key is for tweak */ 353 295 return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); 354 296 } 297 + EXPORT_SYMBOL_GPL(xts_serpent_setkey); 355 298 356 299 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 357 300 struct scatterlist *src, unsigned int nbytes) 358 301 { 359 302 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 360 - be128 buf[SERPENT_PARALLEL_BLOCKS]; 361 - struct crypt_priv crypt_ctx = { 362 - .ctx = &ctx->crypt_ctx, 363 - .fpu_enabled = false, 364 - }; 365 - struct xts_crypt_req req = { 366 - .tbuf = buf, 367 - .tbuflen = sizeof(buf), 368 303 369 - .tweak_ctx = &ctx->tweak_ctx, 370 - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), 371 - .crypt_ctx = &crypt_ctx, 372 - .crypt_fn = encrypt_callback, 373 - }; 374 - int ret; 375 - 376 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 377 - ret = xts_crypt(desc, dst, src, nbytes, &req); 378 - serpent_fpu_end(crypt_ctx.fpu_enabled); 379 - 380 - return ret; 304 + return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes, 305 + XTS_TWEAK_CAST(__serpent_encrypt), 306 + &ctx->tweak_ctx, &ctx->crypt_ctx); 381 307 } 382 308 383 309 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 384 310 struct scatterlist *src, unsigned int nbytes) 385 311 { 386 312 struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 387 - be128 buf[SERPENT_PARALLEL_BLOCKS]; 388 - struct crypt_priv crypt_ctx = { 389 - .ctx = &ctx->crypt_ctx, 390 - .fpu_enabled = false, 391 - }; 392 - struct xts_crypt_req req = { 393 - .tbuf = buf, 394 - .tbuflen = sizeof(buf), 395 313 396 - .tweak_ctx = &ctx->tweak_ctx, 397 - .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), 398 - .crypt_ctx = &crypt_ctx, 399 - .crypt_fn = decrypt_callback, 400 - }; 401 - int ret; 402 - 403 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 404 - ret = xts_crypt(desc, dst, src, nbytes, 
&req); 405 - serpent_fpu_end(crypt_ctx.fpu_enabled); 406 - 407 - return ret; 314 + return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes, 315 + XTS_TWEAK_CAST(__serpent_encrypt), 316 + &ctx->tweak_ctx, &ctx->crypt_ctx); 408 317 } 409 318 410 319 static struct crypto_alg serpent_algs[10] = { { ··· 442 417 .cra_alignmask = 0, 443 418 .cra_type = &crypto_blkcipher_type, 444 419 .cra_module = THIS_MODULE, 445 - .cra_exit = lrw_exit_tfm, 420 + .cra_exit = lrw_serpent_exit_tfm, 446 421 .cra_u = { 447 422 .blkcipher = { 448 423 .min_keysize = SERPENT_MIN_KEY_SIZE +
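One detail of the XTS conversion above is easy to miss: xts_encrypt() and xts_decrypt() both pass XTS_TWEAK_CAST(__serpent_encrypt), because the initial XTS tweak T = E_K2(IV) is always computed with the encryption direction of the second key; only the per-block data calls switch between the enc and dec tables. Successive tweaks are then derived by multiplying T by x in GF(2^128), using the little-endian convention the le128 type implies. A stand-alone sketch of that tweak update (for illustration; not the kernel's gf128mul code):

/*
 * Multiply a 128-bit XTS tweak by x in GF(2^128), little-endian
 * bit order: shift left by one bit and, if bit 127 fell off,
 * reduce modulo x^128 + x^7 + x^2 + x + 1 by XORing 0x87 into
 * the low byte.
 */
#include <stdint.h>

static void xts_next_tweak(uint8_t t[16])
{
	uint8_t carry = 0;

	for (int i = 0; i < 16; i++) {
		uint8_t c = t[i] >> 7;

		t[i] = (uint8_t)(t[i] << 1) | carry;
		carry = c;
	}
	if (carry)
		t[0] ^= 0x87;
}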
+496
arch/x86/crypto/sha256-avx-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-256 with AVX1 instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # Tim Chen <tim.c.chen@linux.intel.com> 10 + # 11 + # This software is available to you under a choice of one of two 12 + # licenses. You may choose to be licensed under the terms of the GNU 13 + # General Public License (GPL) Version 2, available from the file 14 + # COPYING in the main directory of this source tree, or the 15 + # OpenIB.org BSD license below: 16 + # 17 + # Redistribution and use in source and binary forms, with or 18 + # without modification, are permitted provided that the following 19 + # conditions are met: 20 + # 21 + # - Redistributions of source code must retain the above 22 + # copyright notice, this list of conditions and the following 23 + # disclaimer. 24 + # 25 + # - Redistributions in binary form must reproduce the above 26 + # copyright notice, this list of conditions and the following 27 + # disclaimer in the documentation and/or other materials 28 + # provided with the distribution. 29 + # 30 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 31 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 32 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 33 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 34 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 35 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 36 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 + # SOFTWARE. 38 + ######################################################################## 39 + # 40 + # This code is described in an Intel White-Paper: 41 + # "Fast SHA-256 Implementations on Intel Architecture Processors" 42 + # 43 + # To find it, surf to http://www.intel.com/p/en_US/embedded 44 + # and search for that title. 
45 + # 46 + ######################################################################## 47 + # This code schedules 1 block at a time, with 4 lanes per block 48 + ######################################################################## 49 + 50 + #ifdef CONFIG_AS_AVX 51 + #include <linux/linkage.h> 52 + 53 + ## assume buffers not aligned 54 + #define VMOVDQ vmovdqu 55 + 56 + ################################ Define Macros 57 + 58 + # addm [mem], reg 59 + # Add reg to mem using reg-mem add and store 60 + .macro addm p1 p2 61 + add \p1, \p2 62 + mov \p2, \p1 63 + .endm 64 + 65 + 66 + .macro MY_ROR p1 p2 67 + shld $(32-(\p1)), \p2, \p2 68 + .endm 69 + 70 + ################################ 71 + 72 + # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask 73 + # Load xmm with mem and byte swap each dword 74 + .macro COPY_XMM_AND_BSWAP p1 p2 p3 75 + VMOVDQ \p2, \p1 76 + vpshufb \p3, \p1, \p1 77 + .endm 78 + 79 + ################################ 80 + 81 + X0 = %xmm4 82 + X1 = %xmm5 83 + X2 = %xmm6 84 + X3 = %xmm7 85 + 86 + XTMP0 = %xmm0 87 + XTMP1 = %xmm1 88 + XTMP2 = %xmm2 89 + XTMP3 = %xmm3 90 + XTMP4 = %xmm8 91 + XFER = %xmm9 92 + XTMP5 = %xmm11 93 + 94 + SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA 95 + SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 96 + BYTE_FLIP_MASK = %xmm13 97 + 98 + NUM_BLKS = %rdx # 3rd arg 99 + CTX = %rsi # 2nd arg 100 + INP = %rdi # 1st arg 101 + 102 + SRND = %rdi # clobbers INP 103 + c = %ecx 104 + d = %r8d 105 + e = %edx 106 + TBL = %rbp 107 + a = %eax 108 + b = %ebx 109 + 110 + f = %r9d 111 + g = %r10d 112 + h = %r11d 113 + 114 + y0 = %r13d 115 + y1 = %r14d 116 + y2 = %r15d 117 + 118 + 119 + _INP_END_SIZE = 8 120 + _INP_SIZE = 8 121 + _XFER_SIZE = 8 122 + _XMM_SAVE_SIZE = 0 123 + 124 + _INP_END = 0 125 + _INP = _INP_END + _INP_END_SIZE 126 + _XFER = _INP + _INP_SIZE 127 + _XMM_SAVE = _XFER + _XFER_SIZE 128 + STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE 129 + 130 + # rotate_Xs 131 + # Rotate values of symbols X0...X3 132 + .macro rotate_Xs 133 + X_ = X0 134 + X0 = X1 135 + X1 = X2 136 + X2 = X3 137 + X3 = X_ 138 + .endm 139 + 140 + # ROTATE_ARGS 141 + # Rotate values of symbols a...h 142 + .macro ROTATE_ARGS 143 + TMP_ = h 144 + h = g 145 + g = f 146 + f = e 147 + e = d 148 + d = c 149 + c = b 150 + b = a 151 + a = TMP_ 152 + .endm 153 + 154 + .macro FOUR_ROUNDS_AND_SCHED 155 + ## compute s0 four at a time and s1 two at a time 156 + ## compute W[-16] + W[-7] 4 at a time 157 + 158 + mov e, y0 # y0 = e 159 + MY_ROR (25-11), y0 # y0 = e >> (25-11) 160 + mov a, y1 # y1 = a 161 + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] 162 + MY_ROR (22-13), y1 # y1 = a >> (22-13) 163 + xor e, y0 # y0 = e ^ (e >> (25-11)) 164 + mov f, y2 # y2 = f 165 + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 166 + xor a, y1 # y1 = a ^ (a >> (22-13) 167 + xor g, y2 # y2 = f^g 168 + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] 169 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 170 + and e, y2 # y2 = (f^g)&e 171 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 172 + ## compute s0 173 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] 174 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 175 + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 176 + xor g, y2 # y2 = CH = ((f^g)&e)^g 177 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 178 + add y0, y2 # y2 = S1 + CH 179 + add _XFER(%rsp), y2 # y2 = k + w + S1 + CH 180 + mov a, y0 # y0 = a 181 + add y2, h # h = h + S1 + CH + k + w 182 + mov a, y2 # y2 = a 183 + vpsrld $7, XTMP1, XTMP2 184 + or c, y0 # y0 = a|c 185 + add h, d # d = d + h + S1 + CH 
+ k + w 186 + and c, y2 # y2 = a&c 187 + vpslld $(32-7), XTMP1, XTMP3 188 + and b, y0 # y0 = (a|c)&b 189 + add y1, h # h = h + S1 + CH + k + w + S0 190 + vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 191 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 192 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 193 + ROTATE_ARGS 194 + mov e, y0 # y0 = e 195 + mov a, y1 # y1 = a 196 + MY_ROR (25-11), y0 # y0 = e >> (25-11) 197 + xor e, y0 # y0 = e ^ (e >> (25-11)) 198 + mov f, y2 # y2 = f 199 + MY_ROR (22-13), y1 # y1 = a >> (22-13) 200 + vpsrld $18, XTMP1, XTMP2 # 201 + xor a, y1 # y1 = a ^ (a >> (22-13) 202 + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 203 + xor g, y2 # y2 = f^g 204 + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 205 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 206 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 207 + and e, y2 # y2 = (f^g)&e 208 + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 209 + vpslld $(32-18), XTMP1, XTMP1 210 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 211 + xor g, y2 # y2 = CH = ((f^g)&e)^g 212 + vpxor XTMP1, XTMP3, XTMP3 # 213 + add y0, y2 # y2 = S1 + CH 214 + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 215 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 216 + vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 217 + mov a, y0 # y0 = a 218 + add y2, h # h = h + S1 + CH + k + w 219 + mov a, y2 # y2 = a 220 + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 221 + or c, y0 # y0 = a|c 222 + add h, d # d = d + h + S1 + CH + k + w 223 + and c, y2 # y2 = a&c 224 + ## compute low s1 225 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} 226 + and b, y0 # y0 = (a|c)&b 227 + add y1, h # h = h + S1 + CH + k + w + S0 228 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 229 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 230 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 231 + ROTATE_ARGS 232 + mov e, y0 # y0 = e 233 + mov a, y1 # y1 = a 234 + MY_ROR (25-11), y0 # y0 = e >> (25-11) 235 + xor e, y0 # y0 = e ^ (e >> (25-11)) 236 + MY_ROR (22-13), y1 # y1 = a >> (22-13) 237 + mov f, y2 # y2 = f 238 + xor a, y1 # y1 = a ^ (a >> (22-13) 239 + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 240 + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} 241 + xor g, y2 # y2 = f^g 242 + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} 243 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 244 + and e, y2 # y2 = (f^g)&e 245 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} 246 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 247 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 248 + xor g, y2 # y2 = CH = ((f^g)&e)^g 249 + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 250 + vpxor XTMP3, XTMP2, XTMP2 # 251 + add y0, y2 # y2 = S1 + CH 252 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 253 + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 254 + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} 255 + mov a, y0 # y0 = a 256 + add y2, h # h = h + S1 + CH + k + w 257 + mov a, y2 # y2 = a 258 + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} 259 + or c, y0 # y0 = a|c 260 + add h, d # d = d + h + S1 + CH + k + w 261 + and c, y2 # y2 = a&c 262 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} 263 + and b, y0 # y0 = (a|c)&b 264 + add y1, h # h = h + S1 + CH + k + w + S0 265 + ## compute high s1 266 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} 267 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 268 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 269 + 
ROTATE_ARGS 270 + mov e, y0 # y0 = e 271 + MY_ROR (25-11), y0 # y0 = e >> (25-11) 272 + mov a, y1 # y1 = a 273 + MY_ROR (22-13), y1 # y1 = a >> (22-13) 274 + xor e, y0 # y0 = e ^ (e >> (25-11)) 275 + mov f, y2 # y2 = f 276 + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 277 + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} 278 + xor a, y1 # y1 = a ^ (a >> (22-13) 279 + xor g, y2 # y2 = f^g 280 + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} 281 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 282 + and e, y2 # y2 = (f^g)&e 283 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 284 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} 285 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 286 + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 287 + xor g, y2 # y2 = CH = ((f^g)&e)^g 288 + vpxor XTMP3, XTMP2, XTMP2 289 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 290 + add y0, y2 # y2 = S1 + CH 291 + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 292 + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} 293 + mov a, y0 # y0 = a 294 + add y2, h # h = h + S1 + CH + k + w 295 + mov a, y2 # y2 = a 296 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} 297 + or c, y0 # y0 = a|c 298 + add h, d # d = d + h + S1 + CH + k + w 299 + and c, y2 # y2 = a&c 300 + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} 301 + and b, y0 # y0 = (a|c)&b 302 + add y1, h # h = h + S1 + CH + k + w + S0 303 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 304 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 305 + ROTATE_ARGS 306 + rotate_Xs 307 + .endm 308 + 309 + ## input is [rsp + _XFER + %1 * 4] 310 + .macro DO_ROUND round 311 + mov e, y0 # y0 = e 312 + MY_ROR (25-11), y0 # y0 = e >> (25-11) 313 + mov a, y1 # y1 = a 314 + xor e, y0 # y0 = e ^ (e >> (25-11)) 315 + MY_ROR (22-13), y1 # y1 = a >> (22-13) 316 + mov f, y2 # y2 = f 317 + xor a, y1 # y1 = a ^ (a >> (22-13) 318 + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 319 + xor g, y2 # y2 = f^g 320 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 321 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 322 + and e, y2 # y2 = (f^g)&e 323 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 324 + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 325 + xor g, y2 # y2 = CH = ((f^g)&e)^g 326 + add y0, y2 # y2 = S1 + CH 327 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 328 + offset = \round * 4 + _XFER # 329 + add offset(%rsp), y2 # y2 = k + w + S1 + CH 330 + mov a, y0 # y0 = a 331 + add y2, h # h = h + S1 + CH + k + w 332 + mov a, y2 # y2 = a 333 + or c, y0 # y0 = a|c 334 + add h, d # d = d + h + S1 + CH + k + w 335 + and c, y2 # y2 = a&c 336 + and b, y0 # y0 = (a|c)&b 337 + add y1, h # h = h + S1 + CH + k + w + S0 338 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 339 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 340 + ROTATE_ARGS 341 + .endm 342 + 343 + ######################################################################## 344 + ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) 345 + ## arg 1 : pointer to input data 346 + ## arg 2 : pointer to digest 347 + ## arg 3 : Num blocks 348 + ######################################################################## 349 + .text 350 + ENTRY(sha256_transform_avx) 351 + .align 32 352 + pushq %rbx 353 + pushq %rbp 354 + pushq %r13 355 + pushq %r14 356 + pushq %r15 357 + pushq %r12 358 + 359 + mov %rsp, %r12 360 + subq $STACK_SIZE, %rsp # allocate stack space 361 + and $~15, %rsp # align stack pointer 362 + 363 + shl $6, 
NUM_BLKS # convert to bytes 364 + jz done_hash 365 + add INP, NUM_BLKS # pointer to end of data 366 + mov NUM_BLKS, _INP_END(%rsp) 367 + 368 + ## load initial digest 369 + mov 4*0(CTX), a 370 + mov 4*1(CTX), b 371 + mov 4*2(CTX), c 372 + mov 4*3(CTX), d 373 + mov 4*4(CTX), e 374 + mov 4*5(CTX), f 375 + mov 4*6(CTX), g 376 + mov 4*7(CTX), h 377 + 378 + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 379 + vmovdqa _SHUF_00BA(%rip), SHUF_00BA 380 + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 381 + loop0: 382 + lea K256(%rip), TBL 383 + 384 + ## byte swap first 16 dwords 385 + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK 386 + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK 387 + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK 388 + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK 389 + 390 + mov INP, _INP(%rsp) 391 + 392 + ## schedule 48 input dwords, by doing 3 rounds of 16 each 393 + mov $3, SRND 394 + .align 16 395 + loop1: 396 + vpaddd (TBL), X0, XFER 397 + vmovdqa XFER, _XFER(%rsp) 398 + FOUR_ROUNDS_AND_SCHED 399 + 400 + vpaddd 1*16(TBL), X0, XFER 401 + vmovdqa XFER, _XFER(%rsp) 402 + FOUR_ROUNDS_AND_SCHED 403 + 404 + vpaddd 2*16(TBL), X0, XFER 405 + vmovdqa XFER, _XFER(%rsp) 406 + FOUR_ROUNDS_AND_SCHED 407 + 408 + vpaddd 3*16(TBL), X0, XFER 409 + vmovdqa XFER, _XFER(%rsp) 410 + add $4*16, TBL 411 + FOUR_ROUNDS_AND_SCHED 412 + 413 + sub $1, SRND 414 + jne loop1 415 + 416 + mov $2, SRND 417 + loop2: 418 + vpaddd (TBL), X0, XFER 419 + vmovdqa XFER, _XFER(%rsp) 420 + DO_ROUND 0 421 + DO_ROUND 1 422 + DO_ROUND 2 423 + DO_ROUND 3 424 + 425 + vpaddd 1*16(TBL), X1, XFER 426 + vmovdqa XFER, _XFER(%rsp) 427 + add $2*16, TBL 428 + DO_ROUND 0 429 + DO_ROUND 1 430 + DO_ROUND 2 431 + DO_ROUND 3 432 + 433 + vmovdqa X2, X0 434 + vmovdqa X3, X1 435 + 436 + sub $1, SRND 437 + jne loop2 438 + 439 + addm (4*0)(CTX),a 440 + addm (4*1)(CTX),b 441 + addm (4*2)(CTX),c 442 + addm (4*3)(CTX),d 443 + addm (4*4)(CTX),e 444 + addm (4*5)(CTX),f 445 + addm (4*6)(CTX),g 446 + addm (4*7)(CTX),h 447 + 448 + mov _INP(%rsp), INP 449 + add $64, INP 450 + cmp _INP_END(%rsp), INP 451 + jne loop0 452 + 453 + done_hash: 454 + 455 + mov %r12, %rsp 456 + 457 + popq %r12 458 + popq %r15 459 + popq %r14 460 + popq %r13 461 + popq %rbp 462 + popq %rbx 463 + ret 464 + ENDPROC(sha256_transform_avx) 465 + 466 + .data 467 + .align 64 468 + K256: 469 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 470 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 471 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 472 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 473 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 474 + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 475 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 476 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 477 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 478 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 479 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 480 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 481 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 482 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 483 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 484 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 485 + 486 + PSHUFFLE_BYTE_FLIP_MASK: 487 + .octa 0x0c0d0e0f08090a0b0405060700010203 488 + 489 + # shuffle xBxA -> 00BA 490 + _SHUF_00BA: 491 + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 492 + 493 + # shuffle xDxC -> DC00 494 + _SHUF_DC00: 495 + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF 496 + #endif
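For reference, FOUR_ROUNDS_AND_SCHED above computes four words of the standard SHA-256 message schedule per invocation, interleaved with four compression rounds: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]. The vpalignr instructions extract the W[-7] and W[-15] windows, the vpsrld/vpslld/vpor sequences build sigma0's rotates, and the vpsrlq work on the {BBAA}/{DDCC} shuffles builds sigma1 two words at a time. A scalar sketch of the same recurrence (illustrative only; w[0..15] hold the byte-swapped message dwords that COPY_XMM_AND_BSWAP loads):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

/* expand the 16 message dwords to the full 64-entry schedule */
static void sha256_schedule(uint32_t w[64])
{
	for (int t = 16; t < 64; t++) {
		uint32_t s0 = ror32(w[t - 15], 7) ^ ror32(w[t - 15], 18) ^
			      (w[t - 15] >> 3);
		uint32_t s1 = ror32(w[t - 2], 17) ^ ror32(w[t - 2], 19) ^
			      (w[t - 2] >> 10);

		w[t] = w[t - 16] + s0 + w[t - 7] + s1;
	}
}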
+772
arch/x86/crypto/sha256-avx2-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-256 with AVX2 instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # Tim Chen <tim.c.chen@linux.intel.com> 10 + # 11 + # This software is available to you under a choice of one of two 12 + # licenses. You may choose to be licensed under the terms of the GNU 13 + # General Public License (GPL) Version 2, available from the file 14 + # COPYING in the main directory of this source tree, or the 15 + # OpenIB.org BSD license below: 16 + # 17 + # Redistribution and use in source and binary forms, with or 18 + # without modification, are permitted provided that the following 19 + # conditions are met: 20 + # 21 + # - Redistributions of source code must retain the above 22 + # copyright notice, this list of conditions and the following 23 + # disclaimer. 24 + # 25 + # - Redistributions in binary form must reproduce the above 26 + # copyright notice, this list of conditions and the following 27 + # disclaimer in the documentation and/or other materials 28 + # provided with the distribution. 29 + # 30 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 31 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 32 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 33 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 34 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 35 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 36 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 + # SOFTWARE. 38 + # 39 + ######################################################################## 40 + # 41 + # This code is described in an Intel White-Paper: 42 + # "Fast SHA-256 Implementations on Intel Architecture Processors" 43 + # 44 + # To find it, surf to http://www.intel.com/p/en_US/embedded 45 + # and search for that title. 
46 + # 47 + ######################################################################## 48 + # This code schedules 2 blocks at a time, with 4 lanes per block 49 + ######################################################################## 50 + 51 + #ifdef CONFIG_AS_AVX2 52 + #include <linux/linkage.h> 53 + 54 + ## assume buffers not aligned 55 + #define VMOVDQ vmovdqu 56 + 57 + ################################ Define Macros 58 + 59 + # addm [mem], reg 60 + # Add reg to mem using reg-mem add and store 61 + .macro addm p1 p2 62 + add \p1, \p2 63 + mov \p2, \p1 64 + .endm 65 + 66 + ################################ 67 + 68 + X0 = %ymm4 69 + X1 = %ymm5 70 + X2 = %ymm6 71 + X3 = %ymm7 72 + 73 + # XMM versions of above 74 + XWORD0 = %xmm4 75 + XWORD1 = %xmm5 76 + XWORD2 = %xmm6 77 + XWORD3 = %xmm7 78 + 79 + XTMP0 = %ymm0 80 + XTMP1 = %ymm1 81 + XTMP2 = %ymm2 82 + XTMP3 = %ymm3 83 + XTMP4 = %ymm8 84 + XFER = %ymm9 85 + XTMP5 = %ymm11 86 + 87 + SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA 88 + SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 89 + BYTE_FLIP_MASK = %ymm13 90 + 91 + X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK 92 + 93 + NUM_BLKS = %rdx # 3rd arg 94 + CTX = %rsi # 2nd arg 95 + INP = %rdi # 1st arg 96 + c = %ecx 97 + d = %r8d 98 + e = %edx # clobbers NUM_BLKS 99 + y3 = %edi # clobbers INP 100 + 101 + 102 + TBL = %rbp 103 + SRND = CTX # SRND is same register as CTX 104 + 105 + a = %eax 106 + b = %ebx 107 + f = %r9d 108 + g = %r10d 109 + h = %r11d 110 + old_h = %r11d 111 + 112 + T1 = %r12d 113 + y0 = %r13d 114 + y1 = %r14d 115 + y2 = %r15d 116 + 117 + 118 + _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round 119 + _XMM_SAVE_SIZE = 0 120 + _INP_END_SIZE = 8 121 + _INP_SIZE = 8 122 + _CTX_SIZE = 8 123 + _RSP_SIZE = 8 124 + 125 + _XFER = 0 126 + _XMM_SAVE = _XFER + _XFER_SIZE 127 + _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE 128 + _INP = _INP_END + _INP_END_SIZE 129 + _CTX = _INP + _INP_SIZE 130 + _RSP = _CTX + _CTX_SIZE 131 + STACK_SIZE = _RSP + _RSP_SIZE 132 + 133 + # rotate_Xs 134 + # Rotate values of symbols X0...X3 135 + .macro rotate_Xs 136 + X_ = X0 137 + X0 = X1 138 + X1 = X2 139 + X2 = X3 140 + X3 = X_ 141 + .endm 142 + 143 + # ROTATE_ARGS 144 + # Rotate values of symbols a...h 145 + .macro ROTATE_ARGS 146 + old_h = h 147 + TMP_ = h 148 + h = g 149 + g = f 150 + f = e 151 + e = d 152 + d = c 153 + c = b 154 + b = a 155 + a = TMP_ 156 + .endm 157 + 158 + .macro FOUR_ROUNDS_AND_SCHED disp 159 + ################################### RND N + 0 ############################ 160 + 161 + mov a, y3 # y3 = a # MAJA 162 + rorx $25, e, y0 # y0 = e >> 25 # S1A 163 + rorx $11, e, y1 # y1 = e >> 11 # S1B 164 + 165 + addl \disp(%rsp, SRND), h # h = k + w + h # -- 166 + or c, y3 # y3 = a|c # MAJA 167 + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] 168 + mov f, y2 # y2 = f # CH 169 + rorx $13, a, T1 # T1 = a >> 13 # S0B 170 + 171 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 172 + xor g, y2 # y2 = f^g # CH 173 + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 174 + rorx $6, e, y1 # y1 = (e >> 6) # S1 175 + 176 + and e, y2 # y2 = (f^g)&e # CH 177 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 178 + rorx $22, a, y1 # y1 = a >> 22 # S0A 179 + add h, d # d = k + w + h + d # -- 180 + 181 + and b, y3 # y3 = (a|c)&b # MAJA 182 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] 183 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 184 + rorx $2, a, T1 # T1 = (a >> 2) # S0 185 + 186 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 187 + vpsrld $7, XTMP1, XTMP2 188 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 
189 + mov a, T1 # T1 = a # MAJB 190 + and c, T1 # T1 = a&c # MAJB 191 + 192 + add y0, y2 # y2 = S1 + CH # -- 193 + vpslld $(32-7), XTMP1, XTMP3 194 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 195 + add y1, h # h = k + w + h + S0 # -- 196 + 197 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 198 + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 199 + 200 + vpsrld $18, XTMP1, XTMP2 201 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 202 + add y3, h # h = t1 + S0 + MAJ # -- 203 + 204 + 205 + ROTATE_ARGS 206 + 207 + ################################### RND N + 1 ############################ 208 + 209 + mov a, y3 # y3 = a # MAJA 210 + rorx $25, e, y0 # y0 = e >> 25 # S1A 211 + rorx $11, e, y1 # y1 = e >> 11 # S1B 212 + offset = \disp + 1*4 213 + addl offset(%rsp, SRND), h # h = k + w + h # -- 214 + or c, y3 # y3 = a|c # MAJA 215 + 216 + 217 + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 218 + mov f, y2 # y2 = f # CH 219 + rorx $13, a, T1 # T1 = a >> 13 # S0B 220 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 221 + xor g, y2 # y2 = f^g # CH 222 + 223 + 224 + rorx $6, e, y1 # y1 = (e >> 6) # S1 225 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 226 + rorx $22, a, y1 # y1 = a >> 22 # S0A 227 + and e, y2 # y2 = (f^g)&e # CH 228 + add h, d # d = k + w + h + d # -- 229 + 230 + vpslld $(32-18), XTMP1, XTMP1 231 + and b, y3 # y3 = (a|c)&b # MAJA 232 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 233 + 234 + vpxor XTMP1, XTMP3, XTMP3 235 + rorx $2, a, T1 # T1 = (a >> 2) # S0 236 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 237 + 238 + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 239 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 240 + mov a, T1 # T1 = a # MAJB 241 + and c, T1 # T1 = a&c # MAJB 242 + add y0, y2 # y2 = S1 + CH # -- 243 + 244 + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 245 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} 246 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 247 + add y1, h # h = k + w + h + S0 # -- 248 + 249 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 250 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 251 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 252 + add y3, h # h = t1 + S0 + MAJ # -- 253 + 254 + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} 255 + 256 + 257 + ROTATE_ARGS 258 + 259 + ################################### RND N + 2 ############################ 260 + 261 + mov a, y3 # y3 = a # MAJA 262 + rorx $25, e, y0 # y0 = e >> 25 # S1A 263 + offset = \disp + 2*4 264 + addl offset(%rsp, SRND), h # h = k + w + h # -- 265 + 266 + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} 267 + rorx $11, e, y1 # y1 = e >> 11 # S1B 268 + or c, y3 # y3 = a|c # MAJA 269 + mov f, y2 # y2 = f # CH 270 + xor g, y2 # y2 = f^g # CH 271 + 272 + rorx $13, a, T1 # T1 = a >> 13 # S0B 273 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 274 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} 275 + and e, y2 # y2 = (f^g)&e # CH 276 + 277 + rorx $6, e, y1 # y1 = (e >> 6) # S1 278 + vpxor XTMP3, XTMP2, XTMP2 279 + add h, d # d = k + w + h + d # -- 280 + and b, y3 # y3 = (a|c)&b # MAJA 281 + 282 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 283 + rorx $22, a, y1 # y1 = a >> 22 # S0A 284 + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} 285 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 286 + 287 + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} 288 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 289 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 290 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} 291 + 292 + xor 
T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 293 + mov a, T1 # T1 = a # MAJB 294 + and c, T1 # T1 = a&c # MAJB 295 + add y0, y2 # y2 = S1 + CH # -- 296 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} 297 + 298 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 299 + add y1,h # h = k + w + h + S0 # -- 300 + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- 301 + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 302 + 303 + add y3,h # h = t1 + S0 + MAJ # -- 304 + 305 + 306 + ROTATE_ARGS 307 + 308 + ################################### RND N + 3 ############################ 309 + 310 + mov a, y3 # y3 = a # MAJA 311 + rorx $25, e, y0 # y0 = e >> 25 # S1A 312 + rorx $11, e, y1 # y1 = e >> 11 # S1B 313 + offset = \disp + 3*4 314 + addl offset(%rsp, SRND), h # h = k + w + h # -- 315 + or c, y3 # y3 = a|c # MAJA 316 + 317 + 318 + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} 319 + mov f, y2 # y2 = f # CH 320 + rorx $13, a, T1 # T1 = a >> 13 # S0B 321 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 322 + xor g, y2 # y2 = f^g # CH 323 + 324 + 325 + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} 326 + rorx $6, e, y1 # y1 = (e >> 6) # S1 327 + and e, y2 # y2 = (f^g)&e # CH 328 + add h, d # d = k + w + h + d # -- 329 + and b, y3 # y3 = (a|c)&b # MAJA 330 + 331 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} 332 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 333 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 334 + 335 + vpxor XTMP3, XTMP2, XTMP2 336 + rorx $22, a, y1 # y1 = a >> 22 # S0A 337 + add y0, y2 # y2 = S1 + CH # -- 338 + 339 + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} 340 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 341 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 342 + 343 + rorx $2, a, T1 # T1 = (a >> 2) # S0 344 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} 345 + 346 + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} 347 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 348 + mov a, T1 # T1 = a # MAJB 349 + and c, T1 # T1 = a&c # MAJB 350 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 351 + 352 + add y1, h # h = k + w + h + S0 # -- 353 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 354 + add y3, h # h = t1 + S0 + MAJ # -- 355 + 356 + ROTATE_ARGS 357 + rotate_Xs 358 + .endm 359 + 360 + .macro DO_4ROUNDS disp 361 + ################################### RND N + 0 ########################### 362 + 363 + mov f, y2 # y2 = f # CH 364 + rorx $25, e, y0 # y0 = e >> 25 # S1A 365 + rorx $11, e, y1 # y1 = e >> 11 # S1B 366 + xor g, y2 # y2 = f^g # CH 367 + 368 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 369 + rorx $6, e, y1 # y1 = (e >> 6) # S1 370 + and e, y2 # y2 = (f^g)&e # CH 371 + 372 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 373 + rorx $13, a, T1 # T1 = a >> 13 # S0B 374 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 375 + rorx $22, a, y1 # y1 = a >> 22 # S0A 376 + mov a, y3 # y3 = a # MAJA 377 + 378 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 379 + rorx $2, a, T1 # T1 = (a >> 2) # S0 380 + addl \disp(%rsp, SRND), h # h = k + w + h # -- 381 + or c, y3 # y3 = a|c # MAJA 382 + 383 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 384 + mov a, T1 # T1 = a # MAJB 385 + and b, y3 # y3 = (a|c)&b # MAJA 386 + and c, T1 # T1 = a&c # MAJB 387 + add y0, y2 # y2 = S1 + CH # -- 388 + 389 + 390 + add h, d # d = k + w + h + d # -- 391 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 392 + add y1, h # h = k + w + h + S0 # -- 393 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 394 + 395 + ROTATE_ARGS 396 + 397 + 
################################### RND N + 1 ########################### 398 + 399 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 400 + mov f, y2 # y2 = f # CH 401 + rorx $25, e, y0 # y0 = e >> 25 # S1A 402 + rorx $11, e, y1 # y1 = e >> 11 # S1B 403 + xor g, y2 # y2 = f^g # CH 404 + 405 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 406 + rorx $6, e, y1 # y1 = (e >> 6) # S1 407 + and e, y2 # y2 = (f^g)&e # CH 408 + add y3, old_h # h = t1 + S0 + MAJ # -- 409 + 410 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 411 + rorx $13, a, T1 # T1 = a >> 13 # S0B 412 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 413 + rorx $22, a, y1 # y1 = a >> 22 # S0A 414 + mov a, y3 # y3 = a # MAJA 415 + 416 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 417 + rorx $2, a, T1 # T1 = (a >> 2) # S0 418 + offset = 4*1 + \disp 419 + addl offset(%rsp, SRND), h # h = k + w + h # -- 420 + or c, y3 # y3 = a|c # MAJA 421 + 422 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 423 + mov a, T1 # T1 = a # MAJB 424 + and b, y3 # y3 = (a|c)&b # MAJA 425 + and c, T1 # T1 = a&c # MAJB 426 + add y0, y2 # y2 = S1 + CH # -- 427 + 428 + 429 + add h, d # d = k + w + h + d # -- 430 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 431 + add y1, h # h = k + w + h + S0 # -- 432 + 433 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 434 + 435 + ROTATE_ARGS 436 + 437 + ################################### RND N + 2 ############################## 438 + 439 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 440 + mov f, y2 # y2 = f # CH 441 + rorx $25, e, y0 # y0 = e >> 25 # S1A 442 + rorx $11, e, y1 # y1 = e >> 11 # S1B 443 + xor g, y2 # y2 = f^g # CH 444 + 445 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 446 + rorx $6, e, y1 # y1 = (e >> 6) # S1 447 + and e, y2 # y2 = (f^g)&e # CH 448 + add y3, old_h # h = t1 + S0 + MAJ # -- 449 + 450 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 451 + rorx $13, a, T1 # T1 = a >> 13 # S0B 452 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 453 + rorx $22, a, y1 # y1 = a >> 22 # S0A 454 + mov a, y3 # y3 = a # MAJA 455 + 456 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 457 + rorx $2, a, T1 # T1 = (a >> 2) # S0 458 + offset = 4*2 + \disp 459 + addl offset(%rsp, SRND), h # h = k + w + h # -- 460 + or c, y3 # y3 = a|c # MAJA 461 + 462 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 463 + mov a, T1 # T1 = a # MAJB 464 + and b, y3 # y3 = (a|c)&b # MAJA 465 + and c, T1 # T1 = a&c # MAJB 466 + add y0, y2 # y2 = S1 + CH # -- 467 + 468 + 469 + add h, d # d = k + w + h + d # -- 470 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 471 + add y1, h # h = k + w + h + S0 # -- 472 + 473 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 474 + 475 + ROTATE_ARGS 476 + 477 + ################################### RND N + 3 ########################### 478 + 479 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 480 + mov f, y2 # y2 = f # CH 481 + rorx $25, e, y0 # y0 = e >> 25 # S1A 482 + rorx $11, e, y1 # y1 = e >> 11 # S1B 483 + xor g, y2 # y2 = f^g # CH 484 + 485 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 486 + rorx $6, e, y1 # y1 = (e >> 6) # S1 487 + and e, y2 # y2 = (f^g)&e # CH 488 + add y3, old_h # h = t1 + S0 + MAJ # -- 489 + 490 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 491 + rorx $13, a, T1 # T1 = a >> 13 # S0B 492 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 493 + rorx $22, a, y1 # y1 = a >> 22 # S0A 494 + mov a, y3 # y3 = a # MAJA 495 + 496 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 497 + rorx $2, a, T1 # T1 = (a >> 2) # S0 498 + offset = 4*3 + \disp 499 + addl offset(%rsp, SRND), 
h # h = k + w + h # -- 500 + or c, y3 # y3 = a|c # MAJA 501 + 502 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 503 + mov a, T1 # T1 = a # MAJB 504 + and b, y3 # y3 = (a|c)&b # MAJA 505 + and c, T1 # T1 = a&c # MAJB 506 + add y0, y2 # y2 = S1 + CH # -- 507 + 508 + 509 + add h, d # d = k + w + h + d # -- 510 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 511 + add y1, h # h = k + w + h + S0 # -- 512 + 513 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 514 + 515 + 516 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 517 + 518 + add y3, h # h = t1 + S0 + MAJ # -- 519 + 520 + ROTATE_ARGS 521 + 522 + .endm 523 + 524 + ######################################################################## 525 + ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) 526 + ## arg 1 : pointer to input data 527 + ## arg 2 : pointer to digest 528 + ## arg 3 : Num blocks 529 + ######################################################################## 530 + .text 531 + ENTRY(sha256_transform_rorx) 532 + .align 32 533 + pushq %rbx 534 + pushq %rbp 535 + pushq %r12 536 + pushq %r13 537 + pushq %r14 538 + pushq %r15 539 + 540 + mov %rsp, %rax 541 + subq $STACK_SIZE, %rsp 542 + and $-32, %rsp # align rsp to 32 byte boundary 543 + mov %rax, _RSP(%rsp) 544 + 545 + 546 + shl $6, NUM_BLKS # convert to bytes 547 + jz done_hash 548 + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block 549 + mov NUM_BLKS, _INP_END(%rsp) 550 + 551 + cmp NUM_BLKS, INP 552 + je only_one_block 553 + 554 + ## load initial digest 555 + mov (CTX), a 556 + mov 4*1(CTX), b 557 + mov 4*2(CTX), c 558 + mov 4*3(CTX), d 559 + mov 4*4(CTX), e 560 + mov 4*5(CTX), f 561 + mov 4*6(CTX), g 562 + mov 4*7(CTX), h 563 + 564 + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 565 + vmovdqa _SHUF_00BA(%rip), SHUF_00BA 566 + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 567 + 568 + mov CTX, _CTX(%rsp) 569 + 570 + loop0: 571 + lea K256(%rip), TBL 572 + 573 + ## Load first 16 dwords from two blocks 574 + VMOVDQ 0*32(INP),XTMP0 575 + VMOVDQ 1*32(INP),XTMP1 576 + VMOVDQ 2*32(INP),XTMP2 577 + VMOVDQ 3*32(INP),XTMP3 578 + 579 + ## byte swap data 580 + vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 581 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 582 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 583 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 584 + 585 + ## transpose data into high/low halves 586 + vperm2i128 $0x20, XTMP2, XTMP0, X0 587 + vperm2i128 $0x31, XTMP2, XTMP0, X1 588 + vperm2i128 $0x20, XTMP3, XTMP1, X2 589 + vperm2i128 $0x31, XTMP3, XTMP1, X3 590 + 591 + last_block_enter: 592 + add $64, INP 593 + mov INP, _INP(%rsp) 594 + 595 + ## schedule 48 input dwords, by doing 3 rounds of 12 each 596 + xor SRND, SRND 597 + 598 + .align 16 599 + loop1: 600 + vpaddd 0*32(TBL, SRND), X0, XFER 601 + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) 602 + FOUR_ROUNDS_AND_SCHED _XFER + 0*32 603 + 604 + vpaddd 1*32(TBL, SRND), X0, XFER 605 + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) 606 + FOUR_ROUNDS_AND_SCHED _XFER + 1*32 607 + 608 + vpaddd 2*32(TBL, SRND), X0, XFER 609 + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) 610 + FOUR_ROUNDS_AND_SCHED _XFER + 2*32 611 + 612 + vpaddd 3*32(TBL, SRND), X0, XFER 613 + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) 614 + FOUR_ROUNDS_AND_SCHED _XFER + 3*32 615 + 616 + add $4*32, SRND 617 + cmp $3*4*32, SRND 618 + jb loop1 619 + 620 + loop2: 621 + ## Do last 16 rounds with no scheduling 622 + vpaddd 0*32(TBL, SRND), X0, XFER 623 + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) 624 + DO_4ROUNDS _XFER + 0*32 625 + vpaddd 1*32(TBL, SRND), X1, XFER 626 + vmovdqa XFER, 
1*32+_XFER(%rsp, SRND) 627 + DO_4ROUNDS _XFER + 1*32 628 + add $2*32, SRND 629 + 630 + vmovdqa X2, X0 631 + vmovdqa X3, X1 632 + 633 + cmp $4*4*32, SRND 634 + jb loop2 635 + 636 + mov _CTX(%rsp), CTX 637 + mov _INP(%rsp), INP 638 + 639 + addm (4*0)(CTX),a 640 + addm (4*1)(CTX),b 641 + addm (4*2)(CTX),c 642 + addm (4*3)(CTX),d 643 + addm (4*4)(CTX),e 644 + addm (4*5)(CTX),f 645 + addm (4*6)(CTX),g 646 + addm (4*7)(CTX),h 647 + 648 + cmp _INP_END(%rsp), INP 649 + ja done_hash 650 + 651 + #### Do second block using previously scheduled results 652 + xor SRND, SRND 653 + .align 16 654 + loop3: 655 + DO_4ROUNDS _XFER + 0*32 + 16 656 + DO_4ROUNDS _XFER + 1*32 + 16 657 + add $2*32, SRND 658 + cmp $4*4*32, SRND 659 + jb loop3 660 + 661 + mov _CTX(%rsp), CTX 662 + mov _INP(%rsp), INP 663 + add $64, INP 664 + 665 + addm (4*0)(CTX),a 666 + addm (4*1)(CTX),b 667 + addm (4*2)(CTX),c 668 + addm (4*3)(CTX),d 669 + addm (4*4)(CTX),e 670 + addm (4*5)(CTX),f 671 + addm (4*6)(CTX),g 672 + addm (4*7)(CTX),h 673 + 674 + cmp _INP_END(%rsp), INP 675 + jb loop0 676 + ja done_hash 677 + 678 + do_last_block: 679 + #### do last block 680 + lea K256(%rip), TBL 681 + 682 + VMOVDQ 0*16(INP),XWORD0 683 + VMOVDQ 1*16(INP),XWORD1 684 + VMOVDQ 2*16(INP),XWORD2 685 + VMOVDQ 3*16(INP),XWORD3 686 + 687 + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 688 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 689 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 690 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 691 + 692 + jmp last_block_enter 693 + 694 + only_one_block: 695 + 696 + ## load initial digest 697 + mov (4*0)(CTX),a 698 + mov (4*1)(CTX),b 699 + mov (4*2)(CTX),c 700 + mov (4*3)(CTX),d 701 + mov (4*4)(CTX),e 702 + mov (4*5)(CTX),f 703 + mov (4*6)(CTX),g 704 + mov (4*7)(CTX),h 705 + 706 + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 707 + vmovdqa _SHUF_00BA(%rip), SHUF_00BA 708 + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 709 + 710 + mov CTX, _CTX(%rsp) 711 + jmp do_last_block 712 + 713 + done_hash: 714 + 715 + mov _RSP(%rsp), %rsp 716 + 717 + popq %r15 718 + popq %r14 719 + popq %r13 720 + popq %r12 721 + popq %rbp 722 + popq %rbx 723 + ret 724 + ENDPROC(sha256_transform_rorx) 725 + 726 + .data 727 + .align 64 728 + K256: 729 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 730 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 731 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 732 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 733 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 734 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 735 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 736 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 737 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 738 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 739 + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 740 + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 741 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 742 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 743 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 744 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 745 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 746 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 747 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 748 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 749 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 750 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 751 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 752 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 
753 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 754 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 755 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 756 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 757 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 758 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 759 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 760 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 761 + 762 + PSHUFFLE_BYTE_FLIP_MASK: 763 + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 764 + 765 + # shuffle xBxA -> 00BA 766 + _SHUF_00BA: 767 + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 768 + 769 + # shuffle xDxC -> DC00 770 + _SHUF_DC00: 771 + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF 772 + #endif
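[Editor's note] The AVX2 routine above processes two blocks per pass, computing the SHA-256 message schedule four dwords at a time in ymm registers while the rounds run in scalar registers. For reference, the recurrence that FOUR_ROUNDS_AND_SCHED vectorizes is the standard FIPS 180-4 schedule; a minimal scalar sketch in C (illustrative only, not kernel code):

    #include <stdint.h>

    /* Rotate right; mirrors what the asm gets from rorx/ror. */
    static inline uint32_t ror32(uint32_t x, unsigned int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* Expand w[0..15] (the byte-swapped input block) to w[0..63]. */
    static void sha256_schedule(uint32_t w[64])
    {
            int t;

            for (t = 16; t < 64; t++) {
                    uint32_t s0 = ror32(w[t - 15], 7) ^ ror32(w[t - 15], 18) ^
                                  (w[t - 15] >> 3);
                    uint32_t s1 = ror32(w[t - 2], 17) ^ ror32(w[t - 2], 19) ^
                                  (w[t - 2] >> 10);

                    w[t] = s1 + w[t - 7] + s0 + w[t - 16];
            }
    }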
+506
arch/x86/crypto/sha256-ssse3-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-256 with SSSE3 instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # Tim Chen <tim.c.chen@linux.intel.com> 10 + # 11 + # This software is available to you under a choice of one of two 12 + # licenses. You may choose to be licensed under the terms of the GNU 13 + # General Public License (GPL) Version 2, available from the file 14 + # COPYING in the main directory of this source tree, or the 15 + # OpenIB.org BSD license below: 16 + # 17 + # Redistribution and use in source and binary forms, with or 18 + # without modification, are permitted provided that the following 19 + # conditions are met: 20 + # 21 + # - Redistributions of source code must retain the above 22 + # copyright notice, this list of conditions and the following 23 + # disclaimer. 24 + # 25 + # - Redistributions in binary form must reproduce the above 26 + # copyright notice, this list of conditions and the following 27 + # disclaimer in the documentation and/or other materials 28 + # provided with the distribution. 29 + # 30 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 31 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 32 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 33 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 34 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 35 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 36 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 + # SOFTWARE. 38 + # 39 + ######################################################################## 40 + # 41 + # This code is described in an Intel White-Paper: 42 + # "Fast SHA-256 Implementations on Intel Architecture Processors" 43 + # 44 + # To find it, surf to http://www.intel.com/p/en_US/embedded 45 + # and search for that title. 
46 + # 47 + ######################################################################## 48 + 49 + #include <linux/linkage.h> 50 + 51 + ## assume buffers not aligned 52 + #define MOVDQ movdqu 53 + 54 + ################################ Define Macros 55 + 56 + # addm [mem], reg 57 + # Add reg to mem using reg-mem add and store 58 + .macro addm p1 p2 59 + add \p1, \p2 60 + mov \p2, \p1 61 + .endm 62 + 63 + ################################ 64 + 65 + # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask 66 + # Load xmm with mem and byte swap each dword 67 + .macro COPY_XMM_AND_BSWAP p1 p2 p3 68 + MOVDQ \p2, \p1 69 + pshufb \p3, \p1 70 + .endm 71 + 72 + ################################ 73 + 74 + X0 = %xmm4 75 + X1 = %xmm5 76 + X2 = %xmm6 77 + X3 = %xmm7 78 + 79 + XTMP0 = %xmm0 80 + XTMP1 = %xmm1 81 + XTMP2 = %xmm2 82 + XTMP3 = %xmm3 83 + XTMP4 = %xmm8 84 + XFER = %xmm9 85 + 86 + SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA 87 + SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 88 + BYTE_FLIP_MASK = %xmm12 89 + 90 + NUM_BLKS = %rdx # 3rd arg 91 + CTX = %rsi # 2nd arg 92 + INP = %rdi # 1st arg 93 + 94 + SRND = %rdi # clobbers INP 95 + c = %ecx 96 + d = %r8d 97 + e = %edx 98 + TBL = %rbp 99 + a = %eax 100 + b = %ebx 101 + 102 + f = %r9d 103 + g = %r10d 104 + h = %r11d 105 + 106 + y0 = %r13d 107 + y1 = %r14d 108 + y2 = %r15d 109 + 110 + 111 + 112 + _INP_END_SIZE = 8 113 + _INP_SIZE = 8 114 + _XFER_SIZE = 8 115 + _XMM_SAVE_SIZE = 0 116 + 117 + _INP_END = 0 118 + _INP = _INP_END + _INP_END_SIZE 119 + _XFER = _INP + _INP_SIZE 120 + _XMM_SAVE = _XFER + _XFER_SIZE 121 + STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE 122 + 123 + # rotate_Xs 124 + # Rotate values of symbols X0...X3 125 + .macro rotate_Xs 126 + X_ = X0 127 + X0 = X1 128 + X1 = X2 129 + X2 = X3 130 + X3 = X_ 131 + .endm 132 + 133 + # ROTATE_ARGS 134 + # Rotate values of symbols a...h 135 + .macro ROTATE_ARGS 136 + TMP_ = h 137 + h = g 138 + g = f 139 + f = e 140 + e = d 141 + d = c 142 + c = b 143 + b = a 144 + a = TMP_ 145 + .endm 146 + 147 + .macro FOUR_ROUNDS_AND_SCHED 148 + ## compute s0 four at a time and s1 two at a time 149 + ## compute W[-16] + W[-7] 4 at a time 150 + movdqa X3, XTMP0 151 + mov e, y0 # y0 = e 152 + ror $(25-11), y0 # y0 = e >> (25-11) 153 + mov a, y1 # y1 = a 154 + palignr $4, X2, XTMP0 # XTMP0 = W[-7] 155 + ror $(22-13), y1 # y1 = a >> (22-13) 156 + xor e, y0 # y0 = e ^ (e >> (25-11)) 157 + mov f, y2 # y2 = f 158 + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 159 + movdqa X1, XTMP1 160 + xor a, y1 # y1 = a ^ (a >> (22-13) 161 + xor g, y2 # y2 = f^g 162 + paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] 163 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 164 + and e, y2 # y2 = (f^g)&e 165 + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 166 + ## compute s0 167 + palignr $4, X0, XTMP1 # XTMP1 = W[-15] 168 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 169 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 170 + xor g, y2 # y2 = CH = ((f^g)&e)^g 171 + movdqa XTMP1, XTMP2 # XTMP2 = W[-15] 172 + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 173 + add y0, y2 # y2 = S1 + CH 174 + add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH 175 + movdqa XTMP1, XTMP3 # XTMP3 = W[-15] 176 + mov a, y0 # y0 = a 177 + add y2, h # h = h + S1 + CH + k + w 178 + mov a, y2 # y2 = a 179 + pslld $(32-7), XTMP1 # 180 + or c, y0 # y0 = a|c 181 + add h, d # d = d + h + S1 + CH + k + w 182 + and c, y2 # y2 = a&c 183 + psrld $7, XTMP2 # 184 + and b, y0 # y0 = (a|c)&b 185 + add y1, h # h = h + S1 + CH + k + w + S0 186 + por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 187 
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 188 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 189 + # 190 + ROTATE_ARGS # 191 + movdqa XTMP3, XTMP2 # XTMP2 = W[-15] 192 + mov e, y0 # y0 = e 193 + mov a, y1 # y1 = a 194 + movdqa XTMP3, XTMP4 # XTMP4 = W[-15] 195 + ror $(25-11), y0 # y0 = e >> (25-11) 196 + xor e, y0 # y0 = e ^ (e >> (25-11)) 197 + mov f, y2 # y2 = f 198 + ror $(22-13), y1 # y1 = a >> (22-13) 199 + pslld $(32-18), XTMP3 # 200 + xor a, y1 # y1 = a ^ (a >> (22-13) 201 + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 202 + xor g, y2 # y2 = f^g 203 + psrld $18, XTMP2 # 204 + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 205 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 206 + and e, y2 # y2 = (f^g)&e 207 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 208 + pxor XTMP3, XTMP1 209 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 210 + xor g, y2 # y2 = CH = ((f^g)&e)^g 211 + psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 212 + add y0, y2 # y2 = S1 + CH 213 + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 214 + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 215 + pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 216 + mov a, y0 # y0 = a 217 + add y2, h # h = h + S1 + CH + k + w 218 + mov a, y2 # y2 = a 219 + pxor XTMP4, XTMP1 # XTMP1 = s0 220 + or c, y0 # y0 = a|c 221 + add h, d # d = d + h + S1 + CH + k + w 222 + and c, y2 # y2 = a&c 223 + ## compute low s1 224 + pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} 225 + and b, y0 # y0 = (a|c)&b 226 + add y1, h # h = h + S1 + CH + k + w + S0 227 + paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 228 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 229 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 230 + 231 + ROTATE_ARGS 232 + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} 233 + mov e, y0 # y0 = e 234 + mov a, y1 # y1 = a 235 + ror $(25-11), y0 # y0 = e >> (25-11) 236 + movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} 237 + xor e, y0 # y0 = e ^ (e >> (25-11)) 238 + ror $(22-13), y1 # y1 = a >> (22-13) 239 + mov f, y2 # y2 = f 240 + xor a, y1 # y1 = a ^ (a >> (22-13) 241 + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 242 + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} 243 + xor g, y2 # y2 = f^g 244 + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} 245 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 246 + and e, y2 # y2 = (f^g)&e 247 + psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} 248 + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 249 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 250 + xor g, y2 # y2 = CH = ((f^g)&e)^g 251 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 252 + pxor XTMP3, XTMP2 253 + add y0, y2 # y2 = S1 + CH 254 + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 255 + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 256 + pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} 257 + mov a, y0 # y0 = a 258 + add y2, h # h = h + S1 + CH + k + w 259 + mov a, y2 # y2 = a 260 + pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} 261 + or c, y0 # y0 = a|c 262 + add h, d # d = d + h + S1 + CH + k + w 263 + and c, y2 # y2 = a&c 264 + paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} 265 + and b, y0 # y0 = (a|c)&b 266 + add y1, h # h = h + S1 + CH + k + w + S0 267 + ## compute high s1 268 + pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} 269 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 270 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 271 + # 272 + ROTATE_ARGS # 273 + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} 274 + mov e, y0 # y0 = e 275 + ror $(25-11), y0 # y0 = e >> (25-11) 276 + mov a, y1 # 
y1 = a 277 + movdqa XTMP2, X0 # X0 = W[-2] {DDCC} 278 + ror $(22-13), y1 # y1 = a >> (22-13) 279 + xor e, y0 # y0 = e ^ (e >> (25-11)) 280 + mov f, y2 # y2 = f 281 + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 282 + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} 283 + xor a, y1 # y1 = a ^ (a >> (22-13) 284 + xor g, y2 # y2 = f^g 285 + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} 286 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 287 + and e, y2 # y2 = (f^g)&e 288 + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 289 + psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} 290 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 291 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 292 + xor g, y2 # y2 = CH = ((f^g)&e)^g 293 + pxor XTMP3, XTMP2 # 294 + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 295 + add y0, y2 # y2 = S1 + CH 296 + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH 297 + pxor XTMP2, X0 # X0 = s1 {xDxC} 298 + mov a, y0 # y0 = a 299 + add y2, h # h = h + S1 + CH + k + w 300 + mov a, y2 # y2 = a 301 + pshufb SHUF_DC00, X0 # X0 = s1 {DC00} 302 + or c, y0 # y0 = a|c 303 + add h, d # d = d + h + S1 + CH + k + w 304 + and c, y2 # y2 = a&c 305 + paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} 306 + and b, y0 # y0 = (a|c)&b 307 + add y1, h # h = h + S1 + CH + k + w + S0 308 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 309 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 310 + 311 + ROTATE_ARGS 312 + rotate_Xs 313 + .endm 314 + 315 + ## input is [rsp + _XFER + %1 * 4] 316 + .macro DO_ROUND round 317 + mov e, y0 # y0 = e 318 + ror $(25-11), y0 # y0 = e >> (25-11) 319 + mov a, y1 # y1 = a 320 + xor e, y0 # y0 = e ^ (e >> (25-11)) 321 + ror $(22-13), y1 # y1 = a >> (22-13) 322 + mov f, y2 # y2 = f 323 + xor a, y1 # y1 = a ^ (a >> (22-13) 324 + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) 325 + xor g, y2 # y2 = f^g 326 + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) 327 + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) 328 + and e, y2 # y2 = (f^g)&e 329 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) 330 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) 331 + xor g, y2 # y2 = CH = ((f^g)&e)^g 332 + add y0, y2 # y2 = S1 + CH 333 + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) 334 + offset = \round * 4 + _XFER 335 + add offset(%rsp), y2 # y2 = k + w + S1 + CH 336 + mov a, y0 # y0 = a 337 + add y2, h # h = h + S1 + CH + k + w 338 + mov a, y2 # y2 = a 339 + or c, y0 # y0 = a|c 340 + add h, d # d = d + h + S1 + CH + k + w 341 + and c, y2 # y2 = a&c 342 + and b, y0 # y0 = (a|c)&b 343 + add y1, h # h = h + S1 + CH + k + w + S0 344 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) 345 + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ 346 + ROTATE_ARGS 347 + .endm 348 + 349 + ######################################################################## 350 + ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) 351 + ## arg 1 : pointer to input data 352 + ## arg 2 : pointer to digest 353 + ## arg 3 : Num blocks 354 + ######################################################################## 355 + .text 356 + ENTRY(sha256_transform_ssse3) 357 + .align 32 358 + pushq %rbx 359 + pushq %rbp 360 + pushq %r13 361 + pushq %r14 362 + pushq %r15 363 + pushq %r12 364 + 365 + mov %rsp, %r12 366 + subq $STACK_SIZE, %rsp 367 + and $~15, %rsp 368 + 369 + shl $6, NUM_BLKS # convert to bytes 370 + jz done_hash 371 + add INP, NUM_BLKS 372 + mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data 373 + 374 + ## load initial digest 375 + mov 4*0(CTX), a 376 + mov 4*1(CTX), b 
377 + mov 4*2(CTX), c 378 + mov 4*3(CTX), d 379 + mov 4*4(CTX), e 380 + mov 4*5(CTX), f 381 + mov 4*6(CTX), g 382 + mov 4*7(CTX), h 383 + 384 + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 385 + movdqa _SHUF_00BA(%rip), SHUF_00BA 386 + movdqa _SHUF_DC00(%rip), SHUF_DC00 387 + 388 + loop0: 389 + lea K256(%rip), TBL 390 + 391 + ## byte swap first 16 dwords 392 + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK 393 + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK 394 + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK 395 + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK 396 + 397 + mov INP, _INP(%rsp) 398 + 399 + ## schedule 48 input dwords, by doing 3 rounds of 16 each 400 + mov $3, SRND 401 + .align 16 402 + loop1: 403 + movdqa (TBL), XFER 404 + paddd X0, XFER 405 + movdqa XFER, _XFER(%rsp) 406 + FOUR_ROUNDS_AND_SCHED 407 + 408 + movdqa 1*16(TBL), XFER 409 + paddd X0, XFER 410 + movdqa XFER, _XFER(%rsp) 411 + FOUR_ROUNDS_AND_SCHED 412 + 413 + movdqa 2*16(TBL), XFER 414 + paddd X0, XFER 415 + movdqa XFER, _XFER(%rsp) 416 + FOUR_ROUNDS_AND_SCHED 417 + 418 + movdqa 3*16(TBL), XFER 419 + paddd X0, XFER 420 + movdqa XFER, _XFER(%rsp) 421 + add $4*16, TBL 422 + FOUR_ROUNDS_AND_SCHED 423 + 424 + sub $1, SRND 425 + jne loop1 426 + 427 + mov $2, SRND 428 + loop2: 429 + paddd (TBL), X0 430 + movdqa X0, _XFER(%rsp) 431 + DO_ROUND 0 432 + DO_ROUND 1 433 + DO_ROUND 2 434 + DO_ROUND 3 435 + paddd 1*16(TBL), X1 436 + movdqa X1, _XFER(%rsp) 437 + add $2*16, TBL 438 + DO_ROUND 0 439 + DO_ROUND 1 440 + DO_ROUND 2 441 + DO_ROUND 3 442 + 443 + movdqa X2, X0 444 + movdqa X3, X1 445 + 446 + sub $1, SRND 447 + jne loop2 448 + 449 + addm (4*0)(CTX),a 450 + addm (4*1)(CTX),b 451 + addm (4*2)(CTX),c 452 + addm (4*3)(CTX),d 453 + addm (4*4)(CTX),e 454 + addm (4*5)(CTX),f 455 + addm (4*6)(CTX),g 456 + addm (4*7)(CTX),h 457 + 458 + mov _INP(%rsp), INP 459 + add $64, INP 460 + cmp _INP_END(%rsp), INP 461 + jne loop0 462 + 463 + done_hash: 464 + 465 + mov %r12, %rsp 466 + 467 + popq %r12 468 + popq %r15 469 + popq %r14 470 + popq %r13 471 + popq %rbp 472 + popq %rbx 473 + 474 + ret 475 + ENDPROC(sha256_transform_ssse3) 476 + 477 + .data 478 + .align 64 479 + K256: 480 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 481 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 482 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 483 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 484 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 485 + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 486 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 487 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 488 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 489 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 490 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 491 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 492 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 493 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 494 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 495 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 496 + 497 + PSHUFFLE_BYTE_FLIP_MASK: 498 + .octa 0x0c0d0e0f08090a0b0405060700010203 499 + 500 + # shuffle xBxA -> 00BA 501 + _SHUF_00BA: 502 + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 503 + 504 + # shuffle xDxC -> DC00 505 + _SHUF_DC00: 506 + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
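[Editor's note] The DO_ROUND macro above is the standard SHA-256 round, with the register names rotated by ROTATE_ARGS instead of values being moved. A scalar C equivalent of one round (an illustrative sketch; k_plus_w stands for the precomputed K[t]+W[t] value staged in _XFER):

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, unsigned int n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* One round; s[0..7] are the working variables a..h. */
    static void sha256_round(uint32_t s[8], uint32_t k_plus_w)
    {
            uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
            uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
            uint32_t ch = ((f ^ g) & e) ^ g;
            uint32_t t1 = h + S1 + ch + k_plus_w;
            uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
            uint32_t maj = ((a | c) & b) | (a & c);

            /* ROTATE_ARGS: new e = d + t1, new a = t1 + S0 + maj. */
            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
    }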
+275
arch/x86/crypto/sha256_ssse3_glue.c
··· 1 + /* 2 + * Cryptographic API. 3 + * 4 + * Glue code for the SHA256 Secure Hash Algorithm assembler 5 + * implementation using supplemental SSE3 / AVX / AVX2 instructions. 6 + * 7 + * This file is based on sha256_generic.c 8 + * 9 + * Copyright (C) 2013 Intel Corporation. 10 + * 11 + * Author: 12 + * Tim Chen <tim.c.chen@linux.intel.com> 13 + * 14 + * This program is free software; you can redistribute it and/or modify it 15 + * under the terms of the GNU General Public License as published by the Free 16 + * Software Foundation; either version 2 of the License, or (at your option) 17 + * any later version. 18 + * 19 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 23 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 24 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 + * SOFTWARE. 27 + */ 28 + 29 + 30 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 31 + 32 + #include <crypto/internal/hash.h> 33 + #include <linux/init.h> 34 + #include <linux/module.h> 35 + #include <linux/mm.h> 36 + #include <linux/cryptohash.h> 37 + #include <linux/types.h> 38 + #include <crypto/sha.h> 39 + #include <asm/byteorder.h> 40 + #include <asm/i387.h> 41 + #include <asm/xcr.h> 42 + #include <asm/xsave.h> 43 + #include <linux/string.h> 44 + 45 + asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, 46 + u64 rounds); 47 + #ifdef CONFIG_AS_AVX 48 + asmlinkage void sha256_transform_avx(const char *data, u32 *digest, 49 + u64 rounds); 50 + #endif 51 + #ifdef CONFIG_AS_AVX2 52 + asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, 53 + u64 rounds); 54 + #endif 55 + 56 + static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); 57 + 58 + 59 + static int sha256_ssse3_init(struct shash_desc *desc) 60 + { 61 + struct sha256_state *sctx = shash_desc_ctx(desc); 62 + 63 + sctx->state[0] = SHA256_H0; 64 + sctx->state[1] = SHA256_H1; 65 + sctx->state[2] = SHA256_H2; 66 + sctx->state[3] = SHA256_H3; 67 + sctx->state[4] = SHA256_H4; 68 + sctx->state[5] = SHA256_H5; 69 + sctx->state[6] = SHA256_H6; 70 + sctx->state[7] = SHA256_H7; 71 + sctx->count = 0; 72 + 73 + return 0; 74 + } 75 + 76 + static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, 77 + unsigned int len, unsigned int partial) 78 + { 79 + struct sha256_state *sctx = shash_desc_ctx(desc); 80 + unsigned int done = 0; 81 + 82 + sctx->count += len; 83 + 84 + if (partial) { 85 + done = SHA256_BLOCK_SIZE - partial; 86 + memcpy(sctx->buf + partial, data, done); 87 + sha256_transform_asm(sctx->buf, sctx->state, 1); 88 + } 89 + 90 + if (len - done >= SHA256_BLOCK_SIZE) { 91 + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; 92 + 93 + sha256_transform_asm(data + done, sctx->state, (u64) rounds); 94 + 95 + done += rounds * SHA256_BLOCK_SIZE; 96 + } 97 + 98 + memcpy(sctx->buf, data + done, len - done); 99 + 100 + return 0; 101 + } 102 + 103 + static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, 104 + unsigned int len) 105 + { 106 + struct sha256_state *sctx = shash_desc_ctx(desc); 107 + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; 108 + int res; 109 + 110 + /* Handle the fast case right here */ 111 + if (partial + len < 
SHA256_BLOCK_SIZE) { 112 + sctx->count += len; 113 + memcpy(sctx->buf + partial, data, len); 114 + 115 + return 0; 116 + } 117 + 118 + if (!irq_fpu_usable()) { 119 + res = crypto_sha256_update(desc, data, len); 120 + } else { 121 + kernel_fpu_begin(); 122 + res = __sha256_ssse3_update(desc, data, len, partial); 123 + kernel_fpu_end(); 124 + } 125 + 126 + return res; 127 + } 128 + 129 + 130 + /* Add padding and return the message digest. */ 131 + static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) 132 + { 133 + struct sha256_state *sctx = shash_desc_ctx(desc); 134 + unsigned int i, index, padlen; 135 + __be32 *dst = (__be32 *)out; 136 + __be64 bits; 137 + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; 138 + 139 + bits = cpu_to_be64(sctx->count << 3); 140 + 141 + /* Pad out to 56 mod 64 and append length */ 142 + index = sctx->count % SHA256_BLOCK_SIZE; 143 + padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); 144 + 145 + if (!irq_fpu_usable()) { 146 + crypto_sha256_update(desc, padding, padlen); 147 + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); 148 + } else { 149 + kernel_fpu_begin(); 150 + /* We need to fill a whole block for __sha256_ssse3_update() */ 151 + if (padlen <= 56) { 152 + sctx->count += padlen; 153 + memcpy(sctx->buf + index, padding, padlen); 154 + } else { 155 + __sha256_ssse3_update(desc, padding, padlen, index); 156 + } 157 + __sha256_ssse3_update(desc, (const u8 *)&bits, 158 + sizeof(bits), 56); 159 + kernel_fpu_end(); 160 + } 161 + 162 + /* Store state in digest */ 163 + for (i = 0; i < 8; i++) 164 + dst[i] = cpu_to_be32(sctx->state[i]); 165 + 166 + /* Wipe context */ 167 + memset(sctx, 0, sizeof(*sctx)); 168 + 169 + return 0; 170 + } 171 + 172 + static int sha256_ssse3_export(struct shash_desc *desc, void *out) 173 + { 174 + struct sha256_state *sctx = shash_desc_ctx(desc); 175 + 176 + memcpy(out, sctx, sizeof(*sctx)); 177 + 178 + return 0; 179 + } 180 + 181 + static int sha256_ssse3_import(struct shash_desc *desc, const void *in) 182 + { 183 + struct sha256_state *sctx = shash_desc_ctx(desc); 184 + 185 + memcpy(sctx, in, sizeof(*sctx)); 186 + 187 + return 0; 188 + } 189 + 190 + static struct shash_alg alg = { 191 + .digestsize = SHA256_DIGEST_SIZE, 192 + .init = sha256_ssse3_init, 193 + .update = sha256_ssse3_update, 194 + .final = sha256_ssse3_final, 195 + .export = sha256_ssse3_export, 196 + .import = sha256_ssse3_import, 197 + .descsize = sizeof(struct sha256_state), 198 + .statesize = sizeof(struct sha256_state), 199 + .base = { 200 + .cra_name = "sha256", 201 + .cra_driver_name = "sha256-ssse3", 202 + .cra_priority = 150, 203 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 204 + .cra_blocksize = SHA256_BLOCK_SIZE, 205 + .cra_module = THIS_MODULE, 206 + } 207 + }; 208 + 209 + #ifdef CONFIG_AS_AVX 210 + static bool __init avx_usable(void) 211 + { 212 + u64 xcr0; 213 + 214 + if (!cpu_has_avx || !cpu_has_osxsave) 215 + return false; 216 + 217 + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 218 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { 219 + pr_info("AVX detected but unusable.\n"); 220 + 221 + return false; 222 + } 223 + 224 + return true; 225 + } 226 + #endif 227 + 228 + static int __init sha256_ssse3_mod_init(void) 229 + { 230 + /* test for SSE3 first */ 231 + if (cpu_has_ssse3) 232 + sha256_transform_asm = sha256_transform_ssse3; 233 + 234 + #ifdef CONFIG_AS_AVX 235 + /* allow AVX to override SSSE3, it's a little faster */ 236 + if (avx_usable()) { 237 + #ifdef CONFIG_AS_AVX2 238 + if 
(boot_cpu_has(X86_FEATURE_AVX2)) 239 + sha256_transform_asm = sha256_transform_rorx; 240 + else 241 + #endif 242 + sha256_transform_asm = sha256_transform_avx; 243 + } 244 + #endif 245 + 246 + if (sha256_transform_asm) { 247 + #ifdef CONFIG_AS_AVX 248 + if (sha256_transform_asm == sha256_transform_avx) 249 + pr_info("Using AVX optimized SHA-256 implementation\n"); 250 + #ifdef CONFIG_AS_AVX2 251 + else if (sha256_transform_asm == sha256_transform_rorx) 252 + pr_info("Using AVX2 optimized SHA-256 implementation\n"); 253 + #endif 254 + else 255 + #endif 256 + pr_info("Using SSSE3 optimized SHA-256 implementation\n"); 257 + return crypto_register_shash(&alg); 258 + } 259 + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); 260 + 261 + return -ENODEV; 262 + } 263 + 264 + static void __exit sha256_ssse3_mod_fini(void) 265 + { 266 + crypto_unregister_shash(&alg); 267 + } 268 + 269 + module_init(sha256_ssse3_mod_init); 270 + module_exit(sha256_ssse3_mod_fini); 271 + 272 + MODULE_LICENSE("GPL"); 273 + MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); 274 + 275 + MODULE_ALIAS("sha256");
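[Editor's note] Once this module is loaded, the accelerated transform is reachable through the ordinary synchronous hash API under cra_name "sha256" (its cra_priority of 150 lets it win over sha256-generic). A hedged usage sketch of that path; sha256_demo is illustrative and not part of this patch:

    #include <crypto/hash.h>
    #include <crypto/sha.h>
    #include <linux/err.h>
    #include <linux/slab.h>

    static int sha256_demo(const u8 *data, unsigned int len,
                           u8 out[SHA256_DIGEST_SIZE])
    {
            struct crypto_shash *tfm;
            struct shash_desc *desc;
            int err;

            tfm = crypto_alloc_shash("sha256", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            /* descsize covers the algorithm's per-request state. */
            desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
                           GFP_KERNEL);
            if (!desc) {
                    crypto_free_shash(tfm);
                    return -ENOMEM;
            }
            desc->tfm = tfm;

            err = crypto_shash_digest(desc, data, len, out);

            kfree(desc);
            crypto_free_shash(tfm);
            return err;
    }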
+423
arch/x86/crypto/sha512-avx-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-512 with AVX instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # David Cote <david.m.cote@intel.com> 10 + # Tim Chen <tim.c.chen@linux.intel.com> 11 + # 12 + # This software is available to you under a choice of one of two 13 + # licenses. You may choose to be licensed under the terms of the GNU 14 + # General Public License (GPL) Version 2, available from the file 15 + # COPYING in the main directory of this source tree, or the 16 + # OpenIB.org BSD license below: 17 + # 18 + # Redistribution and use in source and binary forms, with or 19 + # without modification, are permitted provided that the following 20 + # conditions are met: 21 + # 22 + # - Redistributions of source code must retain the above 23 + # copyright notice, this list of conditions and the following 24 + # disclaimer. 25 + # 26 + # - Redistributions in binary form must reproduce the above 27 + # copyright notice, this list of conditions and the following 28 + # disclaimer in the documentation and/or other materials 29 + # provided with the distribution. 30 + # 31 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 32 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 33 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 35 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 36 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 37 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38 + # SOFTWARE. 39 + # 40 + ######################################################################## 41 + # 42 + # This code is described in an Intel White-Paper: 43 + # "Fast SHA-512 Implementations on Intel Architecture Processors" 44 + # 45 + # To find it, surf to http://www.intel.com/p/en_US/embedded 46 + # and search for that title. 
47 + # 48 + ######################################################################## 49 + 50 + #ifdef CONFIG_AS_AVX 51 + #include <linux/linkage.h> 52 + 53 + .text 54 + 55 + # Virtual Registers 56 + # ARG1 57 + msg = %rdi 58 + # ARG2 59 + digest = %rsi 60 + # ARG3 61 + msglen = %rdx 62 + T1 = %rcx 63 + T2 = %r8 64 + a_64 = %r9 65 + b_64 = %r10 66 + c_64 = %r11 67 + d_64 = %r12 68 + e_64 = %r13 69 + f_64 = %r14 70 + g_64 = %r15 71 + h_64 = %rbx 72 + tmp0 = %rax 73 + 74 + # Local variables (stack frame) 75 + 76 + # Message Schedule 77 + W_SIZE = 80*8 78 + # W[t] + K[t] | W[t+1] + K[t+1] 79 + WK_SIZE = 2*8 80 + RSPSAVE_SIZE = 1*8 81 + GPRSAVE_SIZE = 5*8 82 + 83 + frame_W = 0 84 + frame_WK = frame_W + W_SIZE 85 + frame_RSPSAVE = frame_WK + WK_SIZE 86 + frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE 87 + frame_size = frame_GPRSAVE + GPRSAVE_SIZE 88 + 89 + # Useful QWORD "arrays" for simpler memory references 90 + # MSG, DIGEST, K_t, W_t are arrays 91 + # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even 92 + 93 + # Input message (arg1) 94 + #define MSG(i) 8*i(msg) 95 + 96 + # Output Digest (arg2) 97 + #define DIGEST(i) 8*i(digest) 98 + 99 + # SHA Constants (static mem) 100 + #define K_t(i) 8*i+K512(%rip) 101 + 102 + # Message Schedule (stack frame) 103 + #define W_t(i) 8*i+frame_W(%rsp) 104 + 105 + # W[t]+K[t] (stack frame) 106 + #define WK_2(i) 8*((i%2))+frame_WK(%rsp) 107 + 108 + .macro RotateState 109 + # Rotate symbols a..h right 110 + TMP = h_64 111 + h_64 = g_64 112 + g_64 = f_64 113 + f_64 = e_64 114 + e_64 = d_64 115 + d_64 = c_64 116 + c_64 = b_64 117 + b_64 = a_64 118 + a_64 = TMP 119 + .endm 120 + 121 + .macro RORQ p1 p2 122 + # shld is faster than ror on Sandybridge 123 + shld $(64-\p2), \p1, \p1 124 + .endm 125 + 126 + .macro SHA512_Round rnd 127 + # Compute Round %%t 128 + mov f_64, T1 # T1 = f 129 + mov e_64, tmp0 # tmp = e 130 + xor g_64, T1 # T1 = f ^ g 131 + RORQ tmp0, 23 # 41 # tmp = e ror 23 132 + and e_64, T1 # T1 = (f ^ g) & e 133 + xor e_64, tmp0 # tmp = (e ror 23) ^ e 134 + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) 135 + idx = \rnd 136 + add WK_2(idx), T1 # W[t] + K[t] from message scheduler 137 + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 138 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e 139 + mov a_64, T2 # T2 = a 140 + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h 141 + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) 142 + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) 143 + mov a_64, tmp0 # tmp = a 144 + xor c_64, T2 # T2 = a ^ c 145 + and c_64, tmp0 # tmp = a & c 146 + and b_64, T2 # T2 = (a ^ c) & b 147 + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) 148 + mov a_64, tmp0 # tmp = a 149 + RORQ tmp0, 5 # 39 # tmp = a ror 5 150 + xor a_64, tmp0 # tmp = (a ror 5) ^ a 151 + add T1, d_64 # e(next_state) = d + T1 152 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 153 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a 154 + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) 155 + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) 156 + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) + S0(a) 157 + RotateState 158 + .endm 159 + 160 + .macro SHA512_2Sched_2Round_avx rnd 161 + # Compute rounds t-2 and t-1 162 + # Compute message schedule QWORDS t and t+1 163 + 164 + # Two rounds are computed based on the values for K[t-2]+W[t-2] and 165 + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message 166 + # scheduler. 
167 + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. 168 + # They are then added to their respective SHA512 constants at 169 + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] 170 + # For brevity, the comments following vectored instructions only refer to 171 + # the first of a pair of QWORDS. 172 + # E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} 173 + # The computation of the message schedule and the rounds are tightly 174 + # stitched to take advantage of instruction-level parallelism. 175 + 176 + idx = \rnd - 2 177 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] 178 + idx = \rnd - 15 179 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] 180 + mov f_64, T1 181 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 182 + mov e_64, tmp0 183 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 184 + xor g_64, T1 185 + RORQ tmp0, 23 # 41 186 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 187 + and e_64, T1 188 + xor e_64, tmp0 189 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 190 + xor g_64, T1 191 + idx = \rnd 192 + add WK_2(idx), T1# 193 + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 194 + RORQ tmp0, 4 # 18 195 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 196 + xor e_64, tmp0 197 + mov a_64, T2 198 + add h_64, T1 199 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 200 + RORQ tmp0, 14 # 14 201 + add tmp0, T1 202 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 203 + mov a_64, tmp0 204 + xor c_64, T2 205 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 206 + and c_64, tmp0 207 + and b_64, T2 208 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 209 + xor tmp0, T2 210 + mov a_64, tmp0 211 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 212 + RORQ tmp0, 5 # 39 213 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 214 + xor a_64, tmp0 215 + add T1, d_64 216 + RORQ tmp0, 6 # 34 217 + xor a_64, tmp0 218 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ 219 + # W[t-15]>>7 ^ W[t-15]<<63 220 + lea (T1, T2), h_64 221 + RORQ tmp0, 28 # 28 222 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 223 + add tmp0, h_64 224 + RotateState 225 + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ 226 + # W[t-2]<<25 227 + mov f_64, T1 228 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) 229 + mov e_64, tmp0 230 + xor g_64, T1 231 + idx = \rnd - 16 232 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] 233 + idx = \rnd - 7 234 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] 235 + RORQ tmp0, 23 # 41 236 + and e_64, T1 237 + xor e_64, tmp0 238 + xor g_64, T1 239 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 240 + idx = \rnd + 1 241 + add WK_2(idx), T1 242 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) 243 + RORQ tmp0, 4 # 18 244 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) 245 + xor e_64, tmp0 246 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + 247 + # s0(W[t-15]) + W[t-16] 248 + mov a_64, T2 249 + add h_64, T1 250 + RORQ tmp0, 14 # 14 251 + add tmp0, T1 252 + idx = \rnd 253 + vmovdqa %xmm0, W_t(idx) # Store W[t] 254 + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] 255 + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds 256 + mov a_64, tmp0 257 + xor c_64, T2 258 + and c_64, tmp0 259 + and b_64, T2 260 + xor tmp0, T2 261 + mov a_64, tmp0 262 + RORQ tmp0, 5 # 39 263 + xor a_64, tmp0 264 + add T1, d_64 265 + RORQ tmp0, 6 # 34 266 + xor a_64, tmp0 267 + lea (T1, T2), h_64 268 + RORQ tmp0, 28 # 28 269 + add tmp0, h_64 270 + RotateState 271 + .endm 272 + 273 + 
######################################################################## 274 + # void sha512_transform_avx(const void* M, void* D, u64 L) 275 + # Purpose: Updates the SHA512 digest stored at D with the message stored in M. 276 + # The size of the message pointed to by M must be an integer multiple of SHA512 277 + # message blocks. 278 + # L is the message length in SHA512 blocks 279 + ######################################################################## 280 + ENTRY(sha512_transform_avx) 281 + cmp $0, msglen 282 + je nowork 283 + 284 + # Allocate Stack Space 285 + mov %rsp, %rax 286 + sub $frame_size, %rsp 287 + and $~(0x20 - 1), %rsp 288 + mov %rax, frame_RSPSAVE(%rsp) 289 + 290 + # Save GPRs 291 + mov %rbx, frame_GPRSAVE(%rsp) 292 + mov %r12, frame_GPRSAVE +8*1(%rsp) 293 + mov %r13, frame_GPRSAVE +8*2(%rsp) 294 + mov %r14, frame_GPRSAVE +8*3(%rsp) 295 + mov %r15, frame_GPRSAVE +8*4(%rsp) 296 + 297 + updateblock: 298 + 299 + # Load state variables 300 + mov DIGEST(0), a_64 301 + mov DIGEST(1), b_64 302 + mov DIGEST(2), c_64 303 + mov DIGEST(3), d_64 304 + mov DIGEST(4), e_64 305 + mov DIGEST(5), f_64 306 + mov DIGEST(6), g_64 307 + mov DIGEST(7), h_64 308 + 309 + t = 0 310 + .rept 80/2 + 1 311 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) 312 + # +1 iteration because the scheduler leads hashing by 1 iteration 313 + .if t < 2 314 + # BSWAP 2 QWORDS 315 + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 316 + vmovdqu MSG(t), %xmm0 317 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP 318 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair 319 + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] 320 + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds 321 + .elseif t < 16 322 + # BSWAP 2 QWORDS# Compute 2 Rounds 323 + vmovdqu MSG(t), %xmm0 324 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP 325 + SHA512_Round t-2 # Round t-2 326 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair 327 + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] 328 + SHA512_Round t-1 # Round t-1 329 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK 330 + .elseif t < 79 331 + # Schedule 2 QWORDS# Compute 2 Rounds 332 + SHA512_2Sched_2Round_avx t 333 + .else 334 + # Compute 2 Rounds 335 + SHA512_Round t-2 336 + SHA512_Round t-1 337 + .endif 338 + t = t+2 339 + .endr 340 + 341 + # Update digest 342 + add a_64, DIGEST(0) 343 + add b_64, DIGEST(1) 344 + add c_64, DIGEST(2) 345 + add d_64, DIGEST(3) 346 + add e_64, DIGEST(4) 347 + add f_64, DIGEST(5) 348 + add g_64, DIGEST(6) 349 + add h_64, DIGEST(7) 350 + 351 + # Advance to next message block 352 + add $16*8, msg 353 + dec msglen 354 + jnz updateblock 355 + 356 + # Restore GPRs 357 + mov frame_GPRSAVE(%rsp), %rbx 358 + mov frame_GPRSAVE +8*1(%rsp), %r12 359 + mov frame_GPRSAVE +8*2(%rsp), %r13 360 + mov frame_GPRSAVE +8*3(%rsp), %r14 361 + mov frame_GPRSAVE +8*4(%rsp), %r15 362 + 363 + # Restore Stack Pointer 364 + mov frame_RSPSAVE(%rsp), %rsp 365 + 366 + nowork: 367 + ret 368 + ENDPROC(sha512_transform_avx) 369 + 370 + ######################################################################## 371 + ### Binary Data 372 + 373 + .data 374 + 375 + .align 16 376 + 377 + # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
378 + XMM_QWORD_BSWAP: 379 + .octa 0x08090a0b0c0d0e0f0001020304050607 380 + 381 + # K[t] used in SHA512 hashing 382 + K512: 383 + .quad 0x428a2f98d728ae22,0x7137449123ef65cd 384 + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 385 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 386 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 387 + .quad 0xd807aa98a3030242,0x12835b0145706fbe 388 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 389 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 390 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 391 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 392 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 393 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 394 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 395 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 396 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 397 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 398 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 399 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 400 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 401 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 402 + .quad 0x81c2c92e47edaee6,0x92722c851482353b 403 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 404 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 405 + .quad 0xd192e819d6ef5218,0xd69906245565a910 406 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 407 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 408 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 409 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 410 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 411 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 412 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 413 + .quad 0x90befffa23631e28,0xa4506cebde82bde9 414 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 415 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 416 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 417 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 418 + .quad 0x113f9804bef90dae,0x1b710b35131c471b 419 + .quad 0x28db77f523047d84,0x32caab7b40c72493 420 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 421 + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 422 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 423 + #endif
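[Editor's note] SHA512_Round above computes the same round function as SHA-256 but over 64-bit words, with rotate counts 14/18/41 for S1 and 28/34/39 for S0 (the RORQ macro builds each rotate from shld, which the comments note is faster than ror on Sandy Bridge). A scalar C sketch of one round for reference (illustrative, not kernel code):

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned int n)
    {
            return (x >> n) | (x << (64 - n));
    }

    /* One round; s[0..7] are a_64..h_64, k_plus_w is WK_2(t). */
    static void sha512_round(uint64_t s[8], uint64_t k_plus_w)
    {
            uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
            uint64_t S1 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
            uint64_t ch = ((f ^ g) & e) ^ g;
            uint64_t t1 = h + S1 + ch + k_plus_w;
            uint64_t S0 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
            uint64_t maj = ((a ^ c) & b) ^ (a & c); /* as T2 in the asm */

            /* RotateState: new e = d + t1, new a = t1 + maj + S0. */
            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
    }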
+743
arch/x86/crypto/sha512-avx2-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-512 with AVX2 instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # David Cote <david.m.cote@intel.com> 10 + # Tim Chen <tim.c.chen@linux.intel.com> 11 + # 12 + # This software is available to you under a choice of one of two 13 + # licenses. You may choose to be licensed under the terms of the GNU 14 + # General Public License (GPL) Version 2, available from the file 15 + # COPYING in the main directory of this source tree, or the 16 + # OpenIB.org BSD license below: 17 + # 18 + # Redistribution and use in source and binary forms, with or 19 + # without modification, are permitted provided that the following 20 + # conditions are met: 21 + # 22 + # - Redistributions of source code must retain the above 23 + # copyright notice, this list of conditions and the following 24 + # disclaimer. 25 + # 26 + # - Redistributions in binary form must reproduce the above 27 + # copyright notice, this list of conditions and the following 28 + # disclaimer in the documentation and/or other materials 29 + # provided with the distribution. 30 + # 31 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 32 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 33 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 35 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 36 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 37 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38 + # SOFTWARE. 39 + # 40 + ######################################################################## 41 + # 42 + # This code is described in an Intel White-Paper: 43 + # "Fast SHA-512 Implementations on Intel Architecture Processors" 44 + # 45 + # To find it, surf to http://www.intel.com/p/en_US/embedded 46 + # and search for that title. 
47 + # 48 + ######################################################################## 49 + # This code schedules 1 blocks at a time, with 4 lanes per block 50 + ######################################################################## 51 + 52 + #ifdef CONFIG_AS_AVX2 53 + #include <linux/linkage.h> 54 + 55 + .text 56 + 57 + # Virtual Registers 58 + Y_0 = %ymm4 59 + Y_1 = %ymm5 60 + Y_2 = %ymm6 61 + Y_3 = %ymm7 62 + 63 + YTMP0 = %ymm0 64 + YTMP1 = %ymm1 65 + YTMP2 = %ymm2 66 + YTMP3 = %ymm3 67 + YTMP4 = %ymm8 68 + XFER = YTMP0 69 + 70 + BYTE_FLIP_MASK = %ymm9 71 + 72 + # 1st arg 73 + INP = %rdi 74 + # 2nd arg 75 + CTX = %rsi 76 + # 3rd arg 77 + NUM_BLKS = %rdx 78 + 79 + c = %rcx 80 + d = %r8 81 + e = %rdx 82 + y3 = %rdi 83 + 84 + TBL = %rbp 85 + 86 + a = %rax 87 + b = %rbx 88 + 89 + f = %r9 90 + g = %r10 91 + h = %r11 92 + old_h = %r11 93 + 94 + T1 = %r12 95 + y0 = %r13 96 + y1 = %r14 97 + y2 = %r15 98 + 99 + y4 = %r12 100 + 101 + # Local variables (stack frame) 102 + XFER_SIZE = 4*8 103 + SRND_SIZE = 1*8 104 + INP_SIZE = 1*8 105 + INPEND_SIZE = 1*8 106 + RSPSAVE_SIZE = 1*8 107 + GPRSAVE_SIZE = 6*8 108 + 109 + frame_XFER = 0 110 + frame_SRND = frame_XFER + XFER_SIZE 111 + frame_INP = frame_SRND + SRND_SIZE 112 + frame_INPEND = frame_INP + INP_SIZE 113 + frame_RSPSAVE = frame_INPEND + INPEND_SIZE 114 + frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE 115 + frame_size = frame_GPRSAVE + GPRSAVE_SIZE 116 + 117 + ## assume buffers not aligned 118 + #define VMOVDQ vmovdqu 119 + 120 + # addm [mem], reg 121 + # Add reg to mem using reg-mem add and store 122 + .macro addm p1 p2 123 + add \p1, \p2 124 + mov \p2, \p1 125 + .endm 126 + 127 + 128 + # COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask 129 + # Load ymm with mem and byte swap each dword 130 + .macro COPY_YMM_AND_BSWAP p1 p2 p3 131 + VMOVDQ \p2, \p1 132 + vpshufb \p3, \p1, \p1 133 + .endm 134 + # rotate_Ys 135 + # Rotate values of symbols Y0...Y3 136 + .macro rotate_Ys 137 + Y_ = Y_0 138 + Y_0 = Y_1 139 + Y_1 = Y_2 140 + Y_2 = Y_3 141 + Y_3 = Y_ 142 + .endm 143 + 144 + # RotateState 145 + .macro RotateState 146 + # Rotate symbols a..h right 147 + old_h = h 148 + TMP_ = h 149 + h = g 150 + g = f 151 + f = e 152 + e = d 153 + d = c 154 + c = b 155 + b = a 156 + a = TMP_ 157 + .endm 158 + 159 + # macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL 160 + # YDST = {YSRC1, YSRC2} >> RVAL*8 161 + .macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL 162 + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} 163 + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 164 + .endm 165 + 166 + .macro FOUR_ROUNDS_AND_SCHED 167 + ################################### RND N + 0 ######################################### 168 + 169 + # Extract w[t-7] 170 + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] 171 + # Calculate w[t-16] + w[t-7] 172 + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] 173 + # Extract w[t-15] 174 + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] 175 + 176 + # Calculate sigma0 177 + 178 + # Calculate w[t-15] ror 1 179 + vpsrlq $1, YTMP1, YTMP2 180 + vpsllq $(64-1), YTMP1, YTMP3 181 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 182 + # Calculate w[t-15] shr 7 183 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 184 + 185 + mov a, y3 # y3 = a # MAJA 186 + rorx $41, e, y0 # y0 = e >> 41 # S1A 187 + rorx $18, e, y1 # y1 = e >> 18 # S1B 188 + add frame_XFER(%rsp),h # h = k + w + h # -- 189 + or c, y3 # y3 = a|c # MAJA 190 + mov f, y2 # y2 = f # CH 191 + rorx $34, a, T1 # T1 = a >> 34 # S0B 192 + 193 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 194 
+ xor g, y2 # y2 = f^g # CH 195 + rorx $14, e, y1 # y1 = (e >> 14) # S1 196 + 197 + and e, y2 # y2 = (f^g)&e # CH 198 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 199 + rorx $39, a, y1 # y1 = a >> 39 # S0A 200 + add h, d # d = k + w + h + d # -- 201 + 202 + and b, y3 # y3 = (a|c)&b # MAJA 203 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 204 + rorx $28, a, T1 # T1 = (a >> 28) # S0 205 + 206 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 207 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 208 + mov a, T1 # T1 = a # MAJB 209 + and c, T1 # T1 = a&c # MAJB 210 + 211 + add y0, y2 # y2 = S1 + CH # -- 212 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 213 + add y1, h # h = k + w + h + S0 # -- 214 + 215 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 216 + 217 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 218 + add y3, h # h = t1 + S0 + MAJ # -- 219 + 220 + RotateState 221 + 222 + ################################### RND N + 1 ######################################### 223 + 224 + # Calculate w[t-15] ror 8 225 + vpsrlq $8, YTMP1, YTMP2 226 + vpsllq $(64-8), YTMP1, YTMP1 227 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 228 + # XOR the three components 229 + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 230 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 231 + 232 + 233 + # Add three components, w[t-16], w[t-7] and sigma0 234 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 235 + # Move to appropriate lanes for calculating w[16] and w[17] 236 + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} 237 + # Move to appropriate lanes for calculating w[18] and w[19] 238 + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} 239 + 240 + # Calculate w[16] and w[17] in both 128 bit lanes 241 + 242 + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes 243 + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} 244 + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} 245 + 246 + 247 + mov a, y3 # y3 = a # MAJA 248 + rorx $41, e, y0 # y0 = e >> 41 # S1A 249 + rorx $18, e, y1 # y1 = e >> 18 # S1B 250 + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- 251 + or c, y3 # y3 = a|c # MAJA 252 + 253 + 254 + mov f, y2 # y2 = f # CH 255 + rorx $34, a, T1 # T1 = a >> 34 # S0B 256 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 257 + xor g, y2 # y2 = f^g # CH 258 + 259 + 260 + rorx $14, e, y1 # y1 = (e >> 14) # S1 261 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 262 + rorx $39, a, y1 # y1 = a >> 39 # S0A 263 + and e, y2 # y2 = (f^g)&e # CH 264 + add h, d # d = k + w + h + d # -- 265 + 266 + and b, y3 # y3 = (a|c)&b # MAJA 267 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 268 + 269 + rorx $28, a, T1 # T1 = (a >> 28) # S0 270 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 271 + 272 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 273 + mov a, T1 # T1 = a # MAJB 274 + and c, T1 # T1 = a&c # MAJB 275 + add y0, y2 # y2 = S1 + CH # -- 276 + 277 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 278 + add y1, h # h = k + w + h + S0 # -- 279 + 280 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 281 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 282 + add y3, h # h = t1 + S0 + MAJ # -- 283 + 284 + RotateState 285 + 286 + 287 + ################################### RND N + 2 ######################################### 288 + 289 + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} 290 + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} 291 + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} 292 + vpxor YTMP3, 
YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} 293 + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} 294 + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} 295 + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} 296 + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ 297 + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} 298 + 299 + # Add sigma1 to the other components to get w[16] and w[17] 300 + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} 301 + 302 + # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane 303 + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} 304 + 305 + mov a, y3 # y3 = a # MAJA 306 + rorx $41, e, y0 # y0 = e >> 41 # S1A 307 + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- 308 + 309 + rorx $18, e, y1 # y1 = e >> 18 # S1B 310 + or c, y3 # y3 = a|c # MAJA 311 + mov f, y2 # y2 = f # CH 312 + xor g, y2 # y2 = f^g # CH 313 + 314 + rorx $34, a, T1 # T1 = a >> 34 # S0B 315 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 316 + and e, y2 # y2 = (f^g)&e # CH 317 + 318 + rorx $14, e, y1 # y1 = (e >> 14) # S1 319 + add h, d # d = k + w + h + d # -- 320 + and b, y3 # y3 = (a|c)&b # MAJA 321 + 322 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 323 + rorx $39, a, y1 # y1 = a >> 39 # S0A 324 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 325 + 326 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 327 + rorx $28, a, T1 # T1 = (a >> 28) # S0 328 + 329 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 330 + mov a, T1 # T1 = a # MAJB 331 + and c, T1 # T1 = a&c # MAJB 332 + add y0, y2 # y2 = S1 + CH # -- 333 + 334 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 335 + add y1, h # h = k + w + h + S0 # -- 336 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 337 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 338 + 339 + add y3, h # h = t1 + S0 + MAJ # -- 340 + 341 + RotateState 342 + 343 + ################################### RND N + 3 ######################################### 344 + 345 + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} 346 + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} 347 + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} 348 + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} 349 + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} 350 + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} 351 + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} 352 + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ 353 + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} 354 + 355 + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] 356 + # to newly calculated sigma1 to get w[18] and w[19] 357 + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} 358 + 359 + # Form w[19], w[18], w[17], w[16] 360 + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} 361 + 362 + mov a, y3 # y3 = a # MAJA 363 + rorx $41, e, y0 # y0 = e >> 41 # S1A 364 + rorx $18, e, y1 # y1 = e >> 18 # S1B 365 + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- 366 + or c, y3 # y3 = a|c # MAJA 367 + 368 + 369 + mov f, y2 # y2 = f # CH 370 + rorx $34, a, T1 # T1 = a >> 34 # S0B 371 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 372 + xor g, y2 # y2 = f^g # CH 373 + 374 + 375 + rorx $14, e, y1 # y1 = (e >> 14) # S1 376 + and e, y2 # y2 = (f^g)&e # CH 377 + add h, d # d = k + w + h + d # -- 378 + and b, y3 # y3 = (a|c)&b # MAJA 379 + 380 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 381 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 382 + 383 + rorx $39, a, y1 # y1 = a >> 39 # S0A 384 + add y0, y2 # y2 = S1 + CH # 
-- 385 + 386 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 387 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 388 + 389 + rorx $28, a, T1 # T1 = (a >> 28) # S0 390 + 391 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 392 + mov a, T1 # T1 = a # MAJB 393 + and c, T1 # T1 = a&c # MAJB 394 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 395 + 396 + add y1, h # h = k + w + h + S0 # -- 397 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 398 + add y3, h # h = t1 + S0 + MAJ # -- 399 + 400 + RotateState 401 + 402 + rotate_Ys 403 + .endm 404 + 405 + .macro DO_4ROUNDS 406 + 407 + ################################### RND N + 0 ######################################### 408 + 409 + mov f, y2 # y2 = f # CH 410 + rorx $41, e, y0 # y0 = e >> 41 # S1A 411 + rorx $18, e, y1 # y1 = e >> 18 # S1B 412 + xor g, y2 # y2 = f^g # CH 413 + 414 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 415 + rorx $14, e, y1 # y1 = (e >> 14) # S1 416 + and e, y2 # y2 = (f^g)&e # CH 417 + 418 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 419 + rorx $34, a, T1 # T1 = a >> 34 # S0B 420 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 421 + rorx $39, a, y1 # y1 = a >> 39 # S0A 422 + mov a, y3 # y3 = a # MAJA 423 + 424 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 425 + rorx $28, a, T1 # T1 = (a >> 28) # S0 426 + add frame_XFER(%rsp), h # h = k + w + h # -- 427 + or c, y3 # y3 = a|c # MAJA 428 + 429 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 430 + mov a, T1 # T1 = a # MAJB 431 + and b, y3 # y3 = (a|c)&b # MAJA 432 + and c, T1 # T1 = a&c # MAJB 433 + add y0, y2 # y2 = S1 + CH # -- 434 + 435 + add h, d # d = k + w + h + d # -- 436 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 437 + add y1, h # h = k + w + h + S0 # -- 438 + 439 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 440 + 441 + RotateState 442 + 443 + ################################### RND N + 1 ######################################### 444 + 445 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 446 + mov f, y2 # y2 = f # CH 447 + rorx $41, e, y0 # y0 = e >> 41 # S1A 448 + rorx $18, e, y1 # y1 = e >> 18 # S1B 449 + xor g, y2 # y2 = f^g # CH 450 + 451 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 452 + rorx $14, e, y1 # y1 = (e >> 14) # S1 453 + and e, y2 # y2 = (f^g)&e # CH 454 + add y3, old_h # h = t1 + S0 + MAJ # -- 455 + 456 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 457 + rorx $34, a, T1 # T1 = a >> 34 # S0B 458 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 459 + rorx $39, a, y1 # y1 = a >> 39 # S0A 460 + mov a, y3 # y3 = a # MAJA 461 + 462 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 463 + rorx $28, a, T1 # T1 = (a >> 28) # S0 464 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- 465 + or c, y3 # y3 = a|c # MAJA 466 + 467 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 468 + mov a, T1 # T1 = a # MAJB 469 + and b, y3 # y3 = (a|c)&b # MAJA 470 + and c, T1 # T1 = a&c # MAJB 471 + add y0, y2 # y2 = S1 + CH # -- 472 + 473 + add h, d # d = k + w + h + d # -- 474 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 475 + add y1, h # h = k + w + h + S0 # -- 476 + 477 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 478 + 479 + RotateState 480 + 481 + ################################### RND N + 2 ######################################### 482 + 483 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 484 + mov f, y2 # y2 = f # CH 485 + rorx $41, e, y0 # y0 = e >> 41 # S1A 486 + rorx $18, e, y1 # y1 = e >> 18 # S1B 487 + xor g, y2 # y2 = f^g # CH 488 + 489 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 490 + rorx $14, e, y1 # y1 = (e 
>> 14) # S1 491 + and e, y2 # y2 = (f^g)&e # CH 492 + add y3, old_h # h = t1 + S0 + MAJ # -- 493 + 494 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 495 + rorx $34, a, T1 # T1 = a >> 34 # S0B 496 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 497 + rorx $39, a, y1 # y1 = a >> 39 # S0A 498 + mov a, y3 # y3 = a # MAJA 499 + 500 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 501 + rorx $28, a, T1 # T1 = (a >> 28) # S0 502 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- 503 + or c, y3 # y3 = a|c # MAJA 504 + 505 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 506 + mov a, T1 # T1 = a # MAJB 507 + and b, y3 # y3 = (a|c)&b # MAJA 508 + and c, T1 # T1 = a&c # MAJB 509 + add y0, y2 # y2 = S1 + CH # -- 510 + 511 + add h, d # d = k + w + h + d # -- 512 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 513 + add y1, h # h = k + w + h + S0 # -- 514 + 515 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 516 + 517 + RotateState 518 + 519 + ################################### RND N + 3 ######################################### 520 + 521 + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 522 + mov f, y2 # y2 = f # CH 523 + rorx $41, e, y0 # y0 = e >> 41 # S1A 524 + rorx $18, e, y1 # y1 = e >> 18 # S1B 525 + xor g, y2 # y2 = f^g # CH 526 + 527 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 528 + rorx $14, e, y1 # y1 = (e >> 14) # S1 529 + and e, y2 # y2 = (f^g)&e # CH 530 + add y3, old_h # h = t1 + S0 + MAJ # -- 531 + 532 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 533 + rorx $34, a, T1 # T1 = a >> 34 # S0B 534 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH 535 + rorx $39, a, y1 # y1 = a >> 39 # S0A 536 + mov a, y3 # y3 = a # MAJA 537 + 538 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 539 + rorx $28, a, T1 # T1 = (a >> 28) # S0 540 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- 541 + or c, y3 # y3 = a|c # MAJA 542 + 543 + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 544 + mov a, T1 # T1 = a # MAJB 545 + and b, y3 # y3 = (a|c)&b # MAJA 546 + and c, T1 # T1 = a&c # MAJB 547 + add y0, y2 # y2 = S1 + CH # -- 548 + 549 + 550 + add h, d # d = k + w + h + d # -- 551 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ 552 + add y1, h # h = k + w + h + S0 # -- 553 + 554 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- 555 + 556 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- 557 + 558 + add y3, h # h = t1 + S0 + MAJ # -- 559 + 560 + RotateState 561 + 562 + .endm 563 + 564 + ######################################################################## 565 + # void sha512_transform_rorx(const void* M, void* D, uint64_t L)# 566 + # Purpose: Updates the SHA512 digest stored at D with the message stored in M. 567 + # The size of the message pointed to by M must be an integer multiple of SHA512 568 + # message blocks. 
569 + # L is the message length in SHA512 blocks 570 + ######################################################################## 571 + ENTRY(sha512_transform_rorx) 572 + # Allocate Stack Space 573 + mov %rsp, %rax 574 + sub $frame_size, %rsp 575 + and $~(0x20 - 1), %rsp 576 + mov %rax, frame_RSPSAVE(%rsp) 577 + 578 + # Save GPRs 579 + mov %rbp, frame_GPRSAVE(%rsp) 580 + mov %rbx, 8*1+frame_GPRSAVE(%rsp) 581 + mov %r12, 8*2+frame_GPRSAVE(%rsp) 582 + mov %r13, 8*3+frame_GPRSAVE(%rsp) 583 + mov %r14, 8*4+frame_GPRSAVE(%rsp) 584 + mov %r15, 8*5+frame_GPRSAVE(%rsp) 585 + 586 + shl $7, NUM_BLKS # convert to bytes 587 + jz done_hash 588 + add INP, NUM_BLKS # pointer to end of data 589 + mov NUM_BLKS, frame_INPEND(%rsp) 590 + 591 + ## load initial digest 592 + mov 8*0(CTX),a 593 + mov 8*1(CTX),b 594 + mov 8*2(CTX),c 595 + mov 8*3(CTX),d 596 + mov 8*4(CTX),e 597 + mov 8*5(CTX),f 598 + mov 8*6(CTX),g 599 + mov 8*7(CTX),h 600 + 601 + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK 602 + 603 + loop0: 604 + lea K512(%rip), TBL 605 + 606 + ## byte swap first 16 dwords 607 + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK 608 + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK 609 + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK 610 + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK 611 + 612 + mov INP, frame_INP(%rsp) 613 + 614 + ## schedule 64 input dwords, by doing 12 rounds of 4 each 615 + movq $4, frame_SRND(%rsp) 616 + 617 + .align 16 618 + loop1: 619 + vpaddq (TBL), Y_0, XFER 620 + vmovdqa XFER, frame_XFER(%rsp) 621 + FOUR_ROUNDS_AND_SCHED 622 + 623 + vpaddq 1*32(TBL), Y_0, XFER 624 + vmovdqa XFER, frame_XFER(%rsp) 625 + FOUR_ROUNDS_AND_SCHED 626 + 627 + vpaddq 2*32(TBL), Y_0, XFER 628 + vmovdqa XFER, frame_XFER(%rsp) 629 + FOUR_ROUNDS_AND_SCHED 630 + 631 + vpaddq 3*32(TBL), Y_0, XFER 632 + vmovdqa XFER, frame_XFER(%rsp) 633 + add $(4*32), TBL 634 + FOUR_ROUNDS_AND_SCHED 635 + 636 + subq $1, frame_SRND(%rsp) 637 + jne loop1 638 + 639 + movq $2, frame_SRND(%rsp) 640 + loop2: 641 + vpaddq (TBL), Y_0, XFER 642 + vmovdqa XFER, frame_XFER(%rsp) 643 + DO_4ROUNDS 644 + vpaddq 1*32(TBL), Y_1, XFER 645 + vmovdqa XFER, frame_XFER(%rsp) 646 + add $(2*32), TBL 647 + DO_4ROUNDS 648 + 649 + vmovdqa Y_2, Y_0 650 + vmovdqa Y_3, Y_1 651 + 652 + subq $1, frame_SRND(%rsp) 653 + jne loop2 654 + 655 + addm 8*0(CTX),a 656 + addm 8*1(CTX),b 657 + addm 8*2(CTX),c 658 + addm 8*3(CTX),d 659 + addm 8*4(CTX),e 660 + addm 8*5(CTX),f 661 + addm 8*6(CTX),g 662 + addm 8*7(CTX),h 663 + 664 + mov frame_INP(%rsp), INP 665 + add $128, INP 666 + cmp frame_INPEND(%rsp), INP 667 + jne loop0 668 + 669 + done_hash: 670 + 671 + # Restore GPRs 672 + mov frame_GPRSAVE(%rsp) ,%rbp 673 + mov 8*1+frame_GPRSAVE(%rsp) ,%rbx 674 + mov 8*2+frame_GPRSAVE(%rsp) ,%r12 675 + mov 8*3+frame_GPRSAVE(%rsp) ,%r13 676 + mov 8*4+frame_GPRSAVE(%rsp) ,%r14 677 + mov 8*5+frame_GPRSAVE(%rsp) ,%r15 678 + 679 + # Restore Stack Pointer 680 + mov frame_RSPSAVE(%rsp), %rsp 681 + ret 682 + ENDPROC(sha512_transform_rorx) 683 + 684 + ######################################################################## 685 + ### Binary Data 686 + 687 + .data 688 + 689 + .align 64 690 + # K[t] used in SHA512 hashing 691 + K512: 692 + .quad 0x428a2f98d728ae22,0x7137449123ef65cd 693 + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 694 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 695 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 696 + .quad 0xd807aa98a3030242,0x12835b0145706fbe 697 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 698 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 699 + 
.quad 0x9bdc06a725c71235,0xc19bf174cf692694 700 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 701 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 702 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 703 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 704 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 705 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 706 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 707 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 708 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 709 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 710 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 711 + .quad 0x81c2c92e47edaee6,0x92722c851482353b 712 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 713 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 714 + .quad 0xd192e819d6ef5218,0xd69906245565a910 715 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 716 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 717 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 718 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 719 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 720 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 721 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 722 + .quad 0x90befffa23631e28,0xa4506cebde82bde9 723 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 724 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 725 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 726 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 727 + .quad 0x113f9804bef90dae,0x1b710b35131c471b 728 + .quad 0x28db77f523047d84,0x32caab7b40c72493 729 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 730 + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 731 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 732 + 733 + .align 32 734 + 735 + # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 736 + PSHUFFLE_BYTE_FLIP_MASK: 737 + .octa 0x08090a0b0c0d0e0f0001020304050607 738 + .octa 0x18191a1b1c1d1e1f1011121314151617 739 + 740 + MASK_YMM_LO: 741 + .octa 0x00000000000000000000000000000000 742 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 743 + #endif
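
The rorx code above interleaves two independent computations: the SHA-512 round function on general-purpose registers (the S0/S1/CH/MAJ columns in the comments) and the message schedule on ymm registers. As a cross-check, here is a minimal scalar C sketch of the single round being computed; the helper names are illustrative, not kernel API:

#include <stdint.h>

/* Minimal scalar sketch of one SHA-512 round, matching the S0/S1/CH/MAJ
 * comments in the rorx code above. */
static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static void sha512_round(uint64_t st[8], uint64_t wk)	/* wk = W[t] + K[t] */
{
	uint64_t a = st[0], b = st[1], c = st[2], d = st[3];
	uint64_t e = st[4], f = st[5], g = st[6], h = st[7];
	uint64_t s1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);	/* S1(e) */
	uint64_t ch  = ((f ^ g) & e) ^ g;				/* CH(e,f,g) */
	uint64_t s0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);	/* S0(a) */
	uint64_t maj = ((a | c) & b) | (a & c);				/* MAJ(a,b,c) */
	uint64_t t1  = h + s1 + ch + wk;

	st[7] = g; st[6] = f; st[5] = e;
	st[4] = d + t1;			/* new e */
	st[3] = c; st[2] = b; st[1] = a;
	st[0] = t1 + s0 + maj;		/* new a */
}

rorx is used in the assembler version because it does not touch the flags, so the rotates can be interleaved freely with the add/and/or chains of the round.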
+421
arch/x86/crypto/sha512-ssse3-asm.S
··· 1 + ######################################################################## 2 + # Implement fast SHA-512 with SSSE3 instructions. (x86_64) 3 + # 4 + # Copyright (C) 2013 Intel Corporation. 5 + # 6 + # Authors: 7 + # James Guilford <james.guilford@intel.com> 8 + # Kirk Yap <kirk.s.yap@intel.com> 9 + # David Cote <david.m.cote@intel.com> 10 + # Tim Chen <tim.c.chen@linux.intel.com> 11 + # 12 + # This software is available to you under a choice of one of two 13 + # licenses. You may choose to be licensed under the terms of the GNU 14 + # General Public License (GPL) Version 2, available from the file 15 + # COPYING in the main directory of this source tree, or the 16 + # OpenIB.org BSD license below: 17 + # 18 + # Redistribution and use in source and binary forms, with or 19 + # without modification, are permitted provided that the following 20 + # conditions are met: 21 + # 22 + # - Redistributions of source code must retain the above 23 + # copyright notice, this list of conditions and the following 24 + # disclaimer. 25 + # 26 + # - Redistributions in binary form must reproduce the above 27 + # copyright notice, this list of conditions and the following 28 + # disclaimer in the documentation and/or other materials 29 + # provided with the distribution. 30 + # 31 + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 32 + # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 33 + # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 34 + # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 35 + # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 36 + # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 37 + # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38 + # SOFTWARE. 39 + # 40 + ######################################################################## 41 + # 42 + # This code is described in an Intel White-Paper: 43 + # "Fast SHA-512 Implementations on Intel Architecture Processors" 44 + # 45 + # To find it, surf to http://www.intel.com/p/en_US/embedded 46 + # and search for that title. 
47 + #
48 + ########################################################################
49 +
50 + #include <linux/linkage.h>
51 +
52 + .text
53 +
54 + # Virtual Registers
55 + # ARG1
56 + msg = %rdi
57 + # ARG2
58 + digest = %rsi
59 + # ARG3
60 + msglen = %rdx
61 + T1 = %rcx
62 + T2 = %r8
63 + a_64 = %r9
64 + b_64 = %r10
65 + c_64 = %r11
66 + d_64 = %r12
67 + e_64 = %r13
68 + f_64 = %r14
69 + g_64 = %r15
70 + h_64 = %rbx
71 + tmp0 = %rax
72 +
73 + # Local variables (stack frame)
74 +
75 + W_SIZE = 80*8
76 + WK_SIZE = 2*8
77 + RSPSAVE_SIZE = 1*8
78 + GPRSAVE_SIZE = 5*8
79 +
80 + frame_W = 0
81 + frame_WK = frame_W + W_SIZE
82 + frame_RSPSAVE = frame_WK + WK_SIZE
83 + frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
84 + frame_size = frame_GPRSAVE + GPRSAVE_SIZE
85 +
86 + # Useful QWORD "arrays" for simpler memory references
87 + # MSG, DIGEST, K_t, W_t are arrays
88 + # WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
89 +
90 + # Input message (arg1)
91 + #define MSG(i) 8*i(msg)
92 +
93 + # Output Digest (arg2)
94 + #define DIGEST(i) 8*i(digest)
95 +
96 + # SHA Constants (static mem)
97 + #define K_t(i) 8*i+K512(%rip)
98 +
99 + # Message Schedule (stack frame)
100 + #define W_t(i) 8*i+frame_W(%rsp)
101 +
102 + # W[t]+K[t] (stack frame)
103 + #define WK_2(i) 8*((i%2))+frame_WK(%rsp)
104 +
105 + .macro RotateState
106 + # Rotate symbols a..h right
107 + TMP = h_64
108 + h_64 = g_64
109 + g_64 = f_64
110 + f_64 = e_64
111 + e_64 = d_64
112 + d_64 = c_64
113 + c_64 = b_64
114 + b_64 = a_64
115 + a_64 = TMP
116 + .endm
117 +
118 + .macro SHA512_Round rnd
119 +
120 + # Compute Round %%t
121 + mov f_64, T1 # T1 = f
122 + mov e_64, tmp0 # tmp = e
123 + xor g_64, T1 # T1 = f ^ g
124 + ror $23, tmp0 # 41 # tmp = e ror 23
125 + and e_64, T1 # T1 = (f ^ g) & e
126 + xor e_64, tmp0 # tmp = (e ror 23) ^ e
127 + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
128 + idx = \rnd
129 + add WK_2(idx), T1 # W[t] + K[t] from message scheduler
130 + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
131 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
132 + mov a_64, T2 # T2 = a
133 + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
134 + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
135 + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
136 + mov a_64, tmp0 # tmp = a
137 + xor c_64, T2 # T2 = a ^ c
138 + and c_64, tmp0 # tmp = a & c
139 + and b_64, T2 # T2 = (a ^ c) & b
140 + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
141 + mov a_64, tmp0 # tmp = a
142 + ror $5, tmp0 # 39 # tmp = a ror 5
143 + xor a_64, tmp0 # tmp = (a ror 5) ^ a
144 + add T1, d_64 # e(next_state) = d + T1
145 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
146 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
147 + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
148 + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
149 + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) + S0(a)
150 + RotateState
151 + .endm
152 +
153 + .macro SHA512_2Sched_2Round_sse rnd
154 +
155 + # Compute rounds t-2 and t-1
156 + # Compute message schedule QWORDS t and t+1
157 +
158 + # Two rounds are computed based on the values for K[t-2]+W[t-2] and
159 + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
160 + # scheduler.
161 + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
162 + # They are then added to their respective SHA512 constants at
163 + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
164 + # For brevity, the comments following vectored instructions only refer to
165 + # the first of a pair of QWORDS.
166 + # E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
167 + # The computation of the message schedule and the rounds are tightly
168 + # stitched to take advantage of instruction-level parallelism.
169 + # For clarity, integer instructions (for the rounds calculation) are indented
170 + # by one tab. Vectored instructions (for the message scheduler) are indented
171 + # by two tabs.
172 +
173 + mov f_64, T1
174 + idx = \rnd -2
175 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
176 + xor g_64, T1
177 + and e_64, T1
178 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
179 + xor g_64, T1
180 + idx = \rnd
181 + add WK_2(idx), T1
182 + idx = \rnd - 15
183 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
184 + mov e_64, tmp0
185 + ror $23, tmp0 # 41
186 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
187 + xor e_64, tmp0
188 + ror $4, tmp0 # 18
189 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
190 + xor e_64, tmp0
191 + ror $14, tmp0 # 14
192 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
193 + add tmp0, T1
194 + add h_64, T1
195 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
196 + mov a_64, T2
197 + xor c_64, T2
198 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
199 + and b_64, T2
200 + mov a_64, tmp0
201 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
202 + and c_64, tmp0
203 + xor tmp0, T2
204 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
205 + mov a_64, tmp0
206 + ror $5, tmp0 # 39
207 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
208 + xor a_64, tmp0
209 + ror $6, tmp0 # 34
210 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
211 + xor a_64, tmp0
212 + ror $28, tmp0 # 28
213 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
214 + add tmp0, T2
215 + add T1, d_64
216 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
217 + lea (T1, T2), h_64
218 + RotateState
219 + movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
220 + mov f_64, T1
221 + xor g_64, T1
222 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
223 + and e_64, T1
224 + xor g_64, T1
225 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
226 + idx = \rnd + 1
227 + add WK_2(idx), T1
228 + mov e_64, tmp0
229 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
230 + ror $23, tmp0 # 41
231 + xor e_64, tmp0
232 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
233 + ror $4, tmp0 # 18
234 + xor e_64, tmp0
235 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
236 + ror $14, tmp0 # 14
237 + add tmp0, T1
238 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
239 + add h_64, T1
240 + mov a_64, T2
241 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
242 + xor c_64, T2
243 + and b_64, T2
244 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
245 + mov a_64, tmp0
246 + and c_64, tmp0
247 + idx = \rnd - 7
248 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
249 + xor tmp0, T2
250 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
251 + mov a_64, tmp0
252 + paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
253 + ror $5, tmp0 # 39
254 + idx =\rnd-16
255 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
256 + xor a_64, tmp0
257 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
258 + ror $6, tmp0 # 34
259 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
260 + xor a_64, tmp0
261 + paddq K_t(\rnd), %xmm0 # Compute
W[t]+K[t] 262 + ror $28, tmp0 # 28 263 + idx = \rnd 264 + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds 265 + add tmp0, T2 266 + add T1, d_64 267 + lea (T1, T2), h_64 268 + RotateState 269 + .endm 270 + 271 + ######################################################################## 272 + # void sha512_transform_ssse3(const void* M, void* D, u64 L)# 273 + # Purpose: Updates the SHA512 digest stored at D with the message stored in M. 274 + # The size of the message pointed to by M must be an integer multiple of SHA512 275 + # message blocks. 276 + # L is the message length in SHA512 blocks. 277 + ######################################################################## 278 + ENTRY(sha512_transform_ssse3) 279 + 280 + cmp $0, msglen 281 + je nowork 282 + 283 + # Allocate Stack Space 284 + mov %rsp, %rax 285 + sub $frame_size, %rsp 286 + and $~(0x20 - 1), %rsp 287 + mov %rax, frame_RSPSAVE(%rsp) 288 + 289 + # Save GPRs 290 + mov %rbx, frame_GPRSAVE(%rsp) 291 + mov %r12, frame_GPRSAVE +8*1(%rsp) 292 + mov %r13, frame_GPRSAVE +8*2(%rsp) 293 + mov %r14, frame_GPRSAVE +8*3(%rsp) 294 + mov %r15, frame_GPRSAVE +8*4(%rsp) 295 + 296 + updateblock: 297 + 298 + # Load state variables 299 + mov DIGEST(0), a_64 300 + mov DIGEST(1), b_64 301 + mov DIGEST(2), c_64 302 + mov DIGEST(3), d_64 303 + mov DIGEST(4), e_64 304 + mov DIGEST(5), f_64 305 + mov DIGEST(6), g_64 306 + mov DIGEST(7), h_64 307 + 308 + t = 0 309 + .rept 80/2 + 1 310 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) 311 + # +1 iteration because the scheduler leads hashing by 1 iteration 312 + .if t < 2 313 + # BSWAP 2 QWORDS 314 + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 315 + movdqu MSG(t), %xmm0 316 + pshufb %xmm1, %xmm0 # BSWAP 317 + movdqa %xmm0, W_t(t) # Store Scheduled Pair 318 + paddq K_t(t), %xmm0 # Compute W[t]+K[t] 319 + movdqa %xmm0, WK_2(t) # Store into WK for rounds 320 + .elseif t < 16 321 + # BSWAP 2 QWORDS# Compute 2 Rounds 322 + movdqu MSG(t), %xmm0 323 + pshufb %xmm1, %xmm0 # BSWAP 324 + SHA512_Round t-2 # Round t-2 325 + movdqa %xmm0, W_t(t) # Store Scheduled Pair 326 + paddq K_t(t), %xmm0 # Compute W[t]+K[t] 327 + SHA512_Round t-1 # Round t-1 328 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK 329 + .elseif t < 79 330 + # Schedule 2 QWORDS# Compute 2 Rounds 331 + SHA512_2Sched_2Round_sse t 332 + .else 333 + # Compute 2 Rounds 334 + SHA512_Round t-2 335 + SHA512_Round t-1 336 + .endif 337 + t = t+2 338 + .endr 339 + 340 + # Update digest 341 + add a_64, DIGEST(0) 342 + add b_64, DIGEST(1) 343 + add c_64, DIGEST(2) 344 + add d_64, DIGEST(3) 345 + add e_64, DIGEST(4) 346 + add f_64, DIGEST(5) 347 + add g_64, DIGEST(6) 348 + add h_64, DIGEST(7) 349 + 350 + # Advance to next message block 351 + add $16*8, msg 352 + dec msglen 353 + jnz updateblock 354 + 355 + # Restore GPRs 356 + mov frame_GPRSAVE(%rsp), %rbx 357 + mov frame_GPRSAVE +8*1(%rsp), %r12 358 + mov frame_GPRSAVE +8*2(%rsp), %r13 359 + mov frame_GPRSAVE +8*3(%rsp), %r14 360 + mov frame_GPRSAVE +8*4(%rsp), %r15 361 + 362 + # Restore Stack Pointer 363 + mov frame_RSPSAVE(%rsp), %rsp 364 + 365 + nowork: 366 + ret 367 + ENDPROC(sha512_transform_ssse3) 368 + 369 + ######################################################################## 370 + ### Binary Data 371 + 372 + .data 373 + 374 + .align 16 375 + 376 + # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
377 + XMM_QWORD_BSWAP: 378 + .octa 0x08090a0b0c0d0e0f0001020304050607 379 + 380 + # K[t] used in SHA512 hashing 381 + K512: 382 + .quad 0x428a2f98d728ae22,0x7137449123ef65cd 383 + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 384 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 385 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 386 + .quad 0xd807aa98a3030242,0x12835b0145706fbe 387 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 388 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 389 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 390 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 391 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 392 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 393 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 394 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 395 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 396 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 397 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 398 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 399 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 400 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 401 + .quad 0x81c2c92e47edaee6,0x92722c851482353b 402 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 403 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 404 + .quad 0xd192e819d6ef5218,0xd69906245565a910 405 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 406 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 407 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 408 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 409 + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 410 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 411 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 412 + .quad 0x90befffa23631e28,0xa4506cebde82bde9 413 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 414 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 415 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 416 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 417 + .quad 0x113f9804bef90dae,0x1b710b35131c471b 418 + .quad 0x28db77f523047d84,0x32caab7b40c72493 419 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 420 + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 421 + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
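
Both this file and the AVX2 version expand the same message schedule; SHA512_2Sched_2Round_sse produces two schedule qwords per invocation while retiring two earlier rounds. A scalar C sketch of the recurrence it implements (helper names illustrative, not kernel API):

#include <stdint.h>

/* Scalar sketch of the SHA-512 message expansion computed two qwords at a
 * time by SHA512_2Sched_2Round_sse above. */
static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

static uint64_t s0(uint64_t w)	/* s0(W[t-15]) */
{
	return ror64(w, 1) ^ ror64(w, 8) ^ (w >> 7);
}

static uint64_t s1(uint64_t w)	/* s1(W[t-2]) */
{
	return ror64(w, 19) ^ ror64(w, 61) ^ (w >> 6);
}

static void sha512_schedule(uint64_t W[80])	/* W[0..15] already loaded */
{
	int t;

	for (t = 16; t < 80; t++)
		W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
}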
+282
arch/x86/crypto/sha512_ssse3_glue.c
··· 1 + /* 2 + * Cryptographic API. 3 + * 4 + * Glue code for the SHA512 Secure Hash Algorithm assembler 5 + * implementation using supplemental SSE3 / AVX / AVX2 instructions. 6 + * 7 + * This file is based on sha512_generic.c 8 + * 9 + * Copyright (C) 2013 Intel Corporation 10 + * Author: Tim Chen <tim.c.chen@linux.intel.com> 11 + * 12 + * This program is free software; you can redistribute it and/or modify it 13 + * under the terms of the GNU General Public License as published by the Free 14 + * Software Foundation; either version 2 of the License, or (at your option) 15 + * any later version. 16 + * 17 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 + * SOFTWARE. 25 + * 26 + */ 27 + 28 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 29 + 30 + #include <crypto/internal/hash.h> 31 + #include <linux/init.h> 32 + #include <linux/module.h> 33 + #include <linux/mm.h> 34 + #include <linux/cryptohash.h> 35 + #include <linux/types.h> 36 + #include <crypto/sha.h> 37 + #include <asm/byteorder.h> 38 + #include <asm/i387.h> 39 + #include <asm/xcr.h> 40 + #include <asm/xsave.h> 41 + 42 + #include <linux/string.h> 43 + 44 + asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, 45 + u64 rounds); 46 + #ifdef CONFIG_AS_AVX 47 + asmlinkage void sha512_transform_avx(const char *data, u64 *digest, 48 + u64 rounds); 49 + #endif 50 + #ifdef CONFIG_AS_AVX2 51 + asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, 52 + u64 rounds); 53 + #endif 54 + 55 + static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); 56 + 57 + 58 + static int sha512_ssse3_init(struct shash_desc *desc) 59 + { 60 + struct sha512_state *sctx = shash_desc_ctx(desc); 61 + 62 + sctx->state[0] = SHA512_H0; 63 + sctx->state[1] = SHA512_H1; 64 + sctx->state[2] = SHA512_H2; 65 + sctx->state[3] = SHA512_H3; 66 + sctx->state[4] = SHA512_H4; 67 + sctx->state[5] = SHA512_H5; 68 + sctx->state[6] = SHA512_H6; 69 + sctx->state[7] = SHA512_H7; 70 + sctx->count[0] = sctx->count[1] = 0; 71 + 72 + return 0; 73 + } 74 + 75 + static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 76 + unsigned int len, unsigned int partial) 77 + { 78 + struct sha512_state *sctx = shash_desc_ctx(desc); 79 + unsigned int done = 0; 80 + 81 + sctx->count[0] += len; 82 + if (sctx->count[0] < len) 83 + sctx->count[1]++; 84 + 85 + if (partial) { 86 + done = SHA512_BLOCK_SIZE - partial; 87 + memcpy(sctx->buf + partial, data, done); 88 + sha512_transform_asm(sctx->buf, sctx->state, 1); 89 + } 90 + 91 + if (len - done >= SHA512_BLOCK_SIZE) { 92 + const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; 93 + 94 + sha512_transform_asm(data + done, sctx->state, (u64) rounds); 95 + 96 + done += rounds * SHA512_BLOCK_SIZE; 97 + } 98 + 99 + memcpy(sctx->buf, data + done, len - done); 100 + 101 + return 0; 102 + } 103 + 104 + static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, 105 + unsigned int len) 106 + { 107 + struct sha512_state *sctx = shash_desc_ctx(desc); 108 + unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; 109 + int res; 110 + 111 + /* 
Handle the fast case right here */
112 + if (partial + len < SHA512_BLOCK_SIZE) {
113 + sctx->count[0] += len;
114 + if (sctx->count[0] < len)
115 + sctx->count[1]++;
116 + memcpy(sctx->buf + partial, data, len);
117 +
118 + return 0;
119 + }
120 +
121 + if (!irq_fpu_usable()) {
122 + res = crypto_sha512_update(desc, data, len);
123 + } else {
124 + kernel_fpu_begin();
125 + res = __sha512_ssse3_update(desc, data, len, partial);
126 + kernel_fpu_end();
127 + }
128 +
129 + return res;
130 + }
131 +
132 +
133 + /* Add padding and return the message digest. */
134 + static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
135 + {
136 + struct sha512_state *sctx = shash_desc_ctx(desc);
137 + unsigned int i, index, padlen;
138 + __be64 *dst = (__be64 *)out;
139 + __be64 bits[2];
140 + static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
141 +
142 + /* save number of bits */
143 + bits[1] = cpu_to_be64(sctx->count[0] << 3);
144 + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
145 +
146 + /* Pad out to 112 mod 128 and append length */
147 + index = sctx->count[0] & 0x7f;
148 + padlen = (index < 112) ? (112 - index) : ((128+112) - index);
149 +
150 + if (!irq_fpu_usable()) {
151 + crypto_sha512_update(desc, padding, padlen);
152 + crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
153 + } else {
154 + kernel_fpu_begin();
155 + /* We need to fill a whole block for __sha512_ssse3_update() */
156 + if (padlen <= 112) {
157 + sctx->count[0] += padlen;
158 + if (sctx->count[0] < padlen)
159 + sctx->count[1]++;
160 + memcpy(sctx->buf + index, padding, padlen);
161 + } else {
162 + __sha512_ssse3_update(desc, padding, padlen, index);
163 + }
164 + __sha512_ssse3_update(desc, (const u8 *)&bits,
165 + sizeof(bits), 112);
166 + kernel_fpu_end();
167 + }
168 +
169 + /* Store state in digest */
170 + for (i = 0; i < 8; i++)
171 + dst[i] = cpu_to_be64(sctx->state[i]);
172 +
173 + /* Wipe context */
174 + memset(sctx, 0, sizeof(*sctx));
175 +
176 + return 0;
177 + }
178 +
179 + static int sha512_ssse3_export(struct shash_desc *desc, void *out)
180 + {
181 + struct sha512_state *sctx = shash_desc_ctx(desc);
182 +
183 + memcpy(out, sctx, sizeof(*sctx));
184 +
185 + return 0;
186 + }
187 +
188 + static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
189 + {
190 + struct sha512_state *sctx = shash_desc_ctx(desc);
191 +
192 + memcpy(sctx, in, sizeof(*sctx));
193 +
194 + return 0;
195 + }
196 +
197 + static struct shash_alg alg = {
198 + .digestsize = SHA512_DIGEST_SIZE,
199 + .init = sha512_ssse3_init,
200 + .update = sha512_ssse3_update,
201 + .final = sha512_ssse3_final,
202 + .export = sha512_ssse3_export,
203 + .import = sha512_ssse3_import,
204 + .descsize = sizeof(struct sha512_state),
205 + .statesize = sizeof(struct sha512_state),
206 + .base = {
207 + .cra_name = "sha512",
208 + .cra_driver_name = "sha512-ssse3",
209 + .cra_priority = 150,
210 + .cra_flags = CRYPTO_ALG_TYPE_SHASH,
211 + .cra_blocksize = SHA512_BLOCK_SIZE,
212 + .cra_module = THIS_MODULE,
213 + }
214 + };
215 +
216 + #ifdef CONFIG_AS_AVX
217 + static bool __init avx_usable(void)
218 + {
219 + u64 xcr0;
220 +
221 + if (!cpu_has_avx || !cpu_has_osxsave)
222 + return false;
223 +
224 + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
225 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
226 + pr_info("AVX detected but unusable.\n");
227 +
228 + return false;
229 + }
230 +
231 + return true;
232 + }
233 + #endif
234 +
235 + static int __init sha512_ssse3_mod_init(void) 236
+ {
237 + /* test for SSSE3 first */
238 + if (cpu_has_ssse3)
239 + sha512_transform_asm = sha512_transform_ssse3;
240 +
241 + #ifdef CONFIG_AS_AVX
242 + /* allow AVX to override SSSE3, it's a little faster */
243 + if (avx_usable()) {
244 + #ifdef CONFIG_AS_AVX2
245 + if (boot_cpu_has(X86_FEATURE_AVX2))
246 + sha512_transform_asm = sha512_transform_rorx;
247 + else
248 + #endif
249 + sha512_transform_asm = sha512_transform_avx;
250 + }
251 + #endif
252 +
253 + if (sha512_transform_asm) {
254 + #ifdef CONFIG_AS_AVX
255 + if (sha512_transform_asm == sha512_transform_avx)
256 + pr_info("Using AVX optimized SHA-512 implementation\n");
257 + #ifdef CONFIG_AS_AVX2
258 + else if (sha512_transform_asm == sha512_transform_rorx)
259 + pr_info("Using AVX2 optimized SHA-512 implementation\n");
260 + #endif
261 + else
262 + #endif
263 + pr_info("Using SSSE3 optimized SHA-512 implementation\n");
264 + return crypto_register_shash(&alg);
265 + }
266 + pr_info("Neither AVX nor SSSE3 is available/usable.\n");
267 +
268 + return -ENODEV;
269 + }
270 +
271 + static void __exit sha512_ssse3_mod_fini(void)
272 + {
273 + crypto_unregister_shash(&alg);
274 + }
275 +
276 + module_init(sha512_ssse3_mod_init);
277 + module_exit(sha512_ssse3_mod_fini);
278 +
279 + MODULE_LICENSE("GPL");
280 + MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
281 +
282 + MODULE_ALIAS("sha512");
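
One detail worth noting in the final() path above: the padding must bring the buffered byte count to 112 mod 128, so that the 16-byte big-endian bit count closes the last block. A small sketch of that arithmetic, with an illustrative function name:

#include <stdint.h>

/* Sketch of the padding-length arithmetic used by sha512_ssse3_final():
 * pad so that (count + padlen) % 128 == 112, leaving 16 bytes for the
 * appended 128-bit bit count. */
static unsigned int sha512_padlen(uint64_t count)
{
	unsigned int index = count & 0x7f;	/* bytes in the partial block */

	return (index < 112) ? (112 - index) : ((128 + 112) - index);
}

/* e.g. index == 120 gives padlen == 120: 120 + 120 + 16 = 256, two blocks */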
+47 -1
arch/x86/crypto/twofish-avx-x86_64-asm_64.S
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 - * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 7 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 8 8 * 9 9 * This program is free software; you can redistribute it and/or modify 10 10 * it under the terms of the GNU General Public License as published by ··· 33 33 34 34 .Lbswap128_mask: 35 35 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 36 + .Lxts_gf128mul_and_shl1_mask: 37 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 36 38 37 39 .text 38 40 ··· 410 408 411 409 ret; 412 410 ENDPROC(twofish_ctr_8way) 411 + 412 + ENTRY(twofish_xts_enc_8way) 413 + /* input: 414 + * %rdi: ctx, CTX 415 + * %rsi: dst 416 + * %rdx: src 417 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 418 + */ 419 + 420 + movq %rsi, %r11; 421 + 422 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 423 + load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2, 424 + RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); 425 + 426 + call __twofish_enc_blk8; 427 + 428 + /* dst <= regs xor IVs(in dst) */ 429 + store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); 430 + 431 + ret; 432 + ENDPROC(twofish_xts_enc_8way) 433 + 434 + ENTRY(twofish_xts_dec_8way) 435 + /* input: 436 + * %rdi: ctx, CTX 437 + * %rsi: dst 438 + * %rdx: src 439 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 440 + */ 441 + 442 + movq %rsi, %r11; 443 + 444 + /* regs <= src, dst <= IVs, regs <= regs xor IVs */ 445 + load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, 446 + RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask); 447 + 448 + call __twofish_dec_blk8; 449 + 450 + /* dst <= regs xor IVs(in dst) */ 451 + store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); 452 + 453 + ret; 454 + ENDPROC(twofish_xts_dec_8way)
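
The new .Lxts_gf128mul_and_shl1_mask constant feeds load_xts_8way(), which computes the per-block XTS tweaks in-register instead of calling back into C for every block. What it vectorizes is the usual multiplication of the 128-bit tweak by α in GF(2¹²⁸); a scalar C sketch, where the lo/hi split of the tweak is an assumption made for illustration:

#include <stdint.h>

/* Sketch of the XTS tweak update: shift the 128-bit tweak left by one bit
 * and reduce with the polynomial 0x87 when a bit falls off the top. */
struct xts_tweak {
	uint64_t lo, hi;	/* illustrative little-endian halves */
};

static void xts_mul_alpha(struct xts_tweak *t)
{
	uint64_t carry = t->hi >> 63;		/* bit shifted out at the top */

	t->hi = (t->hi << 1) | (t->lo >> 63);
	t->lo = (t->lo << 1) ^ (carry * 0x87);	/* conditional reduction */
}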
+600
arch/x86/crypto/twofish-avx2-asm_64.S
··· 1 + /* 2 + * x86_64/AVX2 assembler optimized version of Twofish 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + #include "glue_helper-asm-avx2.S" 15 + 16 + .file "twofish-avx2-asm_64.S" 17 + 18 + .data 19 + .align 16 20 + 21 + .Lvpshufb_mask0: 22 + .long 0x80808000 23 + .long 0x80808004 24 + .long 0x80808008 25 + .long 0x8080800c 26 + 27 + .Lbswap128_mask: 28 + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 29 + .Lxts_gf128mul_and_shl1_mask_0: 30 + .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 31 + .Lxts_gf128mul_and_shl1_mask_1: 32 + .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 33 + 34 + .text 35 + 36 + /* structure of crypto context */ 37 + #define s0 0 38 + #define s1 1024 39 + #define s2 2048 40 + #define s3 3072 41 + #define w 4096 42 + #define k 4128 43 + 44 + /* register macros */ 45 + #define CTX %rdi 46 + 47 + #define RS0 CTX 48 + #define RS1 %r8 49 + #define RS2 %r9 50 + #define RS3 %r10 51 + #define RK %r11 52 + #define RW %rax 53 + #define RROUND %r12 54 + #define RROUNDd %r12d 55 + 56 + #define RA0 %ymm8 57 + #define RB0 %ymm9 58 + #define RC0 %ymm10 59 + #define RD0 %ymm11 60 + #define RA1 %ymm12 61 + #define RB1 %ymm13 62 + #define RC1 %ymm14 63 + #define RD1 %ymm15 64 + 65 + /* temp regs */ 66 + #define RX0 %ymm0 67 + #define RY0 %ymm1 68 + #define RX1 %ymm2 69 + #define RY1 %ymm3 70 + #define RT0 %ymm4 71 + #define RIDX %ymm5 72 + 73 + #define RX0x %xmm0 74 + #define RY0x %xmm1 75 + #define RX1x %xmm2 76 + #define RY1x %xmm3 77 + #define RT0x %xmm4 78 + 79 + /* vpgatherdd mask and '-1' */ 80 + #define RNOT %ymm6 81 + 82 + /* byte mask, (-1 >> 24) */ 83 + #define RBYTE %ymm7 84 + 85 + /********************************************************************** 86 + 16-way AVX2 twofish 87 + **********************************************************************/ 88 + #define init_round_constants() \ 89 + vpcmpeqd RNOT, RNOT, RNOT; \ 90 + vpsrld $24, RNOT, RBYTE; \ 91 + leaq k(CTX), RK; \ 92 + leaq w(CTX), RW; \ 93 + leaq s1(CTX), RS1; \ 94 + leaq s2(CTX), RS2; \ 95 + leaq s3(CTX), RS3; \ 96 + 97 + #define g16(ab, rs0, rs1, rs2, rs3, xy) \ 98 + vpand RBYTE, ab ## 0, RIDX; \ 99 + vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \ 100 + vpcmpeqd RNOT, RNOT, RNOT; \ 101 + \ 102 + vpand RBYTE, ab ## 1, RIDX; \ 103 + vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \ 104 + vpcmpeqd RNOT, RNOT, RNOT; \ 105 + \ 106 + vpsrld $8, ab ## 0, RIDX; \ 107 + vpand RBYTE, RIDX, RIDX; \ 108 + vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ 109 + vpcmpeqd RNOT, RNOT, RNOT; \ 110 + vpxor RT0, xy ## 0, xy ## 0; \ 111 + \ 112 + vpsrld $8, ab ## 1, RIDX; \ 113 + vpand RBYTE, RIDX, RIDX; \ 114 + vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \ 115 + vpcmpeqd RNOT, RNOT, RNOT; \ 116 + vpxor RT0, xy ## 1, xy ## 1; \ 117 + \ 118 + vpsrld $16, ab ## 0, RIDX; \ 119 + vpand RBYTE, RIDX, RIDX; \ 120 + vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ 121 + vpcmpeqd RNOT, RNOT, RNOT; \ 122 + vpxor RT0, xy ## 0, xy ## 0; \ 123 + \ 124 + vpsrld $16, ab ## 1, RIDX; \ 125 + vpand RBYTE, RIDX, RIDX; \ 126 + vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \ 127 + vpcmpeqd RNOT, RNOT, RNOT; \ 128 + vpxor RT0, xy ## 1, xy ## 1; \ 129 + \ 130 + vpsrld $24, ab ## 0, RIDX; \ 131 + 
vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ 132 + vpcmpeqd RNOT, RNOT, RNOT; \ 133 + vpxor RT0, xy ## 0, xy ## 0; \ 134 + \ 135 + vpsrld $24, ab ## 1, RIDX; \ 136 + vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \ 137 + vpcmpeqd RNOT, RNOT, RNOT; \ 138 + vpxor RT0, xy ## 1, xy ## 1; 139 + 140 + #define g1_16(a, x) \ 141 + g16(a, RS0, RS1, RS2, RS3, x); 142 + 143 + #define g2_16(b, y) \ 144 + g16(b, RS1, RS2, RS3, RS0, y); 145 + 146 + #define encrypt_round_end16(a, b, c, d, nk) \ 147 + vpaddd RY0, RX0, RX0; \ 148 + vpaddd RX0, RY0, RY0; \ 149 + vpbroadcastd nk(RK,RROUND,8), RT0; \ 150 + vpaddd RT0, RX0, RX0; \ 151 + vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ 152 + vpaddd RT0, RY0, RY0; \ 153 + \ 154 + vpxor RY0, d ## 0, d ## 0; \ 155 + \ 156 + vpxor RX0, c ## 0, c ## 0; \ 157 + vpsrld $1, c ## 0, RT0; \ 158 + vpslld $31, c ## 0, c ## 0; \ 159 + vpor RT0, c ## 0, c ## 0; \ 160 + \ 161 + vpaddd RY1, RX1, RX1; \ 162 + vpaddd RX1, RY1, RY1; \ 163 + vpbroadcastd nk(RK,RROUND,8), RT0; \ 164 + vpaddd RT0, RX1, RX1; \ 165 + vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ 166 + vpaddd RT0, RY1, RY1; \ 167 + \ 168 + vpxor RY1, d ## 1, d ## 1; \ 169 + \ 170 + vpxor RX1, c ## 1, c ## 1; \ 171 + vpsrld $1, c ## 1, RT0; \ 172 + vpslld $31, c ## 1, c ## 1; \ 173 + vpor RT0, c ## 1, c ## 1; \ 174 + 175 + #define encrypt_round16(a, b, c, d, nk) \ 176 + g2_16(b, RY); \ 177 + \ 178 + vpslld $1, b ## 0, RT0; \ 179 + vpsrld $31, b ## 0, b ## 0; \ 180 + vpor RT0, b ## 0, b ## 0; \ 181 + \ 182 + vpslld $1, b ## 1, RT0; \ 183 + vpsrld $31, b ## 1, b ## 1; \ 184 + vpor RT0, b ## 1, b ## 1; \ 185 + \ 186 + g1_16(a, RX); \ 187 + \ 188 + encrypt_round_end16(a, b, c, d, nk); 189 + 190 + #define encrypt_round_first16(a, b, c, d, nk) \ 191 + vpslld $1, d ## 0, RT0; \ 192 + vpsrld $31, d ## 0, d ## 0; \ 193 + vpor RT0, d ## 0, d ## 0; \ 194 + \ 195 + vpslld $1, d ## 1, RT0; \ 196 + vpsrld $31, d ## 1, d ## 1; \ 197 + vpor RT0, d ## 1, d ## 1; \ 198 + \ 199 + encrypt_round16(a, b, c, d, nk); 200 + 201 + #define encrypt_round_last16(a, b, c, d, nk) \ 202 + g2_16(b, RY); \ 203 + \ 204 + g1_16(a, RX); \ 205 + \ 206 + encrypt_round_end16(a, b, c, d, nk); 207 + 208 + #define decrypt_round_end16(a, b, c, d, nk) \ 209 + vpaddd RY0, RX0, RX0; \ 210 + vpaddd RX0, RY0, RY0; \ 211 + vpbroadcastd nk(RK,RROUND,8), RT0; \ 212 + vpaddd RT0, RX0, RX0; \ 213 + vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ 214 + vpaddd RT0, RY0, RY0; \ 215 + \ 216 + vpxor RX0, c ## 0, c ## 0; \ 217 + \ 218 + vpxor RY0, d ## 0, d ## 0; \ 219 + vpsrld $1, d ## 0, RT0; \ 220 + vpslld $31, d ## 0, d ## 0; \ 221 + vpor RT0, d ## 0, d ## 0; \ 222 + \ 223 + vpaddd RY1, RX1, RX1; \ 224 + vpaddd RX1, RY1, RY1; \ 225 + vpbroadcastd nk(RK,RROUND,8), RT0; \ 226 + vpaddd RT0, RX1, RX1; \ 227 + vpbroadcastd 4+nk(RK,RROUND,8), RT0; \ 228 + vpaddd RT0, RY1, RY1; \ 229 + \ 230 + vpxor RX1, c ## 1, c ## 1; \ 231 + \ 232 + vpxor RY1, d ## 1, d ## 1; \ 233 + vpsrld $1, d ## 1, RT0; \ 234 + vpslld $31, d ## 1, d ## 1; \ 235 + vpor RT0, d ## 1, d ## 1; 236 + 237 + #define decrypt_round16(a, b, c, d, nk) \ 238 + g1_16(a, RX); \ 239 + \ 240 + vpslld $1, a ## 0, RT0; \ 241 + vpsrld $31, a ## 0, a ## 0; \ 242 + vpor RT0, a ## 0, a ## 0; \ 243 + \ 244 + vpslld $1, a ## 1, RT0; \ 245 + vpsrld $31, a ## 1, a ## 1; \ 246 + vpor RT0, a ## 1, a ## 1; \ 247 + \ 248 + g2_16(b, RY); \ 249 + \ 250 + decrypt_round_end16(a, b, c, d, nk); 251 + 252 + #define decrypt_round_first16(a, b, c, d, nk) \ 253 + vpslld $1, c ## 0, RT0; \ 254 + vpsrld $31, c ## 0, c ## 0; \ 255 + vpor RT0, c ## 0, c ## 0; \ 256 + \ 257 + vpslld $1, c ## 
1, RT0; \ 258 + vpsrld $31, c ## 1, c ## 1; \ 259 + vpor RT0, c ## 1, c ## 1; \ 260 + \ 261 + decrypt_round16(a, b, c, d, nk) 262 + 263 + #define decrypt_round_last16(a, b, c, d, nk) \ 264 + g1_16(a, RX); \ 265 + \ 266 + g2_16(b, RY); \ 267 + \ 268 + decrypt_round_end16(a, b, c, d, nk); 269 + 270 + #define encrypt_cycle16() \ 271 + encrypt_round16(RA, RB, RC, RD, 0); \ 272 + encrypt_round16(RC, RD, RA, RB, 8); 273 + 274 + #define encrypt_cycle_first16() \ 275 + encrypt_round_first16(RA, RB, RC, RD, 0); \ 276 + encrypt_round16(RC, RD, RA, RB, 8); 277 + 278 + #define encrypt_cycle_last16() \ 279 + encrypt_round16(RA, RB, RC, RD, 0); \ 280 + encrypt_round_last16(RC, RD, RA, RB, 8); 281 + 282 + #define decrypt_cycle16(n) \ 283 + decrypt_round16(RC, RD, RA, RB, 8); \ 284 + decrypt_round16(RA, RB, RC, RD, 0); 285 + 286 + #define decrypt_cycle_first16(n) \ 287 + decrypt_round_first16(RC, RD, RA, RB, 8); \ 288 + decrypt_round16(RA, RB, RC, RD, 0); 289 + 290 + #define decrypt_cycle_last16(n) \ 291 + decrypt_round16(RC, RD, RA, RB, 8); \ 292 + decrypt_round_last16(RA, RB, RC, RD, 0); 293 + 294 + #define transpose_4x4(x0,x1,x2,x3,t1,t2) \ 295 + vpunpckhdq x1, x0, t2; \ 296 + vpunpckldq x1, x0, x0; \ 297 + \ 298 + vpunpckldq x3, x2, t1; \ 299 + vpunpckhdq x3, x2, x2; \ 300 + \ 301 + vpunpckhqdq t1, x0, x1; \ 302 + vpunpcklqdq t1, x0, x0; \ 303 + \ 304 + vpunpckhqdq x2, t2, x3; \ 305 + vpunpcklqdq x2, t2, x2; 306 + 307 + #define read_blocks8(offs,a,b,c,d) \ 308 + transpose_4x4(a, b, c, d, RX0, RY0); 309 + 310 + #define write_blocks8(offs,a,b,c,d) \ 311 + transpose_4x4(a, b, c, d, RX0, RY0); 312 + 313 + #define inpack_enc8(a,b,c,d) \ 314 + vpbroadcastd 4*0(RW), RT0; \ 315 + vpxor RT0, a, a; \ 316 + \ 317 + vpbroadcastd 4*1(RW), RT0; \ 318 + vpxor RT0, b, b; \ 319 + \ 320 + vpbroadcastd 4*2(RW), RT0; \ 321 + vpxor RT0, c, c; \ 322 + \ 323 + vpbroadcastd 4*3(RW), RT0; \ 324 + vpxor RT0, d, d; 325 + 326 + #define outunpack_enc8(a,b,c,d) \ 327 + vpbroadcastd 4*4(RW), RX0; \ 328 + vpbroadcastd 4*5(RW), RY0; \ 329 + vpxor RX0, c, RX0; \ 330 + vpxor RY0, d, RY0; \ 331 + \ 332 + vpbroadcastd 4*6(RW), RT0; \ 333 + vpxor RT0, a, c; \ 334 + vpbroadcastd 4*7(RW), RT0; \ 335 + vpxor RT0, b, d; \ 336 + \ 337 + vmovdqa RX0, a; \ 338 + vmovdqa RY0, b; 339 + 340 + #define inpack_dec8(a,b,c,d) \ 341 + vpbroadcastd 4*4(RW), RX0; \ 342 + vpbroadcastd 4*5(RW), RY0; \ 343 + vpxor RX0, a, RX0; \ 344 + vpxor RY0, b, RY0; \ 345 + \ 346 + vpbroadcastd 4*6(RW), RT0; \ 347 + vpxor RT0, c, a; \ 348 + vpbroadcastd 4*7(RW), RT0; \ 349 + vpxor RT0, d, b; \ 350 + \ 351 + vmovdqa RX0, c; \ 352 + vmovdqa RY0, d; 353 + 354 + #define outunpack_dec8(a,b,c,d) \ 355 + vpbroadcastd 4*0(RW), RT0; \ 356 + vpxor RT0, a, a; \ 357 + \ 358 + vpbroadcastd 4*1(RW), RT0; \ 359 + vpxor RT0, b, b; \ 360 + \ 361 + vpbroadcastd 4*2(RW), RT0; \ 362 + vpxor RT0, c, c; \ 363 + \ 364 + vpbroadcastd 4*3(RW), RT0; \ 365 + vpxor RT0, d, d; 366 + 367 + #define read_blocks16(a,b,c,d) \ 368 + read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ 369 + read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); 370 + 371 + #define write_blocks16(a,b,c,d) \ 372 + write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ 373 + write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); 374 + 375 + #define xor_blocks16(a,b,c,d) \ 376 + xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \ 377 + xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1); 378 + 379 + #define inpack_enc16(a,b,c,d) \ 380 + inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ 381 + inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); 382 + 383 + 
#define outunpack_enc16(a,b,c,d) \ 384 + outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \ 385 + outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1); 386 + 387 + #define inpack_dec16(a,b,c,d) \ 388 + inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ 389 + inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); 390 + 391 + #define outunpack_dec16(a,b,c,d) \ 392 + outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ 393 + outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); 394 + 395 + .align 8 396 + __twofish_enc_blk16: 397 + /* input: 398 + * %rdi: ctx, CTX 399 + * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext 400 + * output: 401 + * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext 402 + */ 403 + init_round_constants(); 404 + 405 + read_blocks16(RA, RB, RC, RD); 406 + inpack_enc16(RA, RB, RC, RD); 407 + 408 + xorl RROUNDd, RROUNDd; 409 + encrypt_cycle_first16(); 410 + movl $2, RROUNDd; 411 + 412 + .align 4 413 + .L__enc_loop: 414 + encrypt_cycle16(); 415 + 416 + addl $2, RROUNDd; 417 + cmpl $14, RROUNDd; 418 + jne .L__enc_loop; 419 + 420 + encrypt_cycle_last16(); 421 + 422 + outunpack_enc16(RA, RB, RC, RD); 423 + write_blocks16(RA, RB, RC, RD); 424 + 425 + ret; 426 + ENDPROC(__twofish_enc_blk16) 427 + 428 + .align 8 429 + __twofish_dec_blk16: 430 + /* input: 431 + * %rdi: ctx, CTX 432 + * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext 433 + * output: 434 + * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext 435 + */ 436 + init_round_constants(); 437 + 438 + read_blocks16(RA, RB, RC, RD); 439 + inpack_dec16(RA, RB, RC, RD); 440 + 441 + movl $14, RROUNDd; 442 + decrypt_cycle_first16(); 443 + movl $12, RROUNDd; 444 + 445 + .align 4 446 + .L__dec_loop: 447 + decrypt_cycle16(); 448 + 449 + addl $-2, RROUNDd; 450 + jnz .L__dec_loop; 451 + 452 + decrypt_cycle_last16(); 453 + 454 + outunpack_dec16(RA, RB, RC, RD); 455 + write_blocks16(RA, RB, RC, RD); 456 + 457 + ret; 458 + ENDPROC(__twofish_dec_blk16) 459 + 460 + ENTRY(twofish_ecb_enc_16way) 461 + /* input: 462 + * %rdi: ctx, CTX 463 + * %rsi: dst 464 + * %rdx: src 465 + */ 466 + 467 + vzeroupper; 468 + pushq %r12; 469 + 470 + load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 471 + 472 + call __twofish_enc_blk16; 473 + 474 + store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 475 + 476 + popq %r12; 477 + vzeroupper; 478 + 479 + ret; 480 + ENDPROC(twofish_ecb_enc_16way) 481 + 482 + ENTRY(twofish_ecb_dec_16way) 483 + /* input: 484 + * %rdi: ctx, CTX 485 + * %rsi: dst 486 + * %rdx: src 487 + */ 488 + 489 + vzeroupper; 490 + pushq %r12; 491 + 492 + load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 493 + 494 + call __twofish_dec_blk16; 495 + 496 + store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 497 + 498 + popq %r12; 499 + vzeroupper; 500 + 501 + ret; 502 + ENDPROC(twofish_ecb_dec_16way) 503 + 504 + ENTRY(twofish_cbc_dec_16way) 505 + /* input: 506 + * %rdi: ctx, CTX 507 + * %rsi: dst 508 + * %rdx: src 509 + */ 510 + 511 + vzeroupper; 512 + pushq %r12; 513 + 514 + load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 515 + 516 + call __twofish_dec_blk16; 517 + 518 + store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1, 519 + RX0); 520 + 521 + popq %r12; 522 + vzeroupper; 523 + 524 + ret; 525 + ENDPROC(twofish_cbc_dec_16way) 526 + 527 + ENTRY(twofish_ctr_16way) 528 + /* input: 529 + * %rdi: ctx, CTX 530 + * %rsi: dst (16 blocks) 531 + * %rdx: src (16 blocks) 532 + * %rcx: iv (little endian, 128bit) 533 + */ 534 + 535 + vzeroupper; 536 + pushq %r12; 537 + 538 + load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, 
RA1, RB1, RC1, 539 + RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, 540 + RBYTE); 541 + 542 + call __twofish_enc_blk16; 543 + 544 + store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 545 + 546 + popq %r12; 547 + vzeroupper; 548 + 549 + ret; 550 + ENDPROC(twofish_ctr_16way) 551 + 552 + .align 8 553 + twofish_xts_crypt_16way: 554 + /* input: 555 + * %rdi: ctx, CTX 556 + * %rsi: dst (16 blocks) 557 + * %rdx: src (16 blocks) 558 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 559 + * %r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16 560 + */ 561 + 562 + vzeroupper; 563 + pushq %r12; 564 + 565 + load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, 566 + RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT, 567 + .Lxts_gf128mul_and_shl1_mask_0, 568 + .Lxts_gf128mul_and_shl1_mask_1); 569 + 570 + call *%r8; 571 + 572 + store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1); 573 + 574 + popq %r12; 575 + vzeroupper; 576 + 577 + ret; 578 + ENDPROC(twofish_xts_crypt_16way) 579 + 580 + ENTRY(twofish_xts_enc_16way) 581 + /* input: 582 + * %rdi: ctx, CTX 583 + * %rsi: dst (16 blocks) 584 + * %rdx: src (16 blocks) 585 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 586 + */ 587 + leaq __twofish_enc_blk16, %r8; 588 + jmp twofish_xts_crypt_16way; 589 + ENDPROC(twofish_xts_enc_16way) 590 + 591 + ENTRY(twofish_xts_dec_16way) 592 + /* input: 593 + * %rdi: ctx, CTX 594 + * %rsi: dst (16 blocks) 595 + * %rdx: src (16 blocks) 596 + * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) 597 + */ 598 + leaq __twofish_dec_blk16, %r8; 599 + jmp twofish_xts_crypt_16way; 600 + ENDPROC(twofish_xts_dec_16way)
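
The g16() macro above is the heart of the 16-way path: Twofish's g function is four byte-indexed 32-bit table lookups xored together, and vpgatherdd performs each lookup for eight blocks per ymm register. A scalar C sketch of the per-word lookup being vectorized (the table argument stands in for the key-dependent s-boxes at s0..s3):

#include <stdint.h>

/* Scalar sketch of the lookup that g16()/vpgatherdd performs for sixteen
 * blocks at once: four byte-indexed 32-bit tables xored together. */
static uint32_t twofish_g(const uint32_t s[4][256], uint32_t x)
{
	return s[0][x & 0xff] ^
	       s[1][(x >> 8) & 0xff] ^
	       s[2][(x >> 16) & 0xff] ^
	       s[3][x >> 24];
}

The repeated vpcmpeqd RNOT, RNOT, RNOT in the macro re-arms the gather mask, since vpgatherdd clears the mask register as elements complete.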
+584
arch/x86/crypto/twofish_avx2_glue.c
··· 1 + /* 2 + * Glue Code for x86_64/AVX2 assembler optimized version of Twofish 3 + * 4 + * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License as published by 8 + * the Free Software Foundation; either version 2 of the License, or 9 + * (at your option) any later version. 10 + * 11 + */ 12 + 13 + #include <linux/module.h> 14 + #include <linux/types.h> 15 + #include <linux/crypto.h> 16 + #include <linux/err.h> 17 + #include <crypto/algapi.h> 18 + #include <crypto/ctr.h> 19 + #include <crypto/twofish.h> 20 + #include <crypto/lrw.h> 21 + #include <crypto/xts.h> 22 + #include <asm/xcr.h> 23 + #include <asm/xsave.h> 24 + #include <asm/crypto/twofish.h> 25 + #include <asm/crypto/ablk_helper.h> 26 + #include <asm/crypto/glue_helper.h> 27 + #include <crypto/scatterwalk.h> 28 + 29 + #define TF_AVX2_PARALLEL_BLOCKS 16 30 + 31 + /* 16-way AVX2 parallel cipher functions */ 32 + asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst, 33 + const u8 *src); 34 + asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst, 35 + const u8 *src); 36 + asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); 37 + 38 + asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src, 39 + le128 *iv); 40 + 41 + asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst, 42 + const u8 *src, le128 *iv); 43 + asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst, 44 + const u8 *src, le128 *iv); 45 + 46 + static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, 47 + const u8 *src) 48 + { 49 + __twofish_enc_blk_3way(ctx, dst, src, false); 50 + } 51 + 52 + static const struct common_glue_ctx twofish_enc = { 53 + .num_funcs = 4, 54 + .fpu_blocks_limit = 8, 55 + 56 + .funcs = { { 57 + .num_blocks = 16, 58 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) } 59 + }, { 60 + .num_blocks = 8, 61 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } 62 + }, { 63 + .num_blocks = 3, 64 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } 65 + }, { 66 + .num_blocks = 1, 67 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } 68 + } } 69 + }; 70 + 71 + static const struct common_glue_ctx twofish_ctr = { 72 + .num_funcs = 4, 73 + .fpu_blocks_limit = 8, 74 + 75 + .funcs = { { 76 + .num_blocks = 16, 77 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) } 78 + }, { 79 + .num_blocks = 8, 80 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } 81 + }, { 82 + .num_blocks = 3, 83 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } 84 + }, { 85 + .num_blocks = 1, 86 + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } 87 + } } 88 + }; 89 + 90 + static const struct common_glue_ctx twofish_enc_xts = { 91 + .num_funcs = 3, 92 + .fpu_blocks_limit = 8, 93 + 94 + .funcs = { { 95 + .num_blocks = 16, 96 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) } 97 + }, { 98 + .num_blocks = 8, 99 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } 100 + }, { 101 + .num_blocks = 1, 102 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } 103 + } } 104 + }; 105 + 106 + static const struct common_glue_ctx twofish_dec = { 107 + .num_funcs = 4, 108 + .fpu_blocks_limit = 8, 109 + 110 + .funcs = { { 111 + .num_blocks = 16, 112 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) } 113 + }, { 114 + .num_blocks = 8, 115 + .fn_u = { 
.ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } 116 + }, { 117 + .num_blocks = 3, 118 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } 119 + }, { 120 + .num_blocks = 1, 121 + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } 122 + } } 123 + }; 124 + 125 + static const struct common_glue_ctx twofish_dec_cbc = { 126 + .num_funcs = 4, 127 + .fpu_blocks_limit = 8, 128 + 129 + .funcs = { { 130 + .num_blocks = 16, 131 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) } 132 + }, { 133 + .num_blocks = 8, 134 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } 135 + }, { 136 + .num_blocks = 3, 137 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } 138 + }, { 139 + .num_blocks = 1, 140 + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } 141 + } } 142 + }; 143 + 144 + static const struct common_glue_ctx twofish_dec_xts = { 145 + .num_funcs = 3, 146 + .fpu_blocks_limit = 8, 147 + 148 + .funcs = { { 149 + .num_blocks = 16, 150 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) } 151 + }, { 152 + .num_blocks = 8, 153 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } 154 + }, { 155 + .num_blocks = 1, 156 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } 157 + } } 158 + }; 159 + 160 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 161 + struct scatterlist *src, unsigned int nbytes) 162 + { 163 + return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); 164 + } 165 + 166 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 167 + struct scatterlist *src, unsigned int nbytes) 168 + { 169 + return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); 170 + } 171 + 172 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 173 + struct scatterlist *src, unsigned int nbytes) 174 + { 175 + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, 176 + dst, src, nbytes); 177 + } 178 + 179 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 180 + struct scatterlist *src, unsigned int nbytes) 181 + { 182 + return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, 183 + nbytes); 184 + } 185 + 186 + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, 187 + struct scatterlist *src, unsigned int nbytes) 188 + { 189 + return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); 190 + } 191 + 192 + static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) 193 + { 194 + /* since reusing AVX functions, starts using FPU at 8 parallel blocks */ 195 + return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes); 196 + } 197 + 198 + static inline void twofish_fpu_end(bool fpu_enabled) 199 + { 200 + glue_fpu_end(fpu_enabled); 201 + } 202 + 203 + struct crypt_priv { 204 + struct twofish_ctx *ctx; 205 + bool fpu_enabled; 206 + }; 207 + 208 + static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 209 + { 210 + const unsigned int bsize = TF_BLOCK_SIZE; 211 + struct crypt_priv *ctx = priv; 212 + int i; 213 + 214 + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 215 + 216 + while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { 217 + twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst); 218 + srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; 219 + nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; 220 + } 221 + 222 + while (nbytes >= 8 * bsize) { 223 + twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); 224 + srcdst += bsize * 8; 225 + nbytes -= bsize * 
8; 226 + } 227 + 228 + while (nbytes >= 3 * bsize) { 229 + twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); 230 + srcdst += bsize * 3; 231 + nbytes -= bsize * 3; 232 + } 233 + 234 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 235 + twofish_enc_blk(ctx->ctx, srcdst, srcdst); 236 + } 237 + 238 + static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) 239 + { 240 + const unsigned int bsize = TF_BLOCK_SIZE; 241 + struct crypt_priv *ctx = priv; 242 + int i; 243 + 244 + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 245 + 246 + while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) { 247 + twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst); 248 + srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS; 249 + nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS; 250 + } 251 + 252 + while (nbytes >= 8 * bsize) { 253 + twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); 254 + srcdst += bsize * 8; 255 + nbytes -= bsize * 8; 256 + } 257 + 258 + while (nbytes >= 3 * bsize) { 259 + twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); 260 + srcdst += bsize * 3; 261 + nbytes -= bsize * 3; 262 + } 263 + 264 + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) 265 + twofish_dec_blk(ctx->ctx, srcdst, srcdst); 266 + } 267 + 268 + static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 269 + struct scatterlist *src, unsigned int nbytes) 270 + { 271 + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 272 + be128 buf[TF_AVX2_PARALLEL_BLOCKS]; 273 + struct crypt_priv crypt_ctx = { 274 + .ctx = &ctx->twofish_ctx, 275 + .fpu_enabled = false, 276 + }; 277 + struct lrw_crypt_req req = { 278 + .tbuf = buf, 279 + .tbuflen = sizeof(buf), 280 + 281 + .table_ctx = &ctx->lrw_table, 282 + .crypt_ctx = &crypt_ctx, 283 + .crypt_fn = encrypt_callback, 284 + }; 285 + int ret; 286 + 287 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 288 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 289 + twofish_fpu_end(crypt_ctx.fpu_enabled); 290 + 291 + return ret; 292 + } 293 + 294 + static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 295 + struct scatterlist *src, unsigned int nbytes) 296 + { 297 + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 298 + be128 buf[TF_AVX2_PARALLEL_BLOCKS]; 299 + struct crypt_priv crypt_ctx = { 300 + .ctx = &ctx->twofish_ctx, 301 + .fpu_enabled = false, 302 + }; 303 + struct lrw_crypt_req req = { 304 + .tbuf = buf, 305 + .tbuflen = sizeof(buf), 306 + 307 + .table_ctx = &ctx->lrw_table, 308 + .crypt_ctx = &crypt_ctx, 309 + .crypt_fn = decrypt_callback, 310 + }; 311 + int ret; 312 + 313 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 314 + ret = lrw_crypt(desc, dst, src, nbytes, &req); 315 + twofish_fpu_end(crypt_ctx.fpu_enabled); 316 + 317 + return ret; 318 + } 319 + 320 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 321 + struct scatterlist *src, unsigned int nbytes) 322 + { 323 + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 324 + 325 + return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, 326 + XTS_TWEAK_CAST(twofish_enc_blk), 327 + &ctx->tweak_ctx, &ctx->crypt_ctx); 328 + } 329 + 330 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 331 + struct scatterlist *src, unsigned int nbytes) 332 + { 333 + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 334 + 335 + return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, 336 + XTS_TWEAK_CAST(twofish_enc_blk), 337 + &ctx->tweak_ctx, &ctx->crypt_ctx); 338 + } 339 + 340 + 
static struct crypto_alg tf_algs[10] = { { 341 + .cra_name = "__ecb-twofish-avx2", 342 + .cra_driver_name = "__driver-ecb-twofish-avx2", 343 + .cra_priority = 0, 344 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 345 + .cra_blocksize = TF_BLOCK_SIZE, 346 + .cra_ctxsize = sizeof(struct twofish_ctx), 347 + .cra_alignmask = 0, 348 + .cra_type = &crypto_blkcipher_type, 349 + .cra_module = THIS_MODULE, 350 + .cra_u = { 351 + .blkcipher = { 352 + .min_keysize = TF_MIN_KEY_SIZE, 353 + .max_keysize = TF_MAX_KEY_SIZE, 354 + .setkey = twofish_setkey, 355 + .encrypt = ecb_encrypt, 356 + .decrypt = ecb_decrypt, 357 + }, 358 + }, 359 + }, { 360 + .cra_name = "__cbc-twofish-avx2", 361 + .cra_driver_name = "__driver-cbc-twofish-avx2", 362 + .cra_priority = 0, 363 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 364 + .cra_blocksize = TF_BLOCK_SIZE, 365 + .cra_ctxsize = sizeof(struct twofish_ctx), 366 + .cra_alignmask = 0, 367 + .cra_type = &crypto_blkcipher_type, 368 + .cra_module = THIS_MODULE, 369 + .cra_u = { 370 + .blkcipher = { 371 + .min_keysize = TF_MIN_KEY_SIZE, 372 + .max_keysize = TF_MAX_KEY_SIZE, 373 + .setkey = twofish_setkey, 374 + .encrypt = cbc_encrypt, 375 + .decrypt = cbc_decrypt, 376 + }, 377 + }, 378 + }, { 379 + .cra_name = "__ctr-twofish-avx2", 380 + .cra_driver_name = "__driver-ctr-twofish-avx2", 381 + .cra_priority = 0, 382 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 383 + .cra_blocksize = 1, 384 + .cra_ctxsize = sizeof(struct twofish_ctx), 385 + .cra_alignmask = 0, 386 + .cra_type = &crypto_blkcipher_type, 387 + .cra_module = THIS_MODULE, 388 + .cra_u = { 389 + .blkcipher = { 390 + .min_keysize = TF_MIN_KEY_SIZE, 391 + .max_keysize = TF_MAX_KEY_SIZE, 392 + .ivsize = TF_BLOCK_SIZE, 393 + .setkey = twofish_setkey, 394 + .encrypt = ctr_crypt, 395 + .decrypt = ctr_crypt, 396 + }, 397 + }, 398 + }, { 399 + .cra_name = "__lrw-twofish-avx2", 400 + .cra_driver_name = "__driver-lrw-twofish-avx2", 401 + .cra_priority = 0, 402 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 403 + .cra_blocksize = TF_BLOCK_SIZE, 404 + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), 405 + .cra_alignmask = 0, 406 + .cra_type = &crypto_blkcipher_type, 407 + .cra_module = THIS_MODULE, 408 + .cra_exit = lrw_twofish_exit_tfm, 409 + .cra_u = { 410 + .blkcipher = { 411 + .min_keysize = TF_MIN_KEY_SIZE + 412 + TF_BLOCK_SIZE, 413 + .max_keysize = TF_MAX_KEY_SIZE + 414 + TF_BLOCK_SIZE, 415 + .ivsize = TF_BLOCK_SIZE, 416 + .setkey = lrw_twofish_setkey, 417 + .encrypt = lrw_encrypt, 418 + .decrypt = lrw_decrypt, 419 + }, 420 + }, 421 + }, { 422 + .cra_name = "__xts-twofish-avx2", 423 + .cra_driver_name = "__driver-xts-twofish-avx2", 424 + .cra_priority = 0, 425 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 426 + .cra_blocksize = TF_BLOCK_SIZE, 427 + .cra_ctxsize = sizeof(struct twofish_xts_ctx), 428 + .cra_alignmask = 0, 429 + .cra_type = &crypto_blkcipher_type, 430 + .cra_module = THIS_MODULE, 431 + .cra_u = { 432 + .blkcipher = { 433 + .min_keysize = TF_MIN_KEY_SIZE * 2, 434 + .max_keysize = TF_MAX_KEY_SIZE * 2, 435 + .ivsize = TF_BLOCK_SIZE, 436 + .setkey = xts_twofish_setkey, 437 + .encrypt = xts_encrypt, 438 + .decrypt = xts_decrypt, 439 + }, 440 + }, 441 + }, { 442 + .cra_name = "ecb(twofish)", 443 + .cra_driver_name = "ecb-twofish-avx2", 444 + .cra_priority = 500, 445 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 446 + .cra_blocksize = TF_BLOCK_SIZE, 447 + .cra_ctxsize = sizeof(struct async_helper_ctx), 448 + .cra_alignmask = 0, 449 + .cra_type = &crypto_ablkcipher_type, 450 + .cra_module = THIS_MODULE, 451 + .cra_init = 
ablk_init, 452 + .cra_exit = ablk_exit, 453 + .cra_u = { 454 + .ablkcipher = { 455 + .min_keysize = TF_MIN_KEY_SIZE, 456 + .max_keysize = TF_MAX_KEY_SIZE, 457 + .setkey = ablk_set_key, 458 + .encrypt = ablk_encrypt, 459 + .decrypt = ablk_decrypt, 460 + }, 461 + }, 462 + }, { 463 + .cra_name = "cbc(twofish)", 464 + .cra_driver_name = "cbc-twofish-avx2", 465 + .cra_priority = 500, 466 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 467 + .cra_blocksize = TF_BLOCK_SIZE, 468 + .cra_ctxsize = sizeof(struct async_helper_ctx), 469 + .cra_alignmask = 0, 470 + .cra_type = &crypto_ablkcipher_type, 471 + .cra_module = THIS_MODULE, 472 + .cra_init = ablk_init, 473 + .cra_exit = ablk_exit, 474 + .cra_u = { 475 + .ablkcipher = { 476 + .min_keysize = TF_MIN_KEY_SIZE, 477 + .max_keysize = TF_MAX_KEY_SIZE, 478 + .ivsize = TF_BLOCK_SIZE, 479 + .setkey = ablk_set_key, 480 + .encrypt = __ablk_encrypt, 481 + .decrypt = ablk_decrypt, 482 + }, 483 + }, 484 + }, { 485 + .cra_name = "ctr(twofish)", 486 + .cra_driver_name = "ctr-twofish-avx2", 487 + .cra_priority = 500, 488 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 489 + .cra_blocksize = 1, 490 + .cra_ctxsize = sizeof(struct async_helper_ctx), 491 + .cra_alignmask = 0, 492 + .cra_type = &crypto_ablkcipher_type, 493 + .cra_module = THIS_MODULE, 494 + .cra_init = ablk_init, 495 + .cra_exit = ablk_exit, 496 + .cra_u = { 497 + .ablkcipher = { 498 + .min_keysize = TF_MIN_KEY_SIZE, 499 + .max_keysize = TF_MAX_KEY_SIZE, 500 + .ivsize = TF_BLOCK_SIZE, 501 + .setkey = ablk_set_key, 502 + .encrypt = ablk_encrypt, 503 + .decrypt = ablk_encrypt, 504 + .geniv = "chainiv", 505 + }, 506 + }, 507 + }, { 508 + .cra_name = "lrw(twofish)", 509 + .cra_driver_name = "lrw-twofish-avx2", 510 + .cra_priority = 500, 511 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 512 + .cra_blocksize = TF_BLOCK_SIZE, 513 + .cra_ctxsize = sizeof(struct async_helper_ctx), 514 + .cra_alignmask = 0, 515 + .cra_type = &crypto_ablkcipher_type, 516 + .cra_module = THIS_MODULE, 517 + .cra_init = ablk_init, 518 + .cra_exit = ablk_exit, 519 + .cra_u = { 520 + .ablkcipher = { 521 + .min_keysize = TF_MIN_KEY_SIZE + 522 + TF_BLOCK_SIZE, 523 + .max_keysize = TF_MAX_KEY_SIZE + 524 + TF_BLOCK_SIZE, 525 + .ivsize = TF_BLOCK_SIZE, 526 + .setkey = ablk_set_key, 527 + .encrypt = ablk_encrypt, 528 + .decrypt = ablk_decrypt, 529 + }, 530 + }, 531 + }, { 532 + .cra_name = "xts(twofish)", 533 + .cra_driver_name = "xts-twofish-avx2", 534 + .cra_priority = 500, 535 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 536 + .cra_blocksize = TF_BLOCK_SIZE, 537 + .cra_ctxsize = sizeof(struct async_helper_ctx), 538 + .cra_alignmask = 0, 539 + .cra_type = &crypto_ablkcipher_type, 540 + .cra_module = THIS_MODULE, 541 + .cra_init = ablk_init, 542 + .cra_exit = ablk_exit, 543 + .cra_u = { 544 + .ablkcipher = { 545 + .min_keysize = TF_MIN_KEY_SIZE * 2, 546 + .max_keysize = TF_MAX_KEY_SIZE * 2, 547 + .ivsize = TF_BLOCK_SIZE, 548 + .setkey = ablk_set_key, 549 + .encrypt = ablk_encrypt, 550 + .decrypt = ablk_decrypt, 551 + }, 552 + }, 553 + } }; 554 + 555 + static int __init init(void) 556 + { 557 + u64 xcr0; 558 + 559 + if (!cpu_has_avx2 || !cpu_has_osxsave) { 560 + pr_info("AVX2 instructions are not detected.\n"); 561 + return -ENODEV; 562 + } 563 + 564 + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 565 + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { 566 + pr_info("AVX2 detected but unusable.\n"); 567 + return -ENODEV; 568 + } 569 + 570 + return 
crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); 571 + } 572 + 573 + static void __exit fini(void) 574 + { 575 + crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); 576 + } 577 + 578 + module_init(init); 579 + module_exit(fini); 580 + 581 + MODULE_LICENSE("GPL"); 582 + MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized"); 583 + MODULE_ALIAS("twofish"); 584 + MODULE_ALIAS("twofish-asm");
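With cra_priority 500, the ablkcipher entries above outrank the generic implementations and the earlier AVX driver, so an ordinary lookup by algorithm name picks up the AVX2 code transparently on capable CPUs; the priority-0 __driver-* variants are internal and only reached through cryptd. A hedged kernel-side sketch of such a lookup (hypothetical function, error handling trimmed):

#include <linux/crypto.h>
#include <linux/err.h>

static int pick_xts_twofish(void)
{
        struct crypto_ablkcipher *tfm;

        /* resolves to "xts-twofish-avx2" once this module registered */
        tfm = crypto_alloc_ablkcipher("xts(twofish)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* XTS keys are two concatenated Twofish keys, hence the *2
         * min/max keysize above; set a key and queue requests via the
         * usual ablkcipher request API. */

        crypto_free_ablkcipher(tfm);
        return 0;
}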
+61 -40
arch/x86/crypto/twofish_avx_glue.c
··· 4 4 * Copyright (C) 2012 Johannes Goetzfried 5 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 6 6 * 7 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 8 + * 7 9 * This program is free software; you can redistribute it and/or modify 8 10 * it under the terms of the GNU General Public License as published by 9 11 * the Free Software Foundation; either version 2 of the License, or ··· 50 48 /* 8-way parallel cipher functions */ 51 49 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, 52 50 const u8 *src); 51 + EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way); 52 + 53 53 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, 54 54 const u8 *src); 55 + EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way); 55 56 56 57 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, 57 58 const u8 *src); 59 + EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way); 60 + 58 61 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, 59 62 const u8 *src, le128 *iv); 63 + EXPORT_SYMBOL_GPL(twofish_ctr_8way); 64 + 65 + asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, 66 + const u8 *src, le128 *iv); 67 + EXPORT_SYMBOL_GPL(twofish_xts_enc_8way); 68 + asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, 69 + const u8 *src, le128 *iv); 70 + EXPORT_SYMBOL_GPL(twofish_xts_dec_8way); 60 71 61 72 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, 62 73 const u8 *src) 63 74 { 64 75 __twofish_enc_blk_3way(ctx, dst, src, false); 65 76 } 77 + 78 + void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) 79 + { 80 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 81 + GLUE_FUNC_CAST(twofish_enc_blk)); 82 + } 83 + EXPORT_SYMBOL_GPL(twofish_xts_enc); 84 + 85 + void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) 86 + { 87 + glue_xts_crypt_128bit_one(ctx, dst, src, iv, 88 + GLUE_FUNC_CAST(twofish_dec_blk)); 89 + } 90 + EXPORT_SYMBOL_GPL(twofish_xts_dec); 66 91 67 92 68 93 static const struct common_glue_ctx twofish_enc = { ··· 124 95 } } 125 96 }; 126 97 98 + static const struct common_glue_ctx twofish_enc_xts = { 99 + .num_funcs = 2, 100 + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, 101 + 102 + .funcs = { { 103 + .num_blocks = TWOFISH_PARALLEL_BLOCKS, 104 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) } 105 + }, { 106 + .num_blocks = 1, 107 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) } 108 + } } 109 + }; 110 + 127 111 static const struct common_glue_ctx twofish_dec = { 128 112 .num_funcs = 3, 129 113 .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, ··· 166 124 }, { 167 125 .num_blocks = 1, 168 126 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } 127 + } } 128 + }; 129 + 130 + static const struct common_glue_ctx twofish_dec_xts = { 131 + .num_funcs = 2, 132 + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, 133 + 134 + .funcs = { { 135 + .num_blocks = TWOFISH_PARALLEL_BLOCKS, 136 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) } 137 + }, { 138 + .num_blocks = 1, 139 + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) } 169 140 } } 170 141 }; 171 142 ··· 330 275 struct scatterlist *src, unsigned int nbytes) 331 276 { 332 277 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 333 - be128 buf[TWOFISH_PARALLEL_BLOCKS]; 334 - struct crypt_priv crypt_ctx = { 335 - .ctx = &ctx->crypt_ctx, 336 - .fpu_enabled = false, 337 - }; 338 - struct xts_crypt_req req = { 339 - .tbuf = buf, 340 - .tbuflen = sizeof(buf), 341 278 342 - .tweak_ctx = 
&ctx->tweak_ctx, 343 - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), 344 - .crypt_ctx = &crypt_ctx, 345 - .crypt_fn = encrypt_callback, 346 - }; 347 - int ret; 348 - 349 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 350 - ret = xts_crypt(desc, dst, src, nbytes, &req); 351 - twofish_fpu_end(crypt_ctx.fpu_enabled); 352 - 353 - return ret; 279 + return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes, 280 + XTS_TWEAK_CAST(twofish_enc_blk), 281 + &ctx->tweak_ctx, &ctx->crypt_ctx); 354 282 } 355 283 356 284 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 357 285 struct scatterlist *src, unsigned int nbytes) 358 286 { 359 287 struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 360 - be128 buf[TWOFISH_PARALLEL_BLOCKS]; 361 - struct crypt_priv crypt_ctx = { 362 - .ctx = &ctx->crypt_ctx, 363 - .fpu_enabled = false, 364 - }; 365 - struct xts_crypt_req req = { 366 - .tbuf = buf, 367 - .tbuflen = sizeof(buf), 368 288 369 - .tweak_ctx = &ctx->tweak_ctx, 370 - .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), 371 - .crypt_ctx = &crypt_ctx, 372 - .crypt_fn = decrypt_callback, 373 - }; 374 - int ret; 375 - 376 - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 377 - ret = xts_crypt(desc, dst, src, nbytes, &req); 378 - twofish_fpu_end(crypt_ctx.fpu_enabled); 379 - 380 - return ret; 289 + return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes, 290 + XTS_TWEAK_CAST(twofish_enc_blk), 291 + &ctx->tweak_ctx, &ctx->crypt_ctx); 381 292 } 382 293 383 294 static struct crypto_alg twofish_algs[10] = { {
+1
arch/x86/include/asm/cpufeature.h
··· 293 293 #define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) 294 294 #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) 295 295 #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) 296 + #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) 296 297 #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) 297 298 #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) 298 299 #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
+43
arch/x86/include/asm/crypto/blowfish.h
··· 1 + #ifndef ASM_X86_BLOWFISH_H 2 + #define ASM_X86_BLOWFISH_H 3 + 4 + #include <linux/crypto.h> 5 + #include <crypto/blowfish.h> 6 + 7 + #define BF_PARALLEL_BLOCKS 4 8 + 9 + /* regular block cipher functions */ 10 + asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, 11 + bool xor); 12 + asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); 13 + 14 + /* 4-way parallel cipher functions */ 15 + asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, 16 + const u8 *src, bool xor); 17 + asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, 18 + const u8 *src); 19 + 20 + static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) 21 + { 22 + __blowfish_enc_blk(ctx, dst, src, false); 23 + } 24 + 25 + static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, 26 + const u8 *src) 27 + { 28 + __blowfish_enc_blk(ctx, dst, src, true); 29 + } 30 + 31 + static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, 32 + const u8 *src) 33 + { 34 + __blowfish_enc_blk_4way(ctx, dst, src, false); 35 + } 36 + 37 + static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, 38 + const u8 *src) 39 + { 40 + __blowfish_enc_blk_4way(ctx, dst, src, true); 41 + } 42 + 43 + #endif
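The bool argument lets one assembler entry point serve both plain encryption and the encrypt-and-XOR form that CTR mode wants. An illustrative fragment, assuming (as the existing blowfish CTR glue uses it) that the xor variant XORs the encryption of src into dst; memcpy via <linux/string.h>:

/* One CTR block via the xor variant: dst = plain ^ E_K(ctr).
 * Illustrative only; Blowfish's block size is 8 bytes. */
static void ctr_one_block(struct bf_ctx *ctx, u8 *dst, const u8 *plain,
                          u8 *ctrblk)
{
        memcpy(dst, plain, 8);                  /* dst = P */
        blowfish_enc_blk_xor(ctx, dst, ctrblk); /* dst ^= E_K(ctr) */
}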
+19
arch/x86/include/asm/crypto/camellia.h
··· 48 48 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, 49 49 const u8 *src); 50 50 51 + /* 16-way parallel cipher functions (avx/aes-ni) */ 52 + asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, 53 + const u8 *src); 54 + asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, 55 + const u8 *src); 56 + 57 + asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, 58 + const u8 *src); 59 + asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, 60 + const u8 *src, le128 *iv); 61 + 62 + asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, 63 + const u8 *src, le128 *iv); 64 + asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, 65 + const u8 *src, le128 *iv); 66 + 51 67 static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, 52 68 const u8 *src) 53 69 { ··· 94 78 le128 *iv); 95 79 extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, 96 80 le128 *iv); 81 + 82 + extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); 83 + extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); 97 84 98 85 #endif /* ASM_X86_CAMELLIA_H */
+24
arch/x86/include/asm/crypto/glue_helper.h
··· 14 14 typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); 15 15 typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, 16 16 le128 *iv); 17 + typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, 18 + le128 *iv); 17 19 18 20 #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) 19 21 #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) 20 22 #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) 23 + #define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) 21 24 22 25 struct common_glue_func_entry { 23 26 unsigned int num_blocks; /* number of blocks that @fn will process */ ··· 28 25 common_glue_func_t ecb; 29 26 common_glue_cbc_func_t cbc; 30 27 common_glue_ctr_func_t ctr; 28 + common_glue_xts_func_t xts; 31 29 } fn_u; 32 30 }; 33 31 ··· 100 96 i->b = cpu_to_le64(b); 101 97 } 102 98 99 + static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) 100 + { 101 + u64 a = le64_to_cpu(src->a); 102 + u64 b = le64_to_cpu(src->b); 103 + u64 _tt = ((s64)a >> 63) & 0x87; 104 + 105 + dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); 106 + dst->b = cpu_to_le64((b << 1) ^ _tt); 107 + } 108 + 103 109 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, 104 110 struct blkcipher_desc *desc, 105 111 struct scatterlist *dst, ··· 131 117 struct blkcipher_desc *desc, 132 118 struct scatterlist *dst, 133 119 struct scatterlist *src, unsigned int nbytes); 120 + 121 + extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, 122 + struct blkcipher_desc *desc, 123 + struct scatterlist *dst, 124 + struct scatterlist *src, unsigned int nbytes, 125 + common_glue_func_t tweak_fn, void *tweak_ctx, 126 + void *crypt_ctx); 127 + 128 + extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, 129 + le128 *iv, common_glue_func_t fn); 134 130 135 131 #endif /* _CRYPTO_GLUE_HELPER_H */
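le128_gf128mul_x_ble() is the tweak-doubling step the new XTS helper runs between blocks: a left shift of the 128-bit tweak in the little-endian block convention, folding the carry out of bit 127 back in as 0x87, the low bits of the XTS modulus x^128 + x^7 + x^2 + x + 1. A standalone userspace check of the same arithmetic; le128_sk mirrors le128, with b holding the low 64 bits and a the high 64 bits:

#include <stdint.h>
#include <stdio.h>

struct le128_sk { uint64_t b, a; }; /* b = low half, a = high half */

static void gf128mul_x_ble_sk(struct le128_sk *dst, const struct le128_sk *src)
{
        uint64_t a = src->a, b = src->b;
        uint64_t tt = (uint64_t)((int64_t)a >> 63) & 0x87; /* carry from bit 127 */

        dst->a = (a << 1) ^ (b >> 63);
        dst->b = (b << 1) ^ tt;
}

int main(void)
{
        /* bit 127 set: the doubling must reduce, leaving only 0x87 */
        struct le128_sk t = { .b = 0, .a = 0x8000000000000000ULL };

        gf128mul_x_ble_sk(&t, &t);
        printf("a=%016llx b=%016llx\n", /* expect a=0, b=0x87 */
               (unsigned long long)t.a, (unsigned long long)t.b);
        return 0;
}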
+29
arch/x86/include/asm/crypto/serpent-avx.h
··· 6 6 7 7 #define SERPENT_PARALLEL_BLOCKS 8 8 8 9 + struct serpent_lrw_ctx { 10 + struct lrw_table_ctx lrw_table; 11 + struct serpent_ctx serpent_ctx; 12 + }; 13 + 14 + struct serpent_xts_ctx { 15 + struct serpent_ctx tweak_ctx; 16 + struct serpent_ctx crypt_ctx; 17 + }; 18 + 9 19 asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, 10 20 const u8 *src); 11 21 asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, ··· 25 15 const u8 *src); 26 16 asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, 27 17 const u8 *src, le128 *iv); 18 + 19 + asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, 20 + const u8 *src, le128 *iv); 21 + asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, 22 + const u8 *src, le128 *iv); 23 + 24 + extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, 25 + le128 *iv); 26 + 27 + extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); 28 + extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); 29 + 30 + extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 31 + unsigned int keylen); 32 + 33 + extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm); 34 + 35 + extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, 36 + unsigned int keylen); 28 37 29 38 #endif
+18
arch/x86/include/asm/crypto/twofish.h
··· 28 28 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, 29 29 const u8 *src); 30 30 31 + /* 8-way parallel cipher functions */ 32 + asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, 33 + const u8 *src); 34 + asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, 35 + const u8 *src); 36 + asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, 37 + const u8 *src); 38 + asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, 39 + const u8 *src, le128 *iv); 40 + asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst, 41 + const u8 *src, le128 *iv); 42 + asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst, 43 + const u8 *src, le128 *iv); 44 + 31 45 /* helpers from twofish_x86_64-3way module */ 32 46 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); 33 47 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, ··· 56 42 57 43 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, 58 44 unsigned int keylen); 45 + 46 + /* helpers from twofish-avx module */ 47 + extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); 48 + extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); 59 49 60 50 #endif /* ASM_X86_TWOFISH_H */
+123 -10
crypto/Kconfig
··· 198 198 select CRYPTO_CTR 199 199 select CRYPTO_AEAD 200 200 select CRYPTO_GHASH 201 + select CRYPTO_NULL 201 202 help 202 203 Support for Galois/Counter Mode (GCM) and Galois Message 203 204 Authentication Code (GMAC). Required for IPSec. ··· 283 282 284 283 comment "Hash modes" 285 284 285 + config CRYPTO_CMAC 286 + tristate "CMAC support" 287 + select CRYPTO_HASH 288 + select CRYPTO_MANAGER 289 + help 290 + Cipher-based Message Authentication Code (CMAC) specified by 291 + The National Institute of Standards and Technology (NIST). 292 + 293 + https://tools.ietf.org/html/rfc4493 294 + http://csrc.nist.gov/publications/nistpubs/800-38B/SP_800-38B.pdf 295 + 286 296 config CRYPTO_HMAC 287 297 tristate "HMAC support" 288 298 select CRYPTO_HASH ··· 334 322 by iSCSI for header and data digests and by others. 335 323 See Castagnoli93. Module will be crc32c. 336 324 337 - config CRYPTO_CRC32C_X86_64 338 - bool 339 - depends on X86 && 64BIT 340 - select CRYPTO_HASH 341 - help 342 - In Intel processor with SSE4.2 supported, the processor will 343 - support CRC32C calculation using hardware accelerated CRC32 344 - instruction optimized with PCLMULQDQ instruction when available. 345 - 346 325 config CRYPTO_CRC32C_INTEL 347 326 tristate "CRC32c INTEL hardware acceleration" 348 327 depends on X86 349 - select CRYPTO_CRC32C_X86_64 if 64BIT 350 328 select CRYPTO_HASH 351 329 help 352 330 In Intel processor with SSE4.2 supported, the processor will ··· 481 479 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented 482 480 using Supplemental SSE3 (SSSE3) instructions or Advanced Vector 483 481 Extensions (AVX), when available. 482 + 483 + config CRYPTO_SHA256_SSSE3 484 + tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)" 485 + depends on X86 && 64BIT 486 + select CRYPTO_SHA256 487 + select CRYPTO_HASH 488 + help 489 + SHA-256 secure hash standard (DFIPS 180-2) implemented 490 + using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector 491 + Extensions version 1 (AVX1), or Advanced Vector Extensions 492 + version 2 (AVX2) instructions, when available. 493 + 494 + config CRYPTO_SHA512_SSSE3 495 + tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" 496 + depends on X86 && 64BIT 497 + select CRYPTO_SHA512 498 + select CRYPTO_HASH 499 + help 500 + SHA-512 secure hash standard (DFIPS 180-2) implemented 501 + using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector 502 + Extensions version 1 (AVX1), or Advanced Vector Extensions 503 + version 2 (AVX2) instructions, when available. 484 504 485 505 config CRYPTO_SHA1_SPARC64 486 506 tristate "SHA1 digest algorithm (SPARC64)" ··· 678 654 select CRYPTO_CRYPTD 679 655 select CRYPTO_ABLK_HELPER_X86 680 656 select CRYPTO_ALGAPI 657 + select CRYPTO_GLUE_HELPER_X86 if 64BIT 681 658 select CRYPTO_LRW 682 659 select CRYPTO_XTS 683 660 help ··· 820 795 See also: 821 796 <http://www.schneier.com/blowfish.html> 822 797 798 + config CRYPTO_BLOWFISH_AVX2_X86_64 799 + tristate "Blowfish cipher algorithm (x86_64/AVX2)" 800 + depends on X86 && 64BIT 801 + select CRYPTO_ALGAPI 802 + select CRYPTO_CRYPTD 803 + select CRYPTO_ABLK_HELPER_X86 804 + select CRYPTO_BLOWFISH_COMMON 805 + select CRYPTO_BLOWFISH_X86_64 806 + help 807 + Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier. 808 + 809 + This is a variable key length cipher which can use keys from 32 810 + bits to 448 bits in length. It's fast, simple and specifically 811 + designed for use on "large microprocessors". 
812 + 813 + See also: 814 + <http://www.schneier.com/blowfish.html> 815 + 823 816 config CRYPTO_CAMELLIA 824 817 tristate "Camellia cipher algorithms" 825 818 depends on CRYPTO ··· 885 842 select CRYPTO_XTS 886 843 help 887 844 Camellia cipher algorithm module (x86_64/AES-NI/AVX). 845 + 846 + Camellia is a symmetric key block cipher developed jointly 847 + at NTT and Mitsubishi Electric Corporation. 848 + 849 + The Camellia specifies three key sizes: 128, 192 and 256 bits. 850 + 851 + See also: 852 + <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html> 853 + 854 + config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 855 + tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" 856 + depends on X86 && 64BIT 857 + depends on CRYPTO 858 + select CRYPTO_ALGAPI 859 + select CRYPTO_CRYPTD 860 + select CRYPTO_ABLK_HELPER_X86 861 + select CRYPTO_GLUE_HELPER_X86 862 + select CRYPTO_CAMELLIA_X86_64 863 + select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 864 + select CRYPTO_LRW 865 + select CRYPTO_XTS 866 + help 867 + Camellia cipher algorithm module (x86_64/AES-NI/AVX2). 888 868 889 869 Camellia is a symmetric key block cipher developed jointly 890 870 at NTT and Mitsubishi Electric Corporation. ··· 1154 1088 See also: 1155 1089 <http://www.cl.cam.ac.uk/~rja14/serpent.html> 1156 1090 1091 + config CRYPTO_SERPENT_AVX2_X86_64 1092 + tristate "Serpent cipher algorithm (x86_64/AVX2)" 1093 + depends on X86 && 64BIT 1094 + select CRYPTO_ALGAPI 1095 + select CRYPTO_CRYPTD 1096 + select CRYPTO_ABLK_HELPER_X86 1097 + select CRYPTO_GLUE_HELPER_X86 1098 + select CRYPTO_SERPENT 1099 + select CRYPTO_SERPENT_AVX_X86_64 1100 + select CRYPTO_LRW 1101 + select CRYPTO_XTS 1102 + help 1103 + Serpent cipher algorithm, by Anderson, Biham & Knudsen. 1104 + 1105 + Keys are allowed to be from 0 to 256 bits in length, in steps 1106 + of 8 bits. 1107 + 1108 + This module provides Serpent cipher algorithm that processes 16 1109 + blocks parallel using AVX2 instruction set. 1110 + 1111 + See also: 1112 + <http://www.cl.cam.ac.uk/~rja14/serpent.html> 1113 + 1157 1114 config CRYPTO_TEA 1158 1115 tristate "TEA, XTEA and XETA cipher algorithms" 1159 1116 select CRYPTO_ALGAPI ··· 1292 1203 1293 1204 This module provides the Twofish cipher algorithm that processes 1294 1205 eight blocks parallel using the AVX Instruction Set. 1206 + 1207 + See also: 1208 + <http://www.schneier.com/twofish.html> 1209 + 1210 + config CRYPTO_TWOFISH_AVX2_X86_64 1211 + tristate "Twofish cipher algorithm (x86_64/AVX2)" 1212 + depends on X86 && 64BIT 1213 + select CRYPTO_ALGAPI 1214 + select CRYPTO_CRYPTD 1215 + select CRYPTO_ABLK_HELPER_X86 1216 + select CRYPTO_GLUE_HELPER_X86 1217 + select CRYPTO_TWOFISH_COMMON 1218 + select CRYPTO_TWOFISH_X86_64 1219 + select CRYPTO_TWOFISH_X86_64_3WAY 1220 + select CRYPTO_TWOFISH_AVX_X86_64 1221 + select CRYPTO_LRW 1222 + select CRYPTO_XTS 1223 + help 1224 + Twofish cipher algorithm (x86_64/AVX2). 1225 + 1226 + Twofish was submitted as an AES (Advanced Encryption Standard) 1227 + candidate cipher by researchers at CounterPane Systems. It is a 1228 + 16 round block cipher supporting key sizes of 128, 192, and 256 1229 + bits. 1295 1230 1296 1231 See also: 1297 1232 <http://www.schneier.com/twofish.html>
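For reference, a minimal .config fragment pulling in the new modules on an x86_64 build (each symbol depends on X86 && 64BIT and selects its own helpers and base implementations, so nothing further needs setting by hand):

CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m
CONFIG_CRYPTO_TWOFISH_AVX2_X86_64=m
CONFIG_CRYPTO_SHA256_SSSE3=m
CONFIG_CRYPTO_SHA512_SSSE3=m
CONFIG_CRYPTO_CMAC=m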
+1
crypto/Makefile
··· 32 32 33 33 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o 34 34 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o 35 + obj-$(CONFIG_CRYPTO_CMAC) += cmac.o 35 36 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o 36 37 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o 37 38 obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o
+315
crypto/cmac.c
··· 1 + /* 2 + * CMAC: Cipher Block Mode for Authentication 3 + * 4 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> 5 + * 6 + * Based on work by: 7 + * Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com> 8 + * Based on crypto/xcbc.c: 9 + * Copyright © 2006 USAGI/WIDE Project, 10 + * Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org> 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License as published by 14 + * the Free Software Foundation; either version 2 of the License, or 15 + * (at your option) any later version. 16 + * 17 + */ 18 + 19 + #include <crypto/internal/hash.h> 20 + #include <linux/err.h> 21 + #include <linux/kernel.h> 22 + #include <linux/module.h> 23 + 24 + /* 25 + * +------------------------ 26 + * | <parent tfm> 27 + * +------------------------ 28 + * | cmac_tfm_ctx 29 + * +------------------------ 30 + * | consts (block size * 2) 31 + * +------------------------ 32 + */ 33 + struct cmac_tfm_ctx { 34 + struct crypto_cipher *child; 35 + u8 ctx[]; 36 + }; 37 + 38 + /* 39 + * +------------------------ 40 + * | <shash desc> 41 + * +------------------------ 42 + * | cmac_desc_ctx 43 + * +------------------------ 44 + * | odds (block size) 45 + * +------------------------ 46 + * | prev (block size) 47 + * +------------------------ 48 + */ 49 + struct cmac_desc_ctx { 50 + unsigned int len; 51 + u8 ctx[]; 52 + }; 53 + 54 + static int crypto_cmac_digest_setkey(struct crypto_shash *parent, 55 + const u8 *inkey, unsigned int keylen) 56 + { 57 + unsigned long alignmask = crypto_shash_alignmask(parent); 58 + struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent); 59 + unsigned int bs = crypto_shash_blocksize(parent); 60 + __be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); 61 + u64 _const[2]; 62 + int i, err = 0; 63 + u8 msb_mask, gfmask; 64 + 65 + err = crypto_cipher_setkey(ctx->child, inkey, keylen); 66 + if (err) 67 + return err; 68 + 69 + /* encrypt the zero block */ 70 + memset(consts, 0, bs); 71 + crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts); 72 + 73 + switch (bs) { 74 + case 16: 75 + gfmask = 0x87; 76 + _const[0] = be64_to_cpu(consts[1]); 77 + _const[1] = be64_to_cpu(consts[0]); 78 + 79 + /* gf(2^128) multiply zero-ciphertext with u and u^2 */ 80 + for (i = 0; i < 4; i += 2) { 81 + msb_mask = ((s64)_const[1] >> 63) & gfmask; 82 + _const[1] = (_const[1] << 1) | (_const[0] >> 63); 83 + _const[0] = (_const[0] << 1) ^ msb_mask; 84 + 85 + consts[i + 0] = cpu_to_be64(_const[1]); 86 + consts[i + 1] = cpu_to_be64(_const[0]); 87 + } 88 + 89 + break; 90 + case 8: 91 + gfmask = 0x1B; 92 + _const[0] = be64_to_cpu(consts[0]); 93 + 94 + /* gf(2^64) multiply zero-ciphertext with u and u^2 */ 95 + for (i = 0; i < 2; i++) { 96 + msb_mask = ((s64)_const[0] >> 63) & gfmask; 97 + _const[0] = (_const[0] << 1) ^ msb_mask; 98 + 99 + consts[i] = cpu_to_be64(_const[0]); 100 + } 101 + 102 + break; 103 + } 104 + 105 + return 0; 106 + } 107 + 108 + static int crypto_cmac_digest_init(struct shash_desc *pdesc) 109 + { 110 + unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm); 111 + struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); 112 + int bs = crypto_shash_blocksize(pdesc->tfm); 113 + u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs; 114 + 115 + ctx->len = 0; 116 + memset(prev, 0, bs); 117 + 118 + return 0; 119 + } 120 + 121 + static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p, 122 + unsigned int len) 123 + { 124 + struct 
crypto_shash *parent = pdesc->tfm; 125 + unsigned long alignmask = crypto_shash_alignmask(parent); 126 + struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); 127 + struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); 128 + struct crypto_cipher *tfm = tctx->child; 129 + int bs = crypto_shash_blocksize(parent); 130 + u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); 131 + u8 *prev = odds + bs; 132 + 133 + /* checking the data can fill the block */ 134 + if ((ctx->len + len) <= bs) { 135 + memcpy(odds + ctx->len, p, len); 136 + ctx->len += len; 137 + return 0; 138 + } 139 + 140 + /* filling odds with new data and encrypting it */ 141 + memcpy(odds + ctx->len, p, bs - ctx->len); 142 + len -= bs - ctx->len; 143 + p += bs - ctx->len; 144 + 145 + crypto_xor(prev, odds, bs); 146 + crypto_cipher_encrypt_one(tfm, prev, prev); 147 + 148 + /* clearing the length */ 149 + ctx->len = 0; 150 + 151 + /* encrypting the rest of data */ 152 + while (len > bs) { 153 + crypto_xor(prev, p, bs); 154 + crypto_cipher_encrypt_one(tfm, prev, prev); 155 + p += bs; 156 + len -= bs; 157 + } 158 + 159 + /* keeping the surplus of blocksize */ 160 + if (len) { 161 + memcpy(odds, p, len); 162 + ctx->len = len; 163 + } 164 + 165 + return 0; 166 + } 167 + 168 + static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out) 169 + { 170 + struct crypto_shash *parent = pdesc->tfm; 171 + unsigned long alignmask = crypto_shash_alignmask(parent); 172 + struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); 173 + struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); 174 + struct crypto_cipher *tfm = tctx->child; 175 + int bs = crypto_shash_blocksize(parent); 176 + u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1); 177 + u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1); 178 + u8 *prev = odds + bs; 179 + unsigned int offset = 0; 180 + 181 + if (ctx->len != bs) { 182 + unsigned int rlen; 183 + u8 *p = odds + ctx->len; 184 + 185 + *p = 0x80; 186 + p++; 187 + 188 + rlen = bs - ctx->len - 1; 189 + if (rlen) 190 + memset(p, 0, rlen); 191 + 192 + offset += bs; 193 + } 194 + 195 + crypto_xor(prev, odds, bs); 196 + crypto_xor(prev, consts + offset, bs); 197 + 198 + crypto_cipher_encrypt_one(tfm, out, prev); 199 + 200 + return 0; 201 + } 202 + 203 + static int cmac_init_tfm(struct crypto_tfm *tfm) 204 + { 205 + struct crypto_cipher *cipher; 206 + struct crypto_instance *inst = (void *)tfm->__crt_alg; 207 + struct crypto_spawn *spawn = crypto_instance_ctx(inst); 208 + struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); 209 + 210 + cipher = crypto_spawn_cipher(spawn); 211 + if (IS_ERR(cipher)) 212 + return PTR_ERR(cipher); 213 + 214 + ctx->child = cipher; 215 + 216 + return 0; 217 + }; 218 + 219 + static void cmac_exit_tfm(struct crypto_tfm *tfm) 220 + { 221 + struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); 222 + crypto_free_cipher(ctx->child); 223 + } 224 + 225 + static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) 226 + { 227 + struct shash_instance *inst; 228 + struct crypto_alg *alg; 229 + unsigned long alignmask; 230 + int err; 231 + 232 + err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH); 233 + if (err) 234 + return err; 235 + 236 + alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER, 237 + CRYPTO_ALG_TYPE_MASK); 238 + if (IS_ERR(alg)) 239 + return PTR_ERR(alg); 240 + 241 + switch (alg->cra_blocksize) { 242 + case 16: 243 + case 8: 244 + break; 245 + default: 246 + goto out_put_alg; 247 + } 248 + 249 + inst = shash_alloc_instance("cmac", alg); 250 + err = PTR_ERR(inst); 251 + if (IS_ERR(inst)) 
252 + goto out_put_alg; 253 + 254 + err = crypto_init_spawn(shash_instance_ctx(inst), alg, 255 + shash_crypto_instance(inst), 256 + CRYPTO_ALG_TYPE_MASK); 257 + if (err) 258 + goto out_free_inst; 259 + 260 + alignmask = alg->cra_alignmask | (sizeof(long) - 1); 261 + inst->alg.base.cra_alignmask = alignmask; 262 + inst->alg.base.cra_priority = alg->cra_priority; 263 + inst->alg.base.cra_blocksize = alg->cra_blocksize; 264 + 265 + inst->alg.digestsize = alg->cra_blocksize; 266 + inst->alg.descsize = 267 + ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment()) 268 + + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)) 269 + + alg->cra_blocksize * 2; 270 + 271 + inst->alg.base.cra_ctxsize = 272 + ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1) 273 + + alg->cra_blocksize * 2; 274 + 275 + inst->alg.base.cra_init = cmac_init_tfm; 276 + inst->alg.base.cra_exit = cmac_exit_tfm; 277 + 278 + inst->alg.init = crypto_cmac_digest_init; 279 + inst->alg.update = crypto_cmac_digest_update; 280 + inst->alg.final = crypto_cmac_digest_final; 281 + inst->alg.setkey = crypto_cmac_digest_setkey; 282 + 283 + err = shash_register_instance(tmpl, inst); 284 + if (err) { 285 + out_free_inst: 286 + shash_free_instance(shash_crypto_instance(inst)); 287 + } 288 + 289 + out_put_alg: 290 + crypto_mod_put(alg); 291 + return err; 292 + } 293 + 294 + static struct crypto_template crypto_cmac_tmpl = { 295 + .name = "cmac", 296 + .create = cmac_create, 297 + .free = shash_free_instance, 298 + .module = THIS_MODULE, 299 + }; 300 + 301 + static int __init crypto_cmac_module_init(void) 302 + { 303 + return crypto_register_template(&crypto_cmac_tmpl); 304 + } 305 + 306 + static void __exit crypto_cmac_module_exit(void) 307 + { 308 + crypto_unregister_template(&crypto_cmac_tmpl); 309 + } 310 + 311 + module_init(crypto_cmac_module_init); 312 + module_exit(crypto_cmac_module_exit); 313 + 314 + MODULE_LICENSE("GPL"); 315 + MODULE_DESCRIPTION("CMAC keyed hash algorithm");
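crypto_cmac_digest_setkey() above builds the two finalisation constants by encrypting the all-zero block and doubling it twice in GF(2^128) (GF(2^64) for 64-bit ciphers), the K1/K2 derivation of RFC 4493. Consuming the resulting "cmac(aes)" template from kernel code is then a plain shash digest; a hedged sketch, with hypothetical names and abbreviated error paths:

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

static int cmac_aes_digest(const u8 *key, unsigned int keylen,
                           const u8 *msg, unsigned int len, u8 *mac)
{
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        int err;

        tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_shash_setkey(tfm, key, keylen);
        if (err)
                goto out;

        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
        if (!desc) {
                err = -ENOMEM;
                goto out;
        }
        desc->tfm = tfm;
        desc->flags = 0;

        err = crypto_shash_digest(desc, msg, len, mac); /* 16-byte tag for AES */
        kfree(desc);
out:
        crypto_free_shash(tfm);
        return err;
}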
+2 -2
crypto/crypto_user.c
··· 440 440 441 441 #undef MSGSIZE 442 442 443 - static struct crypto_link { 443 + static const struct crypto_link { 444 444 int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); 445 445 int (*dump)(struct sk_buff *, struct netlink_callback *); 446 446 int (*done)(struct netlink_callback *); ··· 456 456 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 457 457 { 458 458 struct nlattr *attrs[CRYPTOCFGA_MAX+1]; 459 - struct crypto_link *link; 459 + const struct crypto_link *link; 460 460 int type, err; 461 461 462 462 type = nlh->nlmsg_type;
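Constifying the dispatch table moves it into .rodata and lets the compiler reject accidental writes. A minimal illustration with hypothetical names:

#include <stdio.h>

struct link { int (*doit)(int); };

static int doit_echo(int x) { return x; }

static const struct link table[] = { { .doit = doit_echo } };

int main(void)
{
        /* table[0].doit = NULL;  -- would now fail to compile */
        printf("%d\n", table[0].doit(42));
        return 0;
}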
+97 -19
crypto/gcm.c
··· 37 37 u8 nonce[4]; 38 38 }; 39 39 40 + struct crypto_rfc4543_instance_ctx { 41 + struct crypto_aead_spawn aead; 42 + struct crypto_skcipher_spawn null; 43 + }; 44 + 40 45 struct crypto_rfc4543_ctx { 41 46 struct crypto_aead *child; 47 + struct crypto_blkcipher *null; 42 48 u8 nonce[4]; 43 49 }; 44 50 ··· 1100 1094 return crypto_aead_setauthsize(ctx->child, authsize); 1101 1095 } 1102 1096 1097 + static void crypto_rfc4543_done(struct crypto_async_request *areq, int err) 1098 + { 1099 + struct aead_request *req = areq->data; 1100 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 1101 + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); 1102 + 1103 + if (!err) { 1104 + scatterwalk_map_and_copy(rctx->auth_tag, req->dst, 1105 + req->cryptlen, 1106 + crypto_aead_authsize(aead), 1); 1107 + } 1108 + 1109 + aead_request_complete(req, err); 1110 + } 1111 + 1103 1112 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req, 1104 - int enc) 1113 + bool enc) 1105 1114 { 1106 1115 struct crypto_aead *aead = crypto_aead_reqtfm(req); 1107 1116 struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); 1108 1117 struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); 1109 1118 struct aead_request *subreq = &rctx->subreq; 1110 - struct scatterlist *dst = req->dst; 1119 + struct scatterlist *src = req->src; 1111 1120 struct scatterlist *cipher = rctx->cipher; 1112 1121 struct scatterlist *payload = rctx->payload; 1113 1122 struct scatterlist *assoc = rctx->assoc; 1114 1123 unsigned int authsize = crypto_aead_authsize(aead); 1115 1124 unsigned int assoclen = req->assoclen; 1116 - struct page *dstp; 1117 - u8 *vdst; 1125 + struct page *srcp; 1126 + u8 *vsrc; 1118 1127 u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child), 1119 1128 crypto_aead_alignmask(ctx->child) + 1); 1120 1129 ··· 1140 1119 if (enc) 1141 1120 memset(rctx->auth_tag, 0, authsize); 1142 1121 else 1143 - scatterwalk_map_and_copy(rctx->auth_tag, dst, 1122 + scatterwalk_map_and_copy(rctx->auth_tag, src, 1144 1123 req->cryptlen - authsize, 1145 1124 authsize, 0); 1146 1125 1147 1126 sg_init_one(cipher, rctx->auth_tag, authsize); 1148 1127 1149 1128 /* construct the aad */ 1150 - dstp = sg_page(dst); 1151 - vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset; 1129 + srcp = sg_page(src); 1130 + vsrc = PageHighMem(srcp) ? NULL : page_address(srcp) + src->offset; 1152 1131 1153 1132 sg_init_table(payload, 2); 1154 1133 sg_set_buf(payload, req->iv, 8); 1155 - scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2); 1134 + scatterwalk_crypto_chain(payload, src, vsrc == req->iv + 8, 2); 1156 1135 assoclen += 8 + req->cryptlen - (enc ? 0 : authsize); 1157 1136 1158 1137 if (req->assoc->length == req->assoclen) { ··· 1171 1150 scatterwalk_crypto_chain(assoc, payload, 0, 2); 1172 1151 1173 1152 aead_request_set_tfm(subreq, ctx->child); 1174 - aead_request_set_callback(subreq, req->base.flags, req->base.complete, 1175 - req->base.data); 1153 + aead_request_set_callback(subreq, req->base.flags, crypto_rfc4543_done, 1154 + req); 1176 1155 aead_request_set_crypt(subreq, cipher, cipher, enc ? 
0 : authsize, iv); 1177 1156 aead_request_set_assoc(subreq, assoc, assoclen); 1178 1157 1179 1158 return subreq; 1159 + } 1160 + 1161 + static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc) 1162 + { 1163 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 1164 + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); 1165 + unsigned int authsize = crypto_aead_authsize(aead); 1166 + unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize); 1167 + struct blkcipher_desc desc = { 1168 + .tfm = ctx->null, 1169 + }; 1170 + 1171 + return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes); 1180 1172 } 1181 1173 1182 1174 static int crypto_rfc4543_encrypt(struct aead_request *req) ··· 1199 1165 struct aead_request *subreq; 1200 1166 int err; 1201 1167 1202 - subreq = crypto_rfc4543_crypt(req, 1); 1168 + if (req->src != req->dst) { 1169 + err = crypto_rfc4543_copy_src_to_dst(req, true); 1170 + if (err) 1171 + return err; 1172 + } 1173 + 1174 + subreq = crypto_rfc4543_crypt(req, true); 1203 1175 err = crypto_aead_encrypt(subreq); 1204 1176 if (err) 1205 1177 return err; ··· 1218 1178 1219 1179 static int crypto_rfc4543_decrypt(struct aead_request *req) 1220 1180 { 1221 - req = crypto_rfc4543_crypt(req, 0); 1181 + int err; 1182 + 1183 + if (req->src != req->dst) { 1184 + err = crypto_rfc4543_copy_src_to_dst(req, false); 1185 + if (err) 1186 + return err; 1187 + } 1188 + 1189 + req = crypto_rfc4543_crypt(req, false); 1222 1190 1223 1191 return crypto_aead_decrypt(req); 1224 1192 } ··· 1234 1186 static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm) 1235 1187 { 1236 1188 struct crypto_instance *inst = (void *)tfm->__crt_alg; 1237 - struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst); 1189 + struct crypto_rfc4543_instance_ctx *ictx = crypto_instance_ctx(inst); 1190 + struct crypto_aead_spawn *spawn = &ictx->aead; 1238 1191 struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); 1239 1192 struct crypto_aead *aead; 1193 + struct crypto_blkcipher *null; 1240 1194 unsigned long align; 1195 + int err = 0; 1241 1196 1242 1197 aead = crypto_spawn_aead(spawn); 1243 1198 if (IS_ERR(aead)) 1244 1199 return PTR_ERR(aead); 1245 1200 1201 + null = crypto_spawn_blkcipher(&ictx->null.base); 1202 + err = PTR_ERR(null); 1203 + if (IS_ERR(null)) 1204 + goto err_free_aead; 1205 + 1246 1206 ctx->child = aead; 1207 + ctx->null = null; 1247 1208 1248 1209 align = crypto_aead_alignmask(aead); 1249 1210 align &= ~(crypto_tfm_ctx_alignment() - 1); ··· 1262 1205 align + 16; 1263 1206 1264 1207 return 0; 1208 + 1209 + err_free_aead: 1210 + crypto_free_aead(aead); 1211 + return err; 1265 1212 } 1266 1213 1267 1214 static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm) ··· 1273 1212 struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); 1274 1213 1275 1214 crypto_free_aead(ctx->child); 1215 + crypto_free_blkcipher(ctx->null); 1276 1216 } 1277 1217 1278 1218 static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb) ··· 1282 1220 struct crypto_instance *inst; 1283 1221 struct crypto_aead_spawn *spawn; 1284 1222 struct crypto_alg *alg; 1223 + struct crypto_rfc4543_instance_ctx *ctx; 1285 1224 const char *ccm_name; 1286 1225 int err; 1287 1226 ··· 1297 1234 if (IS_ERR(ccm_name)) 1298 1235 return ERR_CAST(ccm_name); 1299 1236 1300 - inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); 1237 + inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); 1301 1238 if (!inst) 1302 1239 return ERR_PTR(-ENOMEM); 1303 1240 1304 - spawn = crypto_instance_ctx(inst); 1241 + ctx = 
crypto_instance_ctx(inst); 1242 + spawn = &ctx->aead; 1305 1243 crypto_set_aead_spawn(spawn, inst); 1306 1244 err = crypto_grab_aead(spawn, ccm_name, 0, 1307 1245 crypto_requires_sync(algt->type, algt->mask)); ··· 1311 1247 1312 1248 alg = crypto_aead_spawn_alg(spawn); 1313 1249 1250 + crypto_set_skcipher_spawn(&ctx->null, inst); 1251 + err = crypto_grab_skcipher(&ctx->null, "ecb(cipher_null)", 0, 1252 + CRYPTO_ALG_ASYNC); 1253 + if (err) 1254 + goto out_drop_alg; 1255 + 1256 + crypto_skcipher_spawn_alg(&ctx->null); 1257 + 1314 1258 err = -EINVAL; 1315 1259 1316 1260 /* We only support 16-byte blocks. */ 1317 1261 if (alg->cra_aead.ivsize != 16) 1318 - goto out_drop_alg; 1262 + goto out_drop_ecbnull; 1319 1263 1320 1264 /* Not a stream cipher? */ 1321 1265 if (alg->cra_blocksize != 1) 1322 - goto out_drop_alg; 1266 + goto out_drop_ecbnull; 1323 1267 1324 1268 err = -ENAMETOOLONG; 1325 1269 if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, ··· 1335 1263 snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, 1336 1264 "rfc4543(%s)", alg->cra_driver_name) >= 1337 1265 CRYPTO_MAX_ALG_NAME) 1338 - goto out_drop_alg; 1266 + goto out_drop_ecbnull; 1339 1267 1340 1268 inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD; 1341 1269 inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC; ··· 1362 1290 out: 1363 1291 return inst; 1364 1292 1293 + out_drop_ecbnull: 1294 + crypto_drop_skcipher(&ctx->null); 1365 1295 out_drop_alg: 1366 1296 crypto_drop_aead(spawn); 1367 1297 out_free_inst: ··· 1374 1300 1375 1301 static void crypto_rfc4543_free(struct crypto_instance *inst) 1376 1302 { 1377 - crypto_drop_spawn(crypto_instance_ctx(inst)); 1303 + struct crypto_rfc4543_instance_ctx *ctx = crypto_instance_ctx(inst); 1304 + 1305 + crypto_drop_aead(&ctx->aead); 1306 + crypto_drop_skcipher(&ctx->null); 1307 + 1378 1308 kfree(inst); 1379 1309 } 1380 1310
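crypto_rfc4543_done() exists because the inner GCM request may complete asynchronously: the tag copy has to happen in the completion callback, not after crypto_aead_encrypt() returns -EINPROGRESS. For contrast, a hedged sketch of the standard idiom a synchronous caller uses to wait on such a request (hypothetical helper names):

#include <linux/crypto.h>
#include <linux/completion.h>

struct wait_ctx {
        struct completion done;
        int err;
};

static void wait_cb(struct crypto_async_request *areq, int err)
{
        struct wait_ctx *w = areq->data;

        if (err == -EINPROGRESS)
                return;         /* backlog started, keep waiting */
        w->err = err;
        complete(&w->done);
}

static int wait_aead_encrypt(struct aead_request *req)
{
        struct wait_ctx w = { .err = 0 };
        int err;

        init_completion(&w.done);
        aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, wait_cb, &w);

        err = crypto_aead_encrypt(req);
        if (err == -EINPROGRESS || err == -EBUSY) {
                wait_for_completion(&w.done);
                err = w.err;
        }
        return err;
}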
+6 -5
crypto/sha256_generic.c
··· 246 246 return 0; 247 247 } 248 248 249 - static int sha256_update(struct shash_desc *desc, const u8 *data, 249 + int crypto_sha256_update(struct shash_desc *desc, const u8 *data, 250 250 unsigned int len) 251 251 { 252 252 struct sha256_state *sctx = shash_desc_ctx(desc); ··· 277 277 278 278 return 0; 279 279 } 280 + EXPORT_SYMBOL(crypto_sha256_update); 280 281 281 282 static int sha256_final(struct shash_desc *desc, u8 *out) 282 283 { ··· 294 293 /* Pad out to 56 mod 64. */ 295 294 index = sctx->count & 0x3f; 296 295 pad_len = (index < 56) ? (56 - index) : ((64+56) - index); 297 - sha256_update(desc, padding, pad_len); 296 + crypto_sha256_update(desc, padding, pad_len); 298 297 299 298 /* Append length (before padding) */ 300 - sha256_update(desc, (const u8 *)&bits, sizeof(bits)); 299 + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); 301 300 302 301 /* Store state in digest */ 303 302 for (i = 0; i < 8; i++) ··· 340 339 static struct shash_alg sha256_algs[2] = { { 341 340 .digestsize = SHA256_DIGEST_SIZE, 342 341 .init = sha256_init, 343 - .update = sha256_update, 342 + .update = crypto_sha256_update, 344 343 .final = sha256_final, 345 344 .export = sha256_export, 346 345 .import = sha256_import, ··· 356 355 }, { 357 356 .digestsize = SHA224_DIGEST_SIZE, 358 357 .init = sha224_init, 359 - .update = sha256_update, 358 + .update = crypto_sha256_update, 360 359 .final = sha224_final, 361 360 .descsize = sizeof(struct sha256_state), 362 361 .base = {
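crypto_sha256_update() is exported so the new SSSE3/AVX/AVX2 glue (and likewise crypto_sha512_update() below) can fall back to the generic C path when the FPU is unavailable, for instance in interrupt context. A hedged sketch of that shape, with sha256_xform_asm() as a hypothetical stand-in for the vectorized block function:

#include <crypto/internal/hash.h>
#include <crypto/sha.h>
#include <asm/i387.h>

static int sha256_accel_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        if (!irq_fpu_usable())
                return crypto_sha256_update(desc, data, len);

        kernel_fpu_begin();
        /* ... feed whole blocks to sha256_xform_asm(), buffer the tail
         * in the sha256_state ctx (elided) ... */
        kernel_fpu_end();
        return 0;
}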
+7 -6
crypto/sha512_generic.c
··· 163 163 return 0; 164 164 } 165 165 166 - static int 167 - sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) 166 + int crypto_sha512_update(struct shash_desc *desc, const u8 *data, 167 + unsigned int len) 168 168 { 169 169 struct sha512_state *sctx = shash_desc_ctx(desc); 170 170 ··· 197 197 198 198 return 0; 199 199 } 200 + EXPORT_SYMBOL(crypto_sha512_update); 200 201 201 202 static int 202 203 sha512_final(struct shash_desc *desc, u8 *hash) ··· 216 215 /* Pad out to 112 mod 128. */ 217 216 index = sctx->count[0] & 0x7f; 218 217 pad_len = (index < 112) ? (112 - index) : ((128+112) - index); 219 - sha512_update(desc, padding, pad_len); 218 + crypto_sha512_update(desc, padding, pad_len); 220 219 221 220 /* Append length (before padding) */ 222 - sha512_update(desc, (const u8 *)bits, sizeof(bits)); 221 + crypto_sha512_update(desc, (const u8 *)bits, sizeof(bits)); 223 222 224 223 /* Store state in digest */ 225 224 for (i = 0; i < 8; i++) ··· 246 245 static struct shash_alg sha512_algs[2] = { { 247 246 .digestsize = SHA512_DIGEST_SIZE, 248 247 .init = sha512_init, 249 - .update = sha512_update, 248 + .update = crypto_sha512_update, 250 249 .final = sha512_final, 251 250 .descsize = sizeof(struct sha512_state), 252 251 .base = { ··· 258 257 }, { 259 258 .digestsize = SHA384_DIGEST_SIZE, 260 259 .init = sha384_init, 261 - .update = sha512_update, 260 + .update = crypto_sha512_update, 262 261 .final = sha384_final, 263 262 .descsize = sizeof(struct sha512_state), 264 263 .base = {
+29 -1
crypto/tcrypt.c
··· 1095 1095 break; 1096 1096 1097 1097 case 28: 1098 - 1099 1098 ret += tcrypt_test("tgr160"); 1100 1099 break; 1101 1100 ··· 1117 1118 ret += tcrypt_test("lrw(camellia)"); 1118 1119 ret += tcrypt_test("xts(camellia)"); 1119 1120 break; 1121 + 1120 1122 case 33: 1121 1123 ret += tcrypt_test("sha224"); 1122 1124 break; ··· 1213 1213 case 109: 1214 1214 ret += tcrypt_test("vmac(aes)"); 1215 1215 break; 1216 + 1216 1217 case 110: 1217 1218 ret += tcrypt_test("hmac(crc32)"); 1218 1219 break; ··· 1224 1223 1225 1224 case 151: 1226 1225 ret += tcrypt_test("rfc4106(gcm(aes))"); 1226 + break; 1227 + 1228 + case 152: 1229 + ret += tcrypt_test("rfc4543(gcm(aes))"); 1230 + break; 1231 + 1232 + case 153: 1233 + ret += tcrypt_test("cmac(aes)"); 1234 + break; 1235 + 1236 + case 154: 1237 + ret += tcrypt_test("cmac(des3_ede)"); 1227 1238 break; 1228 1239 1229 1240 case 200: ··· 1766 1753 speed_template_32_64); 1767 1754 test_acipher_speed("xts(camellia)", DECRYPT, sec, NULL, 0, 1768 1755 speed_template_32_64); 1756 + break; 1757 + 1758 + case 509: 1759 + test_acipher_speed("ecb(blowfish)", ENCRYPT, sec, NULL, 0, 1760 + speed_template_8_32); 1761 + test_acipher_speed("ecb(blowfish)", DECRYPT, sec, NULL, 0, 1762 + speed_template_8_32); 1763 + test_acipher_speed("cbc(blowfish)", ENCRYPT, sec, NULL, 0, 1764 + speed_template_8_32); 1765 + test_acipher_speed("cbc(blowfish)", DECRYPT, sec, NULL, 0, 1766 + speed_template_8_32); 1767 + test_acipher_speed("ctr(blowfish)", ENCRYPT, sec, NULL, 0, 1768 + speed_template_8_32); 1769 + test_acipher_speed("ctr(blowfish)", DECRYPT, sec, NULL, 0, 1770 + speed_template_8_32); 1769 1771 break; 1770 1772 1771 1773 case 1000:
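The new case numbers are selected through tcrypt's mode parameter at load time; the module deliberately fails to load after running, so an error from modprobe is expected:

        modprobe tcrypt mode=153        # cmac(aes) test vectors
        modprobe tcrypt mode=509 sec=1  # blowfish async cipher speed tests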
+93 -2
crypto/testmgr.c
··· 1645 1645 .alg = "__cbc-serpent-avx", 1646 1646 .test = alg_test_null, 1647 1647 }, { 1648 + .alg = "__cbc-serpent-avx2", 1649 + .test = alg_test_null, 1650 + }, { 1648 1651 .alg = "__cbc-serpent-sse2", 1649 1652 .test = alg_test_null, 1650 1653 }, { 1651 1654 .alg = "__cbc-twofish-avx", 1652 1655 .test = alg_test_null, 1653 1656 }, { 1657 + .alg = "__cbc-twofish-avx2", 1658 + .test = alg_test_null, 1659 + }, { 1654 1660 .alg = "__driver-cbc-aes-aesni", 1655 1661 .test = alg_test_null, 1656 1662 .fips_allowed = 1, 1657 1663 }, { 1664 + .alg = "__driver-cbc-blowfish-avx2", 1665 + .test = alg_test_null, 1666 + }, { 1658 1667 .alg = "__driver-cbc-camellia-aesni", 1668 + .test = alg_test_null, 1669 + }, { 1670 + .alg = "__driver-cbc-camellia-aesni-avx2", 1659 1671 .test = alg_test_null, 1660 1672 }, { 1661 1673 .alg = "__driver-cbc-cast5-avx", ··· 1679 1667 .alg = "__driver-cbc-serpent-avx", 1680 1668 .test = alg_test_null, 1681 1669 }, { 1670 + .alg = "__driver-cbc-serpent-avx2", 1671 + .test = alg_test_null, 1672 + }, { 1682 1673 .alg = "__driver-cbc-serpent-sse2", 1683 1674 .test = alg_test_null, 1684 1675 }, { 1685 1676 .alg = "__driver-cbc-twofish-avx", 1686 1677 .test = alg_test_null, 1687 1678 }, { 1679 + .alg = "__driver-cbc-twofish-avx2", 1680 + .test = alg_test_null, 1681 + }, { 1688 1682 .alg = "__driver-ecb-aes-aesni", 1689 1683 .test = alg_test_null, 1690 1684 .fips_allowed = 1, 1691 1685 }, { 1686 + .alg = "__driver-ecb-blowfish-avx2", 1687 + .test = alg_test_null, 1688 + }, { 1692 1689 .alg = "__driver-ecb-camellia-aesni", 1690 + .test = alg_test_null, 1691 + }, { 1692 + .alg = "__driver-ecb-camellia-aesni-avx2", 1693 1693 .test = alg_test_null, 1694 1694 }, { 1695 1695 .alg = "__driver-ecb-cast5-avx", ··· 1713 1689 .alg = "__driver-ecb-serpent-avx", 1714 1690 .test = alg_test_null, 1715 1691 }, { 1692 + .alg = "__driver-ecb-serpent-avx2", 1693 + .test = alg_test_null, 1694 + }, { 1716 1695 .alg = "__driver-ecb-serpent-sse2", 1717 1696 .test = alg_test_null, 1718 1697 }, { 1719 1698 .alg = "__driver-ecb-twofish-avx", 1699 + .test = alg_test_null, 1700 + }, { 1701 + .alg = "__driver-ecb-twofish-avx2", 1720 1702 .test = alg_test_null, 1721 1703 }, { 1722 1704 .alg = "__ghash-pclmulqdqni", ··· 1943 1913 } 1944 1914 } 1945 1915 }, { 1916 + .alg = "cmac(aes)", 1917 + .test = alg_test_hash, 1918 + .suite = { 1919 + .hash = { 1920 + .vecs = aes_cmac128_tv_template, 1921 + .count = CMAC_AES_TEST_VECTORS 1922 + } 1923 + } 1924 + }, { 1925 + .alg = "cmac(des3_ede)", 1926 + .test = alg_test_hash, 1927 + .suite = { 1928 + .hash = { 1929 + .vecs = des3_ede_cmac64_tv_template, 1930 + .count = CMAC_DES3_EDE_TEST_VECTORS 1931 + } 1932 + } 1933 + }, { 1934 + .alg = "compress_null", 1935 + .test = alg_test_null, 1936 + }, { 1946 1937 .alg = "crc32c", 1947 1938 .test = alg_test_crc32c, 1948 1939 .fips_allowed = 1, ··· 1978 1927 .test = alg_test_null, 1979 1928 .fips_allowed = 1, 1980 1929 }, { 1930 + .alg = "cryptd(__driver-cbc-blowfish-avx2)", 1931 + .test = alg_test_null, 1932 + }, { 1981 1933 .alg = "cryptd(__driver-cbc-camellia-aesni)", 1934 + .test = alg_test_null, 1935 + }, { 1936 + .alg = "cryptd(__driver-cbc-camellia-aesni-avx2)", 1937 + .test = alg_test_null, 1938 + }, { 1939 + .alg = "cryptd(__driver-cbc-serpent-avx2)", 1982 1940 .test = alg_test_null, 1983 1941 }, { 1984 1942 .alg = "cryptd(__driver-ecb-aes-aesni)", 1985 1943 .test = alg_test_null, 1986 1944 .fips_allowed = 1, 1987 1945 }, { 1946 + .alg = "cryptd(__driver-ecb-blowfish-avx2)", 1947 + .test = alg_test_null, 1948 + }, 
{ 1988 1949 .alg = "cryptd(__driver-ecb-camellia-aesni)", 1950 + .test = alg_test_null, 1951 + }, { 1952 + .alg = "cryptd(__driver-ecb-camellia-aesni-avx2)", 1989 1953 .test = alg_test_null, 1990 1954 }, { 1991 1955 .alg = "cryptd(__driver-ecb-cast5-avx)", ··· 2012 1946 .alg = "cryptd(__driver-ecb-serpent-avx)", 2013 1947 .test = alg_test_null, 2014 1948 }, { 1949 + .alg = "cryptd(__driver-ecb-serpent-avx2)", 1950 + .test = alg_test_null, 1951 + }, { 2015 1952 .alg = "cryptd(__driver-ecb-serpent-sse2)", 2016 1953 .test = alg_test_null, 2017 1954 }, { 2018 1955 .alg = "cryptd(__driver-ecb-twofish-avx)", 1956 + .test = alg_test_null, 1957 + }, { 1958 + .alg = "cryptd(__driver-ecb-twofish-avx2)", 2019 1959 .test = alg_test_null, 2020 1960 }, { 2021 1961 .alg = "cryptd(__driver-gcm-aes-aesni)", ··· 2199 2127 } 2200 2128 } 2201 2129 }, { 2130 + .alg = "digest_null", 2131 + .test = alg_test_null, 2132 + }, { 2202 2133 .alg = "ecb(__aes-aesni)", 2203 2134 .test = alg_test_null, 2204 2135 .fips_allowed = 1, ··· 2311 2236 } 2312 2237 } 2313 2238 } 2239 + }, { 2240 + .alg = "ecb(cipher_null)", 2241 + .test = alg_test_null, 2314 2242 }, { 2315 2243 .alg = "ecb(des)", 2316 2244 .test = alg_test_skcipher, ··· 2774 2696 } 2775 2697 } 2776 2698 }, { 2777 - 2778 - 2779 2699 .alg = "rfc4309(ccm(aes))", 2780 2700 .test = alg_test_aead, 2781 2701 .fips_allowed = 1, ··· 2787 2711 .vecs = aes_ccm_rfc4309_dec_tv_template, 2788 2712 .count = AES_CCM_4309_DEC_TEST_VECTORS 2789 2713 } 2714 + } 2715 + } 2716 + }, { 2717 + .alg = "rfc4543(gcm(aes))", 2718 + .test = alg_test_aead, 2719 + .suite = { 2720 + .aead = { 2721 + .enc = { 2722 + .vecs = aes_gcm_rfc4543_enc_tv_template, 2723 + .count = AES_GCM_4543_ENC_TEST_VECTORS 2724 + }, 2725 + .dec = { 2726 + .vecs = aes_gcm_rfc4543_dec_tv_template, 2727 + .count = AES_GCM_4543_DEC_TEST_VECTORS 2728 + }, 2790 2729 } 2791 2730 } 2792 2731 }, {
+1276 -38
crypto/testmgr.h
··· 1639 1639 }, 1640 1640 }; 1641 1641 1642 + #define CMAC_AES_TEST_VECTORS 6 1643 + 1644 + static struct hash_testvec aes_cmac128_tv_template[] = { 1645 + { /* From NIST Special Publication 800-38B, AES-128 */ 1646 + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" 1647 + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", 1648 + .plaintext = zeroed_string, 1649 + .digest = "\xbb\x1d\x69\x29\xe9\x59\x37\x28" 1650 + "\x7f\xa3\x7d\x12\x9b\x75\x67\x46", 1651 + .psize = 0, 1652 + .ksize = 16, 1653 + }, { 1654 + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" 1655 + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", 1656 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1657 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a", 1658 + .digest = "\x07\x0a\x16\xb4\x6b\x4d\x41\x44" 1659 + "\xf7\x9b\xdd\x9d\xd0\x4a\x28\x7c", 1660 + .psize = 16, 1661 + .ksize = 16, 1662 + }, { 1663 + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" 1664 + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", 1665 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1666 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" 1667 + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" 1668 + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" 1669 + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11", 1670 + .digest = "\xdf\xa6\x67\x47\xde\x9a\xe6\x30" 1671 + "\x30\xca\x32\x61\x14\x97\xc8\x27", 1672 + .psize = 40, 1673 + .ksize = 16, 1674 + }, { 1675 + .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6" 1676 + "\xab\xf7\x15\x88\x09\xcf\x4f\x3c", 1677 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1678 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" 1679 + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" 1680 + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" 1681 + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" 1682 + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" 1683 + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" 1684 + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", 1685 + .digest = "\x51\xf0\xbe\xbf\x7e\x3b\x9d\x92" 1686 + "\xfc\x49\x74\x17\x79\x36\x3c\xfe", 1687 + .psize = 64, 1688 + .ksize = 16, 1689 + }, { /* From NIST Special Publication 800-38B, AES-256 */ 1690 + .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe" 1691 + "\x2b\x73\xae\xf0\x85\x7d\x77\x81" 1692 + "\x1f\x35\x2c\x07\x3b\x61\x08\xd7" 1693 + "\x2d\x98\x10\xa3\x09\x14\xdf\xf4", 1694 + .plaintext = zeroed_string, 1695 + .digest = "\x02\x89\x62\xf6\x1b\x7b\xf8\x9e" 1696 + "\xfc\x6b\x55\x1f\x46\x67\xd9\x83", 1697 + .psize = 0, 1698 + .ksize = 32, 1699 + }, { 1700 + .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe" 1701 + "\x2b\x73\xae\xf0\x85\x7d\x77\x81" 1702 + "\x1f\x35\x2c\x07\x3b\x61\x08\xd7" 1703 + "\x2d\x98\x10\xa3\x09\x14\xdf\xf4", 1704 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1705 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" 1706 + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" 1707 + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" 1708 + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11" 1709 + "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" 1710 + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17" 1711 + "\xad\x2b\x41\x7b\xe6\x6c\x37\x10", 1712 + .digest = "\xe1\x99\x21\x90\x54\x9f\x6e\xd5" 1713 + "\x69\x6a\x2c\x05\x6c\x31\x54\x10", 1714 + .psize = 64, 1715 + .ksize = 32, 1716 + } 1717 + }; 1718 + 1719 + #define CMAC_DES3_EDE_TEST_VECTORS 4 1720 + 1721 + static struct hash_testvec des3_ede_cmac64_tv_template[] = { 1722 + /* 1723 + * From NIST Special Publication 800-38B, Three Key TDEA 1724 + * Corrected test vectors from: 1725 + * http://csrc.nist.gov/publications/nistpubs/800-38B/Updated_CMAC_Examples.pdf 1726 + */ 1727 + { 1728 + .key = "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62" 1729 + "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58" 1730 + "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5", 1731 + .plaintext = zeroed_string, 1732 + .digest = "\xb7\xa6\x88\xe1\x22\xff\xaf\x95", 1733 + .psize = 0, 1734 + 
.ksize = 24, 1735 + }, { 1736 + .key = "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62" 1737 + "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58" 1738 + "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5", 1739 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96", 1740 + .digest = "\x8e\x8f\x29\x31\x36\x28\x37\x97", 1741 + .psize = 8, 1742 + .ksize = 24, 1743 + }, { 1744 + .key = "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62" 1745 + "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58" 1746 + "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5", 1747 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1748 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" 1749 + "\xae\x2d\x8a\x57", 1750 + .digest = "\x74\x3d\xdb\xe0\xce\x2d\xc2\xed", 1751 + .psize = 20, 1752 + .ksize = 24, 1753 + }, { 1754 + .key = "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62" 1755 + "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58" 1756 + "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5", 1757 + .plaintext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96" 1758 + "\xe9\x3d\x7e\x11\x73\x93\x17\x2a" 1759 + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c" 1760 + "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51", 1761 + .digest = "\x33\xe6\xb1\x09\x24\x00\xea\xe5", 1762 + .psize = 32, 1763 + .ksize = 24, 1764 + } 1765 + }; 1766 + 1642 1767 #define XCBC_AES_TEST_VECTORS 6 1643 1768 1644 1769 static struct hash_testvec aes_xcbc128_tv_template[] = { ··· 12805 12680 #define AES_GCM_DEC_TEST_VECTORS 8 12806 12681 #define AES_GCM_4106_ENC_TEST_VECTORS 7 12807 12682 #define AES_GCM_4106_DEC_TEST_VECTORS 7 12683 + #define AES_GCM_4543_ENC_TEST_VECTORS 1 12684 + #define AES_GCM_4543_DEC_TEST_VECTORS 2 12808 12685 #define AES_CCM_ENC_TEST_VECTORS 7 12809 12686 #define AES_CCM_DEC_TEST_VECTORS 7 12810 12687 #define AES_CCM_4309_ENC_TEST_VECTORS 7 ··· 18320 18193 } 18321 18194 }; 18322 18195 18196 + static struct aead_testvec aes_gcm_rfc4543_enc_tv_template[] = { 18197 + { /* From draft-mcgrew-gcm-test-01 */ 18198 + .key = "\x4c\x80\xcd\xef\xbb\x5d\x10\xda" 18199 + "\x90\x6a\xc7\x3c\x36\x13\xa6\x34" 18200 + "\x22\x43\x3c\x64", 18201 + .klen = 20, 18202 + .iv = zeroed_string, 18203 + .assoc = "\x00\x00\x43\x21\x00\x00\x00\x07", 18204 + .alen = 8, 18205 + .input = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18206 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18207 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18208 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18209 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18210 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18211 + "\x01\x02\x02\x01", 18212 + .ilen = 52, 18213 + .result = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18214 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18215 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18216 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18217 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18218 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18219 + "\x01\x02\x02\x01\xf2\xa9\xa8\x36" 18220 + "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18" 18221 + "\xe4\x09\x9a\xaa", 18222 + .rlen = 68, 18223 + } 18224 + }; 18225 + 18226 + static struct aead_testvec aes_gcm_rfc4543_dec_tv_template[] = { 18227 + { /* From draft-mcgrew-gcm-test-01 */ 18228 + .key = "\x4c\x80\xcd\xef\xbb\x5d\x10\xda" 18229 + "\x90\x6a\xc7\x3c\x36\x13\xa6\x34" 18230 + "\x22\x43\x3c\x64", 18231 + .klen = 20, 18232 + .iv = zeroed_string, 18233 + .assoc = "\x00\x00\x43\x21\x00\x00\x00\x07", 18234 + .alen = 8, 18235 + .input = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18236 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18237 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18238 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18239 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18240 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18241 + "\x01\x02\x02\x01\xf2\xa9\xa8\x36" 18242 + "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18" 18243 + "\xe4\x09\x9a\xaa", 18244 + .ilen = 68, 18245 + 
.result = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18246 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18247 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18248 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18249 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18250 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18251 + "\x01\x02\x02\x01", 18252 + .rlen = 52, 18253 + }, { /* nearly same as previous, but should fail */ 18254 + .key = "\x4c\x80\xcd\xef\xbb\x5d\x10\xda" 18255 + "\x90\x6a\xc7\x3c\x36\x13\xa6\x34" 18256 + "\x22\x43\x3c\x64", 18257 + .klen = 20, 18258 + .iv = zeroed_string, 18259 + .assoc = "\x00\x00\x43\x21\x00\x00\x00\x07", 18260 + .alen = 8, 18261 + .input = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18262 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18263 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18264 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18265 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18266 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18267 + "\x01\x02\x02\x01\xf2\xa9\xa8\x36" 18268 + "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18" 18269 + "\x00\x00\x00\x00", 18270 + .ilen = 68, 18271 + .novrfy = 1, 18272 + .result = "\x45\x00\x00\x30\xda\x3a\x00\x00" 18273 + "\x80\x01\xdf\x3b\xc0\xa8\x00\x05" 18274 + "\xc0\xa8\x00\x01\x08\x00\xc6\xcd" 18275 + "\x02\x00\x07\x00\x61\x62\x63\x64" 18276 + "\x65\x66\x67\x68\x69\x6a\x6b\x6c" 18277 + "\x6d\x6e\x6f\x70\x71\x72\x73\x74" 18278 + "\x01\x02\x02\x01", 18279 + .rlen = 52, 18280 + }, 18281 + }; 18282 + 18323 18283 static struct aead_testvec aes_ccm_enc_tv_template[] = { 18324 18284 { /* From RFC 3610 */ 18325 18285 .key = "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" ··· 20997 20783 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 20998 20784 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 20999 20785 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 21000 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 21001 - .ilen = 496, 20786 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 20787 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 20788 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 20789 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 20790 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 20791 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 20792 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 20793 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 20794 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 20795 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 20796 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 20797 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 20798 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 20799 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 20800 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 20801 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 20802 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 20803 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 20804 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 20805 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 20806 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 20807 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 20808 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 20809 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 20810 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 20811 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 20812 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 20813 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 20814 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 20815 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 20816 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 20817 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 20818 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 20819 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 20820 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 20821 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 20822 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 20823 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 20824 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 20825 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 20826 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 20827 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 
20828 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 20829 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 20830 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 20831 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 20832 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 20833 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 20834 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 20835 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 20836 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 20837 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 20838 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 20839 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 20840 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 20841 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 20842 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 20843 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 20844 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 20845 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 20846 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 20847 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 20848 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 20849 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 20850 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 20851 + .ilen = 1008, 21002 20852 .result = "\xED\xCD\xDB\xB8\x68\xCE\xBD\xEA" 21003 20853 "\x9D\x9D\xCD\x9F\x4F\xFC\x4D\xB7" 21004 20854 "\xA5\xFF\x6F\x43\x0F\xBA\x32\x04" ··· 21124 20846 "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3" 21125 20847 "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44" 21126 20848 "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4" 21127 - "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB", 21128 - .rlen = 496, 20849 + "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB" 20850 + "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD" 20851 + "\xF7\x90\x68\xCF\xC9\x11\x52\x3E" 20852 + "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A" 20853 + "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E" 20854 + "\xED\x28\x39\xE9\x63\xED\x41\x70" 20855 + "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB" 20856 + "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60" 20857 + "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B" 20858 + "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5" 20859 + "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E" 20860 + "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23" 20861 + "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9" 20862 + "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7" 20863 + "\xCA\xA8\xA2\x9E\x13\x87\x57\x92" 20864 + "\x64\x7C\x85\x0B\xB3\x29\x37\xD8" 20865 + "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF" 20866 + "\x2E\x45\x83\xB6\xD8\x54\x00\x89" 20867 + "\xF6\xBC\x3A\x7A\x88\x58\x51\xED" 20868 + "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42" 20869 + "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC" 20870 + "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1" 20871 + "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91" 20872 + "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E" 20873 + "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15" 20874 + "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86" 20875 + "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4" 20876 + "\x7F\x95\x10\xF7\xAB\x3F\x92\x23" 20877 + "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43" 20878 + "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2" 20879 + "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56" 20880 + "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4" 20881 + "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F" 20882 + "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4" 20883 + "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2" 20884 + "\x39\x25\x55\x20\x5A\xA6\x02\x9F" 20885 + "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B" 20886 + "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF" 20887 + "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E" 20888 + "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78" 20889 + "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA" 20890 + "\xED\x87\x98\xAC\xFE\x48\x97\x6D" 20891 + "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14" 20892 + "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C" 20893 + "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55" 20894 + "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55" 20895 + "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C" 20896 + "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC" 20897 + "\xF0\x33\x52\x29\x70\xF9\x5C\xE9" 20898 + "\xAC\x1F\xB5\x73\x56\x66\x54\xAF" 20899 + 
"\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3" 20900 + "\xAE\x47\xB6\x69\x86\xE9\x01\x31" 20901 + "\x83\x18\x3D\xF4\x74\x7B\xF9\x42" 20902 + "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6" 20903 + "\x2B\x20\x63\xDA\x49\x65\x5E\x8B" 20904 + "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34" 20905 + "\xD3\x52\xFC\x68\x00\x43\x1B\x37" 20906 + "\x31\x93\x51\x1C\x63\x97\x70\xB0" 20907 + "\x99\x78\x83\x13\xFD\xCF\x53\x81" 20908 + "\x36\x46\xB5\x42\x52\x2F\x32\xEB" 20909 + "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC" 20910 + "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A" 20911 + "\xAE\xEF\x3E\x82\x12\x0B\x74\x72" 20912 + "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55" 20913 + "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66", 20914 + .rlen = 1008, 21129 20915 .also_non_np = 1, 21130 20916 .np = 2, 21131 - .tap = { 496 - 16, 16 }, 20917 + .tap = { 1008 - 16, 16 }, 21132 20918 }, 21133 20919 }; 21134 20920 ··· 21297 20955 "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3" 21298 20956 "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44" 21299 20957 "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4" 21300 - "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB", 21301 - .ilen = 496, 20958 + "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB" 20959 + "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD" 20960 + "\xF7\x90\x68\xCF\xC9\x11\x52\x3E" 20961 + "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A" 20962 + "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E" 20963 + "\xED\x28\x39\xE9\x63\xED\x41\x70" 20964 + "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB" 20965 + "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60" 20966 + "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B" 20967 + "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5" 20968 + "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E" 20969 + "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23" 20970 + "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9" 20971 + "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7" 20972 + "\xCA\xA8\xA2\x9E\x13\x87\x57\x92" 20973 + "\x64\x7C\x85\x0B\xB3\x29\x37\xD8" 20974 + "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF" 20975 + "\x2E\x45\x83\xB6\xD8\x54\x00\x89" 20976 + "\xF6\xBC\x3A\x7A\x88\x58\x51\xED" 20977 + "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42" 20978 + "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC" 20979 + "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1" 20980 + "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91" 20981 + "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E" 20982 + "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15" 20983 + "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86" 20984 + "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4" 20985 + "\x7F\x95\x10\xF7\xAB\x3F\x92\x23" 20986 + "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43" 20987 + "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2" 20988 + "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56" 20989 + "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4" 20990 + "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F" 20991 + "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4" 20992 + "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2" 20993 + "\x39\x25\x55\x20\x5A\xA6\x02\x9F" 20994 + "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B" 20995 + "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF" 20996 + "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E" 20997 + "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78" 20998 + "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA" 20999 + "\xED\x87\x98\xAC\xFE\x48\x97\x6D" 21000 + "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14" 21001 + "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C" 21002 + "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55" 21003 + "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55" 21004 + "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C" 21005 + "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC" 21006 + "\xF0\x33\x52\x29\x70\xF9\x5C\xE9" 21007 + "\xAC\x1F\xB5\x73\x56\x66\x54\xAF" 21008 + "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3" 21009 + "\xAE\x47\xB6\x69\x86\xE9\x01\x31" 21010 + "\x83\x18\x3D\xF4\x74\x7B\xF9\x42" 21011 + "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6" 21012 + "\x2B\x20\x63\xDA\x49\x65\x5E\x8B" 21013 + "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34" 21014 + "\xD3\x52\xFC\x68\x00\x43\x1B\x37" 21015 + "\x31\x93\x51\x1C\x63\x97\x70\xB0" 21016 + 
"\x99\x78\x83\x13\xFD\xCF\x53\x81" 21017 + "\x36\x46\xB5\x42\x52\x2F\x32\xEB" 21018 + "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC" 21019 + "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A" 21020 + "\xAE\xEF\x3E\x82\x12\x0B\x74\x72" 21021 + "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55" 21022 + "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66", 21023 + .ilen = 1008, 21302 21024 .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31" 21303 21025 "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3" 21304 21026 "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15" ··· 21424 21018 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 21425 21019 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 21426 21020 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 21427 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 21428 - .rlen = 496, 21021 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 21022 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 21023 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 21024 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 21025 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 21026 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 21027 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 21028 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 21029 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 21030 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 21031 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 21032 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 21033 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 21034 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 21035 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 21036 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 21037 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 21038 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 21039 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 21040 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 21041 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 21042 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 21043 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 21044 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 21045 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 21046 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 21047 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 21048 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 21049 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 21050 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 21051 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 21052 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 21053 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 21054 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 21055 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 21056 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 21057 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 21058 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 21059 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 21060 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 21061 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 21062 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 21063 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 21064 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 21065 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 21066 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 21067 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 21068 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 21069 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 21070 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 21071 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 21072 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 21073 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 21074 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 21075 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 21076 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 21077 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 21078 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 21079 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 21080 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 21081 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 21082 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 21083 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 21084 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 21085 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 21086 + .rlen = 1008, 21429 21087 .also_non_np = 1, 21430 21088 .np = 2, 21431 - 
.tap = { 496 - 16, 16 }, 21089 + .tap = { 1008 - 16, 16 }, 21432 21090 }, 21433 21091 }; 21434 21092 ··· 21593 21123 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 21594 21124 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 21595 21125 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 21596 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 21597 - .ilen = 496, 21126 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 21127 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 21128 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 21129 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 21130 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 21131 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 21132 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 21133 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 21134 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 21135 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 21136 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 21137 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 21138 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 21139 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 21140 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 21141 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 21142 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 21143 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 21144 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 21145 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 21146 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 21147 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 21148 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 21149 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 21150 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 21151 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 21152 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 21153 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 21154 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 21155 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 21156 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 21157 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 21158 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 21159 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 21160 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 21161 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 21162 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 21163 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 21164 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 21165 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 21166 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 21167 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 21168 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 21169 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 21170 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 21171 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 21172 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 21173 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 21174 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 21175 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 21176 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 21177 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 21178 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 21179 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 21180 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 21181 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 21182 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 21183 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 21184 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 21185 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 21186 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 21187 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 21188 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 21189 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 21190 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 21191 + .ilen = 1008, 21598 21192 .result = "\xCD\x3E\x2A\x3B\x3E\x94\xC5\x77" 21599 21193 "\xBA\xBB\x5B\xB1\xDE\x7B\xA4\x40" 21600 21194 "\x88\x39\xE3\xFD\x94\x4B\x25\x58" ··· 21720 21186 "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA" 21721 21187 "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97" 21722 21188 "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36" 21723 - "\x5F\x74\x8C\x86\x5B\x71\xD0\x20", 21724 - .rlen = 496, 21189 + "\x5F\x74\x8C\x86\x5B\x71\xD0\x20" 21190 + 
"\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5" 21191 + "\x21\x41\x56\x72\x13\xE1\x86\x07" 21192 + "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18" 21193 + "\xFE\x94\x2D\x9F\xE0\x72\x18\x65" 21194 + "\xB2\xA5\x63\x48\xB4\x13\x22\xF7" 21195 + "\x25\xF1\x80\xA8\x7F\x54\x86\x7B" 21196 + "\x39\xAE\x95\x0C\x09\x32\x22\x2D" 21197 + "\x4D\x73\x39\x0C\x09\x2C\x7C\x10" 21198 + "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F" 21199 + "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14" 21200 + "\x65\xEE\x93\x54\xD0\x66\x15\x3C" 21201 + "\x4C\x68\xFD\x64\x0F\xF9\x10\x39" 21202 + "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2" 21203 + "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F" 21204 + "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97" 21205 + "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9" 21206 + "\xBE\x31\x31\x96\x8B\x40\x18\x75" 21207 + "\x11\x75\x14\x32\xA5\x2D\x1B\x6B" 21208 + "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4" 21209 + "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6" 21210 + "\x71\xB0\x12\x57\x89\x0D\xC0\x2B" 21211 + "\x9F\x12\x6A\x04\x67\xF1\x95\x31" 21212 + "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC" 21213 + "\x09\xB0\x43\x96\x4A\x64\x80\x40" 21214 + "\xB9\x72\x19\xDD\x70\x42\xFA\xB1" 21215 + "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C" 21216 + "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB" 21217 + "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6" 21218 + "\x33\x0B\xF3\xD5\x01\x38\x74\xA4" 21219 + "\x67\x1E\x75\x68\xC3\xAD\x76\xE9" 21220 + "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A" 21221 + "\x5F\xC9\x18\x94\x4B\x86\x66\xFC" 21222 + "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9" 21223 + "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8" 21224 + "\x43\xBC\x76\x75\xBF\x6C\x05\xB3" 21225 + "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61" 21226 + "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0" 21227 + "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12" 21228 + "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E" 21229 + "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3" 21230 + "\x7B\xDC\xCE\x15\x97\x27\xB2\x65" 21231 + "\xBC\x0E\x47\xAB\x55\x13\x53\xAB" 21232 + "\x0E\x34\x55\x02\x5F\x27\xC5\x89" 21233 + "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE" 21234 + "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C" 21235 + "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8" 21236 + "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2" 21237 + "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6" 21238 + "\x8D\x4D\x83\x9A\xED\x29\x4E\x14" 21239 + "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A" 21240 + "\x72\x22\xD5\x92\x38\xF1\xA1\x86" 21241 + "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3" 21242 + "\x80\x56\xC3\xD7\x65\xE1\xB3\x86" 21243 + "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06" 21244 + "\x01\xED\xF8\x29\x91\x19\x5C\x9A" 21245 + "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F" 21246 + "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72" 21247 + "\x80\xE7\x29\xDC\x23\x55\x98\x54" 21248 + "\xB1\xFF\x3E\x95\x56\xA8\x78\x78" 21249 + "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93" 21250 + "\x30\x6E\x7E\x51\xBB\x42\x5F\x03" 21251 + "\x43\x94\x23\x7E\xEE\xF0\xA5\x79" 21252 + "\x55\x01\xD4\x58\xB2\xF2\x85\x49" 21253 + "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C", 21254 + .rlen = 1008, 21725 21255 .also_non_np = 1, 21726 21256 .np = 2, 21727 - .tap = { 496 - 16, 16 }, 21257 + .tap = { 1008 - 16, 16 }, 21728 21258 }, 21729 21259 }; 21730 21260 ··· 21889 21291 "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA" 21890 21292 "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97" 21891 21293 "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36" 21892 - "\x5F\x74\x8C\x86\x5B\x71\xD0\x20", 21893 - .ilen = 496, 21294 + "\x5F\x74\x8C\x86\x5B\x71\xD0\x20" 21295 + "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5" 21296 + "\x21\x41\x56\x72\x13\xE1\x86\x07" 21297 + "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18" 21298 + "\xFE\x94\x2D\x9F\xE0\x72\x18\x65" 21299 + "\xB2\xA5\x63\x48\xB4\x13\x22\xF7" 21300 + "\x25\xF1\x80\xA8\x7F\x54\x86\x7B" 21301 + "\x39\xAE\x95\x0C\x09\x32\x22\x2D" 21302 + "\x4D\x73\x39\x0C\x09\x2C\x7C\x10" 21303 + 
"\xD0\x4B\x53\xF6\x90\xC5\x99\x2F" 21304 + "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14" 21305 + "\x65\xEE\x93\x54\xD0\x66\x15\x3C" 21306 + "\x4C\x68\xFD\x64\x0F\xF9\x10\x39" 21307 + "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2" 21308 + "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F" 21309 + "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97" 21310 + "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9" 21311 + "\xBE\x31\x31\x96\x8B\x40\x18\x75" 21312 + "\x11\x75\x14\x32\xA5\x2D\x1B\x6B" 21313 + "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4" 21314 + "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6" 21315 + "\x71\xB0\x12\x57\x89\x0D\xC0\x2B" 21316 + "\x9F\x12\x6A\x04\x67\xF1\x95\x31" 21317 + "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC" 21318 + "\x09\xB0\x43\x96\x4A\x64\x80\x40" 21319 + "\xB9\x72\x19\xDD\x70\x42\xFA\xB1" 21320 + "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C" 21321 + "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB" 21322 + "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6" 21323 + "\x33\x0B\xF3\xD5\x01\x38\x74\xA4" 21324 + "\x67\x1E\x75\x68\xC3\xAD\x76\xE9" 21325 + "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A" 21326 + "\x5F\xC9\x18\x94\x4B\x86\x66\xFC" 21327 + "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9" 21328 + "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8" 21329 + "\x43\xBC\x76\x75\xBF\x6C\x05\xB3" 21330 + "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61" 21331 + "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0" 21332 + "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12" 21333 + "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E" 21334 + "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3" 21335 + "\x7B\xDC\xCE\x15\x97\x27\xB2\x65" 21336 + "\xBC\x0E\x47\xAB\x55\x13\x53\xAB" 21337 + "\x0E\x34\x55\x02\x5F\x27\xC5\x89" 21338 + "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE" 21339 + "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C" 21340 + "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8" 21341 + "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2" 21342 + "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6" 21343 + "\x8D\x4D\x83\x9A\xED\x29\x4E\x14" 21344 + "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A" 21345 + "\x72\x22\xD5\x92\x38\xF1\xA1\x86" 21346 + "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3" 21347 + "\x80\x56\xC3\xD7\x65\xE1\xB3\x86" 21348 + "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06" 21349 + "\x01\xED\xF8\x29\x91\x19\x5C\x9A" 21350 + "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F" 21351 + "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72" 21352 + "\x80\xE7\x29\xDC\x23\x55\x98\x54" 21353 + "\xB1\xFF\x3E\x95\x56\xA8\x78\x78" 21354 + "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93" 21355 + "\x30\x6E\x7E\x51\xBB\x42\x5F\x03" 21356 + "\x43\x94\x23\x7E\xEE\xF0\xA5\x79" 21357 + "\x55\x01\xD4\x58\xB2\xF2\x85\x49" 21358 + "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C", 21359 + .ilen = 1008, 21894 21360 .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31" 21895 21361 "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3" 21896 21362 "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15" ··· 22016 21354 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 22017 21355 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 22018 21356 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 22019 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 22020 - .rlen = 496, 21357 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 21358 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 21359 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 21360 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 21361 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 21362 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 21363 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 21364 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 21365 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 21366 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 21367 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 21368 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 21369 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 21370 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 21371 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 21372 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 21373 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 21374 + 
"\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 21375 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 21376 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 21377 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 21378 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 21379 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 21380 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 21381 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 21382 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 21383 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 21384 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 21385 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 21386 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 21387 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 21388 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 21389 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 21390 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 21391 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 21392 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 21393 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 21394 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 21395 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 21396 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 21397 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 21398 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 21399 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 21400 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 21401 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 21402 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 21403 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 21404 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 21405 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 21406 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 21407 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 21408 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 21409 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 21410 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 21411 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 21412 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 21413 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 21414 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 21415 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 21416 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 21417 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 21418 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 21419 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 21420 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 21421 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 21422 + .rlen = 1008, 22021 21423 .also_non_np = 1, 22022 21424 .np = 2, 22023 - .tap = { 496 - 16, 16 }, 21425 + .tap = { 1008 - 16, 16 }, 22024 21426 }, 22025 21427 }; 22026 21428 ··· 22293 21567 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 22294 21568 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 22295 21569 "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 22296 - "\x2B\xC2\x59", 22297 - .ilen = 499, 21570 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 21571 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 21572 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 21573 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 21574 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 21575 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 21576 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 21577 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 21578 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 21579 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 21580 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 21581 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 21582 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 21583 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 21584 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 21585 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 21586 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 21587 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 21588 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 21589 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 21590 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 21591 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 21592 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 21593 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 21594 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 21595 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 21596 + 
"\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 21597 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 21598 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 21599 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 21600 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 21601 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 21602 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 21603 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 21604 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 21605 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 21606 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 21607 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 21608 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 21609 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 21610 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 21611 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 21612 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 21613 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 21614 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 21615 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 21616 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 21617 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 21618 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 21619 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 21620 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 21621 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 21622 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 21623 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 21624 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 21625 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 21626 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 21627 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 21628 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 21629 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 21630 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 21631 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 21632 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 21633 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D" 21634 + "\xE4\x7B\x12", 21635 + .ilen = 1011, 22298 21636 .result = "\xF3\x06\x3A\x84\xCD\xBA\x8E\x11" 22299 21637 "\xB7\x74\x6F\x5C\x97\xFB\x36\xFE" 22300 21638 "\xDE\x71\x58\xD4\x15\xD1\xC1\xA4" ··· 22421 21631 "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48" 22422 21632 "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0" 22423 21633 "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D" 22424 - "\x93\x11\x1C", 22425 - .rlen = 499, 21634 + "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90" 21635 + "\xE5\x41\x4A\xE2\x3C\x45\x29\x35" 21636 + "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32" 21637 + "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7" 21638 + "\x1D\x8C\x02\x72\x68\x09\xE9\xB6" 21639 + "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4" 21640 + "\x93\x74\xA4\xCE\x20\x23\xBF\x48" 21641 + "\xA5\xDE\x1B\xFA\x40\x69\x31\x98" 21642 + "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5" 21643 + "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA" 21644 + "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4" 21645 + "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F" 21646 + "\xBA\x83\x34\x3C\x42\xCC\x7D\x28" 21647 + "\x29\x63\x4F\xD8\x02\x17\xC5\x07" 21648 + "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09" 21649 + "\x81\x45\x18\xED\xE4\xCB\x42\x3B" 21650 + "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD" 21651 + "\x92\xE5\x02\x97\x96\xCE\xAD\xEC" 21652 + "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC" 21653 + "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E" 21654 + "\xCC\xF8\x5F\xDD\x65\x50\x99\x69" 21655 + "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F" 21656 + "\x05\x93\xB2\xE6\x59\xC8\x28\xFE" 21657 + "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F" 21658 + "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09" 21659 + "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C" 21660 + "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74" 21661 + "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8" 21662 + "\x81\xF5\xD5\x8A\x04\x23\x51\xAC" 21663 + "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B" 21664 + "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA" 21665 + "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB" 21666 + "\x41\x1A\x67\xA6\x40\x82\x70\x8C" 21667 + "\x19\x79\x08\xA4\x51\x20\x7D\xC9" 21668 + "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D" 21669 + 
"\xF4\x55\xD4\xE7\xE6\xD4\x28\x08" 21670 + "\x00\x70\x12\x56\x56\x50\xAD\x14" 21671 + "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48" 21672 + "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E" 21673 + "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D" 21674 + "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85" 21675 + "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64" 21676 + "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F" 21677 + "\x91\x76\xE8\x15\x66\x34\x9F\xF6" 21678 + "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1" 21679 + "\x38\x46\x47\xC7\x9B\xCA\xE9\x42" 21680 + "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3" 21681 + "\x16\xA0\xED\x42\xBE\xB5\x06\x19" 21682 + "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E" 21683 + "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31" 21684 + "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC" 21685 + "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8" 21686 + "\x94\x5C\x5E\x37\x18\xAD\xED\x8B" 21687 + "\x44\x86\xCA\x5E\x07\xB7\x70\x8D" 21688 + "\x40\x48\x19\x73\x7C\x78\x64\x0B" 21689 + "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1" 21690 + "\x6B\x2C\x84\x10\x45\x42\x2E\xC3" 21691 + "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46" 21692 + "\x74\x28\x9D\x05\x30\x20\x62\x41" 21693 + "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26" 21694 + "\xDF\x58\x51\xED\xFA\xDC\x87\x79" 21695 + "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA" 21696 + "\x45\xE3\x35\x0D\x69\x91\x54\x1C" 21697 + "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C" 21698 + "\xF1\x6B\xD9", 21699 + .rlen = 1011, 22426 21700 .also_non_np = 1, 22427 21701 .np = 2, 22428 - .tap = { 499 - 16, 16 }, 21702 + .tap = { 1011 - 16, 16 }, 22429 21703 }, { /* Generated with Crypto++ */ 22430 21704 .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9" 22431 21705 "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A" ··· 22559 21705 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 22560 21706 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 22561 21707 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 22562 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 22563 - .ilen = 496, 21708 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 21709 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 21710 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 21711 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 21712 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 21713 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 21714 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 21715 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 21716 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 21717 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 21718 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 21719 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 21720 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 21721 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 21722 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 21723 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 21724 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 21725 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 21726 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 21727 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 21728 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 21729 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 21730 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 21731 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 21732 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 21733 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 21734 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 21735 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 21736 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 21737 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 21738 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 21739 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 21740 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 21741 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 21742 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 21743 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 21744 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 21745 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 21746 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 21747 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 21748 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 21749 + 
"\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 21750 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 21751 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 21752 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 21753 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 21754 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 21755 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 21756 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 21757 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 21758 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 21759 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 21760 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 21761 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 21762 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 21763 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 21764 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 21765 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 21766 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 21767 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 21768 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 21769 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 21770 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 21771 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 21772 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 21773 + .ilen = 1008, 22564 21774 .result = "\x85\x79\x6C\x8B\x2B\x6D\x14\xF9" 22565 21775 "\xA6\x83\xB6\x80\x5B\x3A\xF3\x7E" 22566 21776 "\x30\x29\xEB\x1F\xDC\x19\x5F\xEB" ··· 22686 21768 "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22" 22687 21769 "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E" 22688 21770 "\x12\xA8\x01\x64\x16\x0B\x26\x5A" 22689 - "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C", 22690 - .rlen = 496, 21771 + "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C" 21772 + "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6" 21773 + "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3" 21774 + "\x3F\x23\x69\x63\x56\x96\x45\xD6" 21775 + "\x74\x23\x1D\x5C\x63\xCC\xD8\x78" 21776 + "\x16\xE2\x9C\xD2\x80\x02\xF2\x28" 21777 + "\x69\x2F\xC4\xA8\x15\x15\x24\x3B" 21778 + "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1" 21779 + "\x03\x58\x1B\x33\x77\x74\x1F\xB4" 21780 + "\x07\x86\xF2\x21\xB7\x41\xAE\xBF" 21781 + "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4" 21782 + "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D" 21783 + "\xF8\x04\xBB\x6D\x62\x33\x87\x26" 21784 + "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09" 21785 + "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E" 21786 + "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A" 21787 + "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF" 21788 + "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C" 21789 + "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11" 21790 + "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD" 21791 + "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB" 21792 + "\xC3\x14\x84\x72\xFD\x41\xDC\xBD" 21793 + "\x75\xBE\xA8\xE5\x16\x48\x64\x39" 21794 + "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D" 21795 + "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D" 21796 + "\x0B\x29\x10\x15\x0E\x13\x3B\xAC" 21797 + "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02" 21798 + "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21" 21799 + "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0" 21800 + "\xF8\x09\xAF\xE4\x31\x26\x92\x2F" 21801 + "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34" 21802 + "\xBE\x50\xB1\x02\x22\x96\xE3\x40" 21803 + "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0" 21804 + "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B" 21805 + "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36" 21806 + "\xD2\x0A\x32\x06\x97\x34\xB9\xF4" 21807 + "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A" 21808 + "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D" 21809 + "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E" 21810 + "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7" 21811 + "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E" 21812 + "\x2C\x07\xD2\x43\x5F\x57\x93\xCA" 21813 + "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1" 21814 + "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E" 21815 + "\x68\xC6\x47\xB9\x76\xCC\x65\x5F" 21816 + "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37" 21817 + "\x5F\x41\xED\x64\x6C\xAD\x5F\xED" 21818 + "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F" 21819 + "\xC2\xC7\xED\x18\x43\xE1\x20\x86" 21820 + 
"\x5D\xBC\x30\x70\x22\xA1\xDC\x53" 21821 + "\x10\x3A\x8D\x47\x82\xCD\x7F\x59" 21822 + "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07" 21823 + "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5" 21824 + "\xED\x47\x83\xBC\x5F\x62\x84\xDA" 21825 + "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8" 21826 + "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A" 21827 + "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6" 21828 + "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE" 21829 + "\x37\x9E\x29\xF9\x2A\x18\x73\x0D" 21830 + "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA" 21831 + "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56" 21832 + "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C" 21833 + "\xC5\x9B\x03\x70\x29\x2A\x49\x09" 21834 + "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71" 21835 + "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36", 21836 + .rlen = 1008, 22691 21837 }, 22692 21838 }; 22693 21839 ··· 22960 21978 "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48" 22961 21979 "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0" 22962 21980 "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D" 22963 - "\x93\x11\x1C", 22964 - .ilen = 499, 21981 + "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90" 21982 + "\xE5\x41\x4A\xE2\x3C\x45\x29\x35" 21983 + "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32" 21984 + "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7" 21985 + "\x1D\x8C\x02\x72\x68\x09\xE9\xB6" 21986 + "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4" 21987 + "\x93\x74\xA4\xCE\x20\x23\xBF\x48" 21988 + "\xA5\xDE\x1B\xFA\x40\x69\x31\x98" 21989 + "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5" 21990 + "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA" 21991 + "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4" 21992 + "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F" 21993 + "\xBA\x83\x34\x3C\x42\xCC\x7D\x28" 21994 + "\x29\x63\x4F\xD8\x02\x17\xC5\x07" 21995 + "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09" 21996 + "\x81\x45\x18\xED\xE4\xCB\x42\x3B" 21997 + "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD" 21998 + "\x92\xE5\x02\x97\x96\xCE\xAD\xEC" 21999 + "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC" 22000 + "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E" 22001 + "\xCC\xF8\x5F\xDD\x65\x50\x99\x69" 22002 + "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F" 22003 + "\x05\x93\xB2\xE6\x59\xC8\x28\xFE" 22004 + "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F" 22005 + "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09" 22006 + "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C" 22007 + "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74" 22008 + "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8" 22009 + "\x81\xF5\xD5\x8A\x04\x23\x51\xAC" 22010 + "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B" 22011 + "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA" 22012 + "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB" 22013 + "\x41\x1A\x67\xA6\x40\x82\x70\x8C" 22014 + "\x19\x79\x08\xA4\x51\x20\x7D\xC9" 22015 + "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D" 22016 + "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08" 22017 + "\x00\x70\x12\x56\x56\x50\xAD\x14" 22018 + "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48" 22019 + "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E" 22020 + "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D" 22021 + "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85" 22022 + "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64" 22023 + "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F" 22024 + "\x91\x76\xE8\x15\x66\x34\x9F\xF6" 22025 + "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1" 22026 + "\x38\x46\x47\xC7\x9B\xCA\xE9\x42" 22027 + "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3" 22028 + "\x16\xA0\xED\x42\xBE\xB5\x06\x19" 22029 + "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E" 22030 + "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31" 22031 + "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC" 22032 + "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8" 22033 + "\x94\x5C\x5E\x37\x18\xAD\xED\x8B" 22034 + "\x44\x86\xCA\x5E\x07\xB7\x70\x8D" 22035 + "\x40\x48\x19\x73\x7C\x78\x64\x0B" 22036 + "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1" 22037 + "\x6B\x2C\x84\x10\x45\x42\x2E\xC3" 22038 + "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46" 22039 + "\x74\x28\x9D\x05\x30\x20\x62\x41" 22040 + "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26" 22041 + 
"\xDF\x58\x51\xED\xFA\xDC\x87\x79" 22042 + "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA" 22043 + "\x45\xE3\x35\x0D\x69\x91\x54\x1C" 22044 + "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C" 22045 + "\xF1\x6B\xD9", 22046 + .ilen = 1011, 22965 22047 .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31" 22966 22048 "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3" 22967 22049 "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15" ··· 23088 22042 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 23089 22043 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 23090 22044 "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 23091 - "\x2B\xC2\x59", 23092 - .rlen = 499, 22045 + "\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 22046 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 22047 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 22048 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 22049 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 22050 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 22051 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 22052 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 22053 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 22054 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 22055 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 22056 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 22057 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 22058 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 22059 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 22060 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 22061 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 22062 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 22063 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 22064 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 22065 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 22066 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 22067 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 22068 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 22069 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 22070 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 22071 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 22072 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 22073 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 22074 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 22075 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 22076 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 22077 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 22078 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 22079 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 22080 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 22081 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 22082 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 22083 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 22084 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 22085 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 22086 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 22087 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 22088 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 22089 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 22090 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 22091 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 22092 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 22093 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 22094 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 22095 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 22096 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 22097 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 22098 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 22099 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 22100 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 22101 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 22102 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 22103 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 22104 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 22105 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 22106 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 22107 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 22108 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D" 22109 + "\xE4\x7B\x12", 22110 + .rlen = 1011, 23093 22111 .also_non_np = 1, 23094 22112 .np = 2, 23095 - .tap = { 499 - 16, 16 }, 22113 + .tap = { 1011 - 16, 16 }, 23096 22114 }, { /* Generated with Crypto++ */ 23097 22115 .key = 
"\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9" 23098 22116 "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A" ··· 23226 22116 "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22" 23227 22117 "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E" 23228 22118 "\x12\xA8\x01\x64\x16\x0B\x26\x5A" 23229 - "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C", 23230 - .ilen = 496, 22119 + "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C" 22120 + "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6" 22121 + "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3" 22122 + "\x3F\x23\x69\x63\x56\x96\x45\xD6" 22123 + "\x74\x23\x1D\x5C\x63\xCC\xD8\x78" 22124 + "\x16\xE2\x9C\xD2\x80\x02\xF2\x28" 22125 + "\x69\x2F\xC4\xA8\x15\x15\x24\x3B" 22126 + "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1" 22127 + "\x03\x58\x1B\x33\x77\x74\x1F\xB4" 22128 + "\x07\x86\xF2\x21\xB7\x41\xAE\xBF" 22129 + "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4" 22130 + "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D" 22131 + "\xF8\x04\xBB\x6D\x62\x33\x87\x26" 22132 + "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09" 22133 + "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E" 22134 + "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A" 22135 + "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF" 22136 + "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C" 22137 + "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11" 22138 + "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD" 22139 + "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB" 22140 + "\xC3\x14\x84\x72\xFD\x41\xDC\xBD" 22141 + "\x75\xBE\xA8\xE5\x16\x48\x64\x39" 22142 + "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D" 22143 + "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D" 22144 + "\x0B\x29\x10\x15\x0E\x13\x3B\xAC" 22145 + "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02" 22146 + "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21" 22147 + "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0" 22148 + "\xF8\x09\xAF\xE4\x31\x26\x92\x2F" 22149 + "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34" 22150 + "\xBE\x50\xB1\x02\x22\x96\xE3\x40" 22151 + "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0" 22152 + "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B" 22153 + "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36" 22154 + "\xD2\x0A\x32\x06\x97\x34\xB9\xF4" 22155 + "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A" 22156 + "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D" 22157 + "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E" 22158 + "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7" 22159 + "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E" 22160 + "\x2C\x07\xD2\x43\x5F\x57\x93\xCA" 22161 + "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1" 22162 + "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E" 22163 + "\x68\xC6\x47\xB9\x76\xCC\x65\x5F" 22164 + "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37" 22165 + "\x5F\x41\xED\x64\x6C\xAD\x5F\xED" 22166 + "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F" 22167 + "\xC2\xC7\xED\x18\x43\xE1\x20\x86" 22168 + "\x5D\xBC\x30\x70\x22\xA1\xDC\x53" 22169 + "\x10\x3A\x8D\x47\x82\xCD\x7F\x59" 22170 + "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07" 22171 + "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5" 22172 + "\xED\x47\x83\xBC\x5F\x62\x84\xDA" 22173 + "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8" 22174 + "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A" 22175 + "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6" 22176 + "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE" 22177 + "\x37\x9E\x29\xF9\x2A\x18\x73\x0D" 22178 + "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA" 22179 + "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56" 22180 + "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C" 22181 + "\xC5\x9B\x03\x70\x29\x2A\x49\x09" 22182 + "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71" 22183 + "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36", 22184 + .ilen = 1008, 23231 22185 .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31" 23232 22186 "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3" 23233 22187 "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15" ··· 23353 22179 "\x86\x1D\xB4\x28\xBF\x56\xED\x61" 23354 22180 "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3" 23355 22181 "\x6A\x01\x75\x0C\xA3\x17\xAE\x45" 23356 - "\xDC\x50\xE7\x7E\x15\x89\x20\xB7", 23357 - .rlen = 496, 22182 + "\xDC\x50\xE7\x7E\x15\x89\x20\xB7" 22183 + 
"\x2B\xC2\x59\xF0\x64\xFB\x92\x06" 22184 + "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78" 22185 + "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA" 22186 + "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C" 22187 + "\xF3\x67\xFE\x95\x09\xA0\x37\xCE" 22188 + "\x42\xD9\x70\x07\x7B\x12\xA9\x1D" 22189 + "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F" 22190 + "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01" 22191 + "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73" 22192 + "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5" 22193 + "\x59\xF0\x87\x1E\x92\x29\xC0\x34" 22194 + "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6" 22195 + "\x3D\xD4\x48\xDF\x76\x0D\x81\x18" 22196 + "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A" 22197 + "\x21\x95\x2C\xC3\x37\xCE\x65\xFC" 22198 + "\x70\x07\x9E\x12\xA9\x40\xD7\x4B" 22199 + "\xE2\x79\x10\x84\x1B\xB2\x26\xBD" 22200 + "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F" 22201 + "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1" 22202 + "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13" 22203 + "\x87\x1E\xB5\x29\xC0\x57\xEE\x62" 22204 + "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4" 22205 + "\x6B\x02\x76\x0D\xA4\x18\xAF\x46" 22206 + "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8" 22207 + "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07" 22208 + "\x9E\x35\xCC\x40\xD7\x6E\x05\x79" 22209 + "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB" 22210 + "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D" 22211 + "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF" 22212 + "\x43\xDA\x71\x08\x7C\x13\xAA\x1E" 22213 + "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90" 22214 + "\x27\xBE\x32\xC9\x60\xF7\x6B\x02" 22215 + "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74" 22216 + "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6" 22217 + "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35" 22218 + "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7" 22219 + "\x3E\xD5\x49\xE0\x77\x0E\x82\x19" 22220 + "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B" 22221 + "\x22\x96\x2D\xC4\x38\xCF\x66\xFD" 22222 + "\x71\x08\x9F\x13\xAA\x41\xD8\x4C" 22223 + "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE" 22224 + "\x55\xEC\x60\xF7\x8E\x02\x99\x30" 22225 + "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2" 22226 + "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14" 22227 + "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63" 22228 + "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5" 22229 + "\x6C\x03\x77\x0E\xA5\x19\xB0\x47" 22230 + "\xDE\x52\xE9\x80\x17\x8B\x22\xB9" 22231 + "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08" 22232 + "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A" 22233 + "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC" 22234 + "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E" 22235 + "\xF5\x69\x00\x97\x0B\xA2\x39\xD0" 22236 + "\x44\xDB\x72\x09\x7D\x14\xAB\x1F" 22237 + "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91" 22238 + "\x28\xBF\x33\xCA\x61\xF8\x6C\x03" 22239 + "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75" 22240 + "\x0C\x80\x17\xAE\x22\xB9\x50\xE7" 22241 + "\x5B\xF2\x89\x20\x94\x2B\xC2\x36" 22242 + "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8" 22243 + "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A" 22244 + "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C" 22245 + "\x00\x97\x2E\xC5\x39\xD0\x67\xFE" 22246 + "\x72\x09\xA0\x14\xAB\x42\xD9\x4D", 22247 + .rlen = 1008, 23358 22248 }, 23359 22249 }; 23360 22250
+12
drivers/char/hw_random/Kconfig
··· 86 86 87 87 If unsure, say Y. 88 88 89 + config HW_RANDOM_BCM2835 90 + tristate "Broadcom BCM2835 Random Number Generator support" 91 + depends on HW_RANDOM && ARCH_BCM2835 92 + default HW_RANDOM 93 + ---help--- 94 + This driver provides kernel-side support for the Random Number 95 + Generator hardware found on the Broadcom BCM2835 SoCs. 96 + 97 + To compile this driver as a module, choose M here: the 98 + module will be called bcm2835-rng. 99 + 100 + If unsure, say Y. 89 101 90 102 config HW_RANDOM_GEODE 91 103 tristate "AMD Geode HW Random Number Generator support"
+1
drivers/char/hw_random/Makefile
··· 26 26 obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o 27 27 obj-$(CONFIG_HW_RANDOM_EXYNOS) += exynos-rng.o 28 28 obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o 29 + obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o
+113
drivers/char/hw_random/bcm2835-rng.c
··· 1 + /* 2 + * Copyright (c) 2010-2012 Broadcom. All rights reserved. 3 + * Copyright (c) 2013 Lubomir Rintel 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public License ("GPL") 7 + * version 2, as published by the Free Software Foundation. 8 + */ 9 + 10 + #include <linux/hw_random.h> 11 + #include <linux/init.h> 12 + #include <linux/io.h> 13 + #include <linux/kernel.h> 14 + #include <linux/module.h> 15 + #include <linux/of_address.h> 16 + #include <linux/of_platform.h> 17 + #include <linux/platform_device.h> 18 + #include <linux/printk.h> 19 + 20 + #define RNG_CTRL 0x0 21 + #define RNG_STATUS 0x4 22 + #define RNG_DATA 0x8 23 + 24 + /* enable rng */ 25 + #define RNG_RBGEN 0x1 26 + 27 + /* the initial numbers generated are "less random" so will be discarded */ 28 + #define RNG_WARMUP_COUNT 0x40000 29 + 30 + static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max, 31 + bool wait) 32 + { 33 + void __iomem *rng_base = (void __iomem *)rng->priv; 34 + 35 + while ((__raw_readl(rng_base + RNG_STATUS) >> 24) == 0) { 36 + if (!wait) 37 + return 0; 38 + cpu_relax(); 39 + } 40 + 41 + *(u32 *)buf = __raw_readl(rng_base + RNG_DATA); 42 + return sizeof(u32); 43 + } 44 + 45 + static struct hwrng bcm2835_rng_ops = { 46 + .name = "bcm2835", 47 + .read = bcm2835_rng_read, 48 + }; 49 + 50 + static int bcm2835_rng_probe(struct platform_device *pdev) 51 + { 52 + struct device *dev = &pdev->dev; 53 + struct device_node *np = dev->of_node; 54 + void __iomem *rng_base; 55 + int err; 56 + 57 + /* map peripheral */ 58 + rng_base = of_iomap(np, 0); 59 + if (!rng_base) { 60 + dev_err(dev, "failed to remap rng regs\n"); 61 + return -ENODEV; 62 + } 63 + bcm2835_rng_ops.priv = (unsigned long)rng_base; 64 + 65 + /* register driver */ 66 + err = hwrng_register(&bcm2835_rng_ops); 67 + if (err) { 68 + dev_err(dev, "hwrng registration failed\n"); 69 + iounmap(rng_base); 70 + } else { 71 + dev_info(dev, "hwrng registered\n"); 72 + 73 + /* set warm-up count & enable */ 74 + __raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS); 75 + __raw_writel(RNG_RBGEN, rng_base + RNG_CTRL); 76 + } 77 + return err; 78 + } 79 + 80 + static int bcm2835_rng_remove(struct platform_device *pdev) 81 + { 82 + void __iomem *rng_base = (void __iomem *)bcm2835_rng_ops.priv; 83 + 84 + /* disable rng hardware */ 85 + __raw_writel(0, rng_base + RNG_CTRL); 86 + 87 + /* unregister driver */ 88 + hwrng_unregister(&bcm2835_rng_ops); 89 + iounmap(rng_base); 90 + 91 + return 0; 92 + } 93 + 94 + static const struct of_device_id bcm2835_rng_of_match[] = { 95 + { .compatible = "brcm,bcm2835-rng", }, 96 + {}, 97 + }; 98 + MODULE_DEVICE_TABLE(of, bcm2835_rng_of_match); 99 + 100 + static struct platform_driver bcm2835_rng_driver = { 101 + .driver = { 102 + .name = "bcm2835-rng", 103 + .owner = THIS_MODULE, 104 + .of_match_table = bcm2835_rng_of_match, 105 + }, 106 + .probe = bcm2835_rng_probe, 107 + .remove = bcm2835_rng_remove, 108 + }; 109 + module_platform_driver(bcm2835_rng_driver); 110 + 111 + MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>"); 112 + MODULE_DESCRIPTION("BCM2835 Random Number Generator (RNG) driver"); 113 + MODULE_LICENSE("GPL v2");
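Once hwrng_register() succeeds, the hw_random core exposes the current device to user space as /dev/hwrng. A minimal stand-alone reader (hypothetical example, not part of this patch) that pulls one 32-bit word:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* hypothetical stand-alone example, not part of the patch */
int main(void)
{
	uint32_t word;
	int fd = open("/dev/hwrng", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/hwrng");
		return 1;
	}
	/* the core satisfies this from the current rng, bcm2835 if selected */
	if (read(fd, &word, sizeof(word)) != sizeof(word)) {
		perror("read");
		close(fd);
		return 1;
	}
	printf("0x%08x\n", word);
	close(fd);
	return 0;
}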
+2 -1
drivers/char/hw_random/exynos-rng.c
··· 144 144 return 0; 145 145 } 146 146 147 + #if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME) 147 148 static int exynos_rng_runtime_suspend(struct device *dev) 148 149 { 149 150 struct platform_device *pdev = to_platform_device(dev); ··· 162 161 163 162 return clk_prepare_enable(exynos_rng->clk); 164 163 } 165 - 164 + #endif 166 165 167 166 static UNIVERSAL_DEV_PM_OPS(exynos_rng_pm_ops, exynos_rng_runtime_suspend, 168 167 exynos_rng_runtime_resume, NULL);
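The new guard is purely a build fix: UNIVERSAL_DEV_PM_OPS() compiles its callback references away when both CONFIG_PM_SLEEP and CONFIG_PM_RUNTIME are off, leaving the two functions defined but unreferenced and triggering a compiler warning. A condensed sketch of the pattern with hypothetical names ("foo" is not a real driver):

#include <linux/device.h>
#include <linux/pm.h>

#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
static int foo_runtime_suspend(struct device *dev)
{
	return 0;	/* quiesce the (imaginary) hardware */
}

static int foo_runtime_resume(struct device *dev)
{
	return 0;	/* and power it back up */
}
#endif

/* Expands to empty callback slots when both PM options are off,
 * which is exactly when the functions above must not exist either. */
static UNIVERSAL_DEV_PM_OPS(foo_pm_ops, foo_runtime_suspend,
			    foo_runtime_resume, NULL);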
+4 -17
drivers/char/hw_random/mxc-rnga.c
··· 142 142 static int __init mxc_rnga_probe(struct platform_device *pdev) 143 143 { 144 144 int err = -ENODEV; 145 - struct resource *res, *mem; 145 + struct resource *res; 146 146 struct mxc_rng *mxc_rng; 147 147 148 148 mxc_rng = devm_kzalloc(&pdev->dev, sizeof(struct mxc_rng), ··· 172 172 goto err_region; 173 173 } 174 174 175 - mem = request_mem_region(res->start, resource_size(res), pdev->name); 176 - if (mem == NULL) { 177 - err = -EBUSY; 178 - goto err_region; 179 - } 180 - 181 - mxc_rng->mem = ioremap(res->start, resource_size(res)); 182 - if (!mxc_rng->mem) { 183 - err = -ENOMEM; 175 + mxc_rng->mem = devm_ioremap_resource(&pdev->dev, res); 176 + if (IS_ERR(mxc_rng->mem)) { 177 + err = PTR_ERR(mxc_rng->mem); 184 178 goto err_ioremap; 185 179 } 186 180 ··· 189 195 return 0; 190 196 191 197 err_ioremap: 192 - release_mem_region(res->start, resource_size(res)); 193 - 194 198 err_region: 195 199 clk_disable_unprepare(mxc_rng->clk); 196 200 ··· 198 206 199 207 static int __exit mxc_rnga_remove(struct platform_device *pdev) 200 208 { 201 - struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 202 209 struct mxc_rng *mxc_rng = platform_get_drvdata(pdev); 203 210 204 211 hwrng_unregister(&mxc_rng->rng); 205 - 206 - iounmap(mxc_rng->mem); 207 - 208 - release_mem_region(res->start, resource_size(res)); 209 212 210 213 clk_disable_unprepare(mxc_rng->clk); 211 214
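This conversion leans on devm_ioremap_resource(), which folds request_mem_region() plus ioremap() into one managed call: the region and mapping are torn down automatically when probe fails or the device goes away, and the helper prints its own diagnostic on failure. A minimal sketch of the idiom, with placeholder names:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* use 'base'; no iounmap()/release_mem_region() on remove */
	return 0;
}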
+139 -57
drivers/char/hw_random/timeriomem-rng.c
··· 23 23 #include <linux/module.h> 24 24 #include <linux/kernel.h> 25 25 #include <linux/platform_device.h> 26 + #include <linux/of.h> 26 27 #include <linux/hw_random.h> 27 28 #include <linux/io.h> 29 + #include <linux/slab.h> 28 30 #include <linux/timeriomem-rng.h> 29 31 #include <linux/jiffies.h> 30 32 #include <linux/sched.h> 31 33 #include <linux/timer.h> 32 34 #include <linux/completion.h> 33 35 34 - static struct timeriomem_rng_data *timeriomem_rng_data; 36 + struct timeriomem_rng_private_data { 37 + void __iomem *io_base; 38 + unsigned int expires; 39 + unsigned int period; 40 + unsigned int present:1; 35 41 36 - static void timeriomem_rng_trigger(unsigned long); 37 - static DEFINE_TIMER(timeriomem_rng_timer, timeriomem_rng_trigger, 0, 0); 42 + struct timer_list timer; 43 + struct completion completion; 44 + 45 + struct hwrng timeriomem_rng_ops; 46 + }; 47 + 48 + #define to_rng_priv(rng) \ 49 + ((struct timeriomem_rng_private_data *)rng->priv) 38 50 39 51 /* 40 52 * have data return 1, however return 0 if we have nothing 41 53 */ 42 54 static int timeriomem_rng_data_present(struct hwrng *rng, int wait) 43 55 { 44 - if (rng->priv == 0) 45 - return 1; 56 + struct timeriomem_rng_private_data *priv = to_rng_priv(rng); 46 57 47 - if (!wait || timeriomem_rng_data->present) 48 - return timeriomem_rng_data->present; 58 + if (!wait || priv->present) 59 + return priv->present; 49 60 50 - wait_for_completion(&timeriomem_rng_data->completion); 61 + wait_for_completion(&priv->completion); 51 62 52 63 return 1; 53 64 } 54 65 55 66 static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data) 56 67 { 68 + struct timeriomem_rng_private_data *priv = to_rng_priv(rng); 57 69 unsigned long cur; 58 70 s32 delay; 59 71 60 - *data = readl(timeriomem_rng_data->address); 72 + *data = readl(priv->io_base); 61 73 62 - if (rng->priv != 0) { 63 - cur = jiffies; 74 + cur = jiffies; 64 75 65 - delay = cur - timeriomem_rng_timer.expires; 66 - delay = rng->priv - (delay % rng->priv); 76 + delay = cur - priv->expires; 77 + delay = priv->period - (delay % priv->period); 67 78 68 - timeriomem_rng_timer.expires = cur + delay; 69 - timeriomem_rng_data->present = 0; 79 + priv->expires = cur + delay; 80 + priv->present = 0; 70 81 71 - init_completion(&timeriomem_rng_data->completion); 72 - add_timer(&timeriomem_rng_timer); 73 - } 82 + INIT_COMPLETION(priv->completion); 83 + mod_timer(&priv->timer, priv->expires); 74 84 75 85 return 4; 76 86 } 77 87 78 - static void timeriomem_rng_trigger(unsigned long dummy) 88 + static void timeriomem_rng_trigger(unsigned long data) 79 89 { 80 - timeriomem_rng_data->present = 1; 81 - complete(&timeriomem_rng_data->completion); 82 - } 90 + struct timeriomem_rng_private_data *priv 91 + = (struct timeriomem_rng_private_data *)data; 83 92 84 - static struct hwrng timeriomem_rng_ops = { 85 - .name = "timeriomem", 86 - .data_present = timeriomem_rng_data_present, 87 - .data_read = timeriomem_rng_data_read, 88 - .priv = 0, 89 - }; 93 + priv->present = 1; 94 + complete(&priv->completion); 95 + } 90 96 91 97 static int timeriomem_rng_probe(struct platform_device *pdev) 92 98 { 99 + struct timeriomem_rng_data *pdata = pdev->dev.platform_data; 100 + struct timeriomem_rng_private_data *priv; 93 101 struct resource *res; 94 - int ret; 102 + int err = 0; 103 + int period; 104 + 105 + if (!pdev->dev.of_node && !pdata) { 106 + dev_err(&pdev->dev, "timeriomem_rng_data is missing\n"); 107 + return -EINVAL; 108 + } 95 109 96 110 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 97 - 98 111 if 
(!res) 99 - return -ENOENT; 112 + return -ENXIO; 100 113 101 - timeriomem_rng_data = pdev->dev.platform_data; 102 - 103 - timeriomem_rng_data->address = ioremap(res->start, resource_size(res)); 104 - if (!timeriomem_rng_data->address) 105 - return -EIO; 106 - 107 - if (timeriomem_rng_data->period != 0 108 - && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { 109 - timeriomem_rng_timer.expires = jiffies; 110 - 111 - timeriomem_rng_ops.priv = usecs_to_jiffies( 112 - timeriomem_rng_data->period); 114 + if (res->start % 4 != 0 || resource_size(res) != 4) { 115 + dev_err(&pdev->dev, 116 + "address must be four bytes wide and aligned\n"); 117 + return -EINVAL; 113 118 } 114 - timeriomem_rng_data->present = 1; 115 119 116 - ret = hwrng_register(&timeriomem_rng_ops); 117 - if (ret) 118 - goto failed; 120 + /* Allocate memory for the device structure (and zero it) */ 121 + priv = kzalloc(sizeof(struct timeriomem_rng_private_data), GFP_KERNEL); 122 + if (!priv) { 123 + dev_err(&pdev->dev, "failed to allocate device structure.\n"); 124 + return -ENOMEM; 125 + } 126 + 127 + platform_set_drvdata(pdev, priv); 128 + 129 + if (pdev->dev.of_node) { 130 + int i; 131 + 132 + if (!of_property_read_u32(pdev->dev.of_node, 133 + "period", &i)) 134 + period = i; 135 + else { 136 + dev_err(&pdev->dev, "missing period\n"); 137 + err = -EINVAL; 138 + goto out_free; 139 + } 140 + } else 141 + period = pdata->period; 142 + 143 + priv->period = usecs_to_jiffies(period); 144 + if (priv->period < 1) { 145 + dev_err(&pdev->dev, "period is less than one jiffy\n"); 146 + err = -EINVAL; 147 + goto out_free; 148 + } 149 + 150 + priv->expires = jiffies; 151 + priv->present = 1; 152 + 153 + init_completion(&priv->completion); 154 + complete(&priv->completion); 155 + 156 + setup_timer(&priv->timer, timeriomem_rng_trigger, (unsigned long)priv); 157 + 158 + priv->timeriomem_rng_ops.name = dev_name(&pdev->dev); 159 + priv->timeriomem_rng_ops.data_present = timeriomem_rng_data_present; 160 + priv->timeriomem_rng_ops.data_read = timeriomem_rng_data_read; 161 + priv->timeriomem_rng_ops.priv = (unsigned long)priv; 162 + 163 + if (!request_mem_region(res->start, resource_size(res), 164 + dev_name(&pdev->dev))) { 165 + dev_err(&pdev->dev, "request_mem_region failed\n"); 166 + err = -EBUSY; 167 + goto out_timer; 168 + } 169 + 170 + priv->io_base = ioremap(res->start, resource_size(res)); 171 + if (priv->io_base == NULL) { 172 + dev_err(&pdev->dev, "ioremap failed\n"); 173 + err = -EIO; 174 + goto out_release_io; 175 + } 176 + 177 + err = hwrng_register(&priv->timeriomem_rng_ops); 178 + if (err) { 179 + dev_err(&pdev->dev, "problem registering\n"); 180 + goto out; 181 + } 119 182 120 183 dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", 121 - timeriomem_rng_data->address, 122 - timeriomem_rng_data->period); 184 + priv->io_base, period); 123 185 124 186 return 0; 125 187 126 - failed: 127 - dev_err(&pdev->dev, "problem registering\n"); 128 - iounmap(timeriomem_rng_data->address); 129 - 130 - return ret; 188 + out: 189 + iounmap(priv->io_base); 190 + out_release_io: 191 + release_mem_region(res->start, resource_size(res)); 192 + out_timer: 193 + del_timer_sync(&priv->timer); 194 + out_free: 195 + platform_set_drvdata(pdev, NULL); 196 + kfree(priv); 197 + return err; 131 198 } 132 199 133 200 static int timeriomem_rng_remove(struct platform_device *pdev) 134 201 { 135 - del_timer_sync(&timeriomem_rng_timer); 136 - hwrng_unregister(&timeriomem_rng_ops); 202 + struct timeriomem_rng_private_data *priv = platform_get_drvdata(pdev); 203 + 
struct resource *res; 137 204 138 - iounmap(timeriomem_rng_data->address); 205 + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 206 + 207 + hwrng_unregister(&priv->timeriomem_rng_ops); 208 + 209 + del_timer_sync(&priv->timer); 210 + iounmap(priv->io_base); 211 + release_mem_region(res->start, resource_size(res)); 212 + platform_set_drvdata(pdev, NULL); 213 + kfree(priv); 139 214 140 215 return 0; 141 216 } 217 + 218 + static const struct of_device_id timeriomem_rng_match[] = { 219 + { .compatible = "timeriomem_rng" }, 220 + {}, 221 + }; 222 + MODULE_DEVICE_TABLE(of, timeriomem_rng_match); 142 223 143 224 static struct platform_driver timeriomem_rng_driver = { 144 225 .driver = { 145 226 .name = "timeriomem_rng", 146 227 .owner = THIS_MODULE, 228 + .of_match_table = timeriomem_rng_match, 147 229 }, 148 230 .probe = timeriomem_rng_probe, 149 231 .remove = timeriomem_rng_remove,
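With the file-scope state replaced by a private structure reachable through hwrng->priv, additional instances are now just additional platform devices. A hypothetical board-file registration for the non-DT path; the address and period below are examples only, chosen to satisfy the new four-byte size and alignment check in probe:

#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/timeriomem-rng.h>

static struct resource board_rng_res = {
	.start	= 0x10000044,	/* one 32-bit, word-aligned register */
	.end	= 0x10000047,
	.flags	= IORESOURCE_MEM,
};

static struct timeriomem_rng_data board_rng_pdata = {
	.period	= 1000000,	/* microseconds between samples */
};

static struct platform_device board_rng_dev = {
	.name		= "timeriomem_rng",
	.id		= -1,
	.resource	= &board_rng_res,
	.num_resources	= 1,
	.dev		= {
		.platform_data = &board_rng_pdata,
	},
};

/* board init code would then call platform_device_register(&board_rng_dev) */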
+15 -3
drivers/crypto/Kconfig
··· 276 276 277 277 Saying m here will build a module named picoxcell_crypto. 278 278 279 + config CRYPTO_DEV_SAHARA 280 + tristate "Support for SAHARA crypto accelerator" 281 + depends on ARCH_MXC && EXPERIMENTAL && OF 282 + select CRYPTO_BLKCIPHER 283 + select CRYPTO_AES 284 + select CRYPTO_ECB 285 + help 286 + This option enables support for the SAHARA HW crypto accelerator 287 + found in some Freescale i.MX chips. 288 + 279 289 config CRYPTO_DEV_S5P 280 290 tristate "Support for Samsung S5PV210 crypto accelerator" 281 291 depends on ARCH_S5PV210 ··· 371 361 will be called atmel-tdes. 372 362 373 363 config CRYPTO_DEV_ATMEL_SHA 374 - tristate "Support for Atmel SHA1/SHA256 hw accelerator" 364 + tristate "Support for Atmel SHA hw accelerator" 375 365 depends on ARCH_AT91 376 366 select CRYPTO_SHA1 377 367 select CRYPTO_SHA256 368 + select CRYPTO_SHA512 378 369 select CRYPTO_ALGAPI 379 370 help 380 - Some Atmel processors have SHA1/SHA256 hw accelerator. 371 + Some Atmel processors have SHA1/SHA224/SHA256/SHA384/SHA512 372 + hw accelerator. 381 373 Select this if you want to use the Atmel module for 382 - SHA1/SHA256 algorithms. 374 + SHA1/SHA224/SHA256/SHA384/SHA512 algorithms. 383 375 384 376 To compile this driver as a module, choose M here: the module 385 377 will be called atmel-sha.
+1
drivers/crypto/Makefile
··· 12 12 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o 13 13 obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o 14 14 obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o 15 + obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o 15 16 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o 16 17 obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o 17 18 obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
+356 -121
drivers/crypto/atmel-aes.c
··· 38 38 #include <crypto/aes.h> 39 39 #include <crypto/hash.h> 40 40 #include <crypto/internal/hash.h> 41 - #include <linux/platform_data/atmel-aes.h> 41 + #include <linux/platform_data/crypto-atmel.h> 42 42 #include "atmel-aes-regs.h" 43 43 44 44 #define CFB8_BLOCK_SIZE 1 ··· 47 47 #define CFB64_BLOCK_SIZE 8 48 48 49 49 /* AES flags */ 50 - #define AES_FLAGS_MODE_MASK 0x01ff 50 + #define AES_FLAGS_MODE_MASK 0x03ff 51 51 #define AES_FLAGS_ENCRYPT BIT(0) 52 52 #define AES_FLAGS_CBC BIT(1) 53 53 #define AES_FLAGS_CFB BIT(2) ··· 55 55 #define AES_FLAGS_CFB16 BIT(4) 56 56 #define AES_FLAGS_CFB32 BIT(5) 57 57 #define AES_FLAGS_CFB64 BIT(6) 58 - #define AES_FLAGS_OFB BIT(7) 59 - #define AES_FLAGS_CTR BIT(8) 58 + #define AES_FLAGS_CFB128 BIT(7) 59 + #define AES_FLAGS_OFB BIT(8) 60 + #define AES_FLAGS_CTR BIT(9) 60 61 61 62 #define AES_FLAGS_INIT BIT(16) 62 63 #define AES_FLAGS_DMA BIT(17) 63 64 #define AES_FLAGS_BUSY BIT(18) 65 + #define AES_FLAGS_FAST BIT(19) 64 66 65 - #define AES_FLAGS_DUALBUFF BIT(24) 66 - 67 - #define ATMEL_AES_QUEUE_LENGTH 1 68 - #define ATMEL_AES_CACHE_SIZE 0 67 + #define ATMEL_AES_QUEUE_LENGTH 50 69 68 70 69 #define ATMEL_AES_DMA_THRESHOLD 16 71 70 71 + 72 + struct atmel_aes_caps { 73 + bool has_dualbuff; 74 + bool has_cfb64; 75 + u32 max_burst_size; 76 + }; 72 77 73 78 struct atmel_aes_dev; 74 79 ··· 82 77 83 78 int keylen; 84 79 u32 key[AES_KEYSIZE_256 / sizeof(u32)]; 80 + 81 + u16 block_size; 85 82 }; 86 83 87 84 struct atmel_aes_reqctx { ··· 119 112 120 113 struct scatterlist *in_sg; 121 114 unsigned int nb_in_sg; 122 - 115 + size_t in_offset; 123 116 struct scatterlist *out_sg; 124 117 unsigned int nb_out_sg; 118 + size_t out_offset; 125 119 126 120 size_t bufcnt; 121 + size_t buflen; 122 + size_t dma_size; 127 123 128 - u8 buf_in[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32)); 129 - int dma_in; 124 + void *buf_in; 125 + int dma_in; 126 + dma_addr_t dma_addr_in; 130 127 struct atmel_aes_dma dma_lch_in; 131 128 132 - u8 buf_out[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32)); 133 - int dma_out; 129 + void *buf_out; 130 + int dma_out; 131 + dma_addr_t dma_addr_out; 134 132 struct atmel_aes_dma dma_lch_out; 133 + 134 + struct atmel_aes_caps caps; 135 135 136 136 u32 hw_version; 137 137 }; ··· 179 165 return sg_nb; 180 166 } 181 167 168 + static int atmel_aes_sg_copy(struct scatterlist **sg, size_t *offset, 169 + void *buf, size_t buflen, size_t total, int out) 170 + { 171 + unsigned int count, off = 0; 172 + 173 + while (buflen && total) { 174 + count = min((*sg)->length - *offset, total); 175 + count = min(count, buflen); 176 + 177 + if (!count) 178 + return off; 179 + 180 + scatterwalk_map_and_copy(buf + off, *sg, *offset, count, out); 181 + 182 + off += count; 183 + buflen -= count; 184 + *offset += count; 185 + total -= count; 186 + 187 + if (*offset == (*sg)->length) { 188 + *sg = sg_next(*sg); 189 + if (*sg) 190 + *offset = 0; 191 + else 192 + total = 0; 193 + } 194 + } 195 + 196 + return off; 197 + } 198 + 182 199 static inline u32 atmel_aes_read(struct atmel_aes_dev *dd, u32 offset) 183 200 { 184 201 return readl_relaxed(dd->io_base + offset); ··· 233 188 { 234 189 for (; count--; value++, offset += 4) 235 190 atmel_aes_write(dd, offset, *value); 236 - } 237 - 238 - static void atmel_aes_dualbuff_test(struct atmel_aes_dev *dd) 239 - { 240 - atmel_aes_write(dd, AES_MR, AES_MR_DUALBUFF); 241 - 242 - if (atmel_aes_read(dd, AES_MR) & AES_MR_DUALBUFF) 243 - dd->flags |= AES_FLAGS_DUALBUFF; 244 191 } 245 192 246 193 static struct atmel_aes_dev 
*atmel_aes_find_dev(struct atmel_aes_ctx *ctx) ··· 262 225 263 226 if (!(dd->flags & AES_FLAGS_INIT)) { 264 227 atmel_aes_write(dd, AES_CR, AES_CR_SWRST); 265 - atmel_aes_dualbuff_test(dd); 228 + atmel_aes_write(dd, AES_MR, 0xE << AES_MR_CKEY_OFFSET); 266 229 dd->flags |= AES_FLAGS_INIT; 267 230 dd->err = 0; 268 231 } ··· 270 233 return 0; 271 234 } 272 235 236 + static inline unsigned int atmel_aes_get_version(struct atmel_aes_dev *dd) 237 + { 238 + return atmel_aes_read(dd, AES_HW_VERSION) & 0x00000fff; 239 + } 240 + 273 241 static void atmel_aes_hw_version_init(struct atmel_aes_dev *dd) 274 242 { 275 243 atmel_aes_hw_init(dd); 276 244 277 - dd->hw_version = atmel_aes_read(dd, AES_HW_VERSION); 245 + dd->hw_version = atmel_aes_get_version(dd); 246 + 247 + dev_info(dd->dev, 248 + "version: 0x%x\n", dd->hw_version); 278 249 279 250 clk_disable_unprepare(dd->iclk); 280 251 } ··· 305 260 tasklet_schedule(&dd->done_task); 306 261 } 307 262 308 - static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd) 263 + static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd, 264 + dma_addr_t dma_addr_in, dma_addr_t dma_addr_out, int length) 309 265 { 266 + struct scatterlist sg[2]; 310 267 struct dma_async_tx_descriptor *in_desc, *out_desc; 311 - int nb_dma_sg_in, nb_dma_sg_out; 312 268 313 - dd->nb_in_sg = atmel_aes_sg_length(dd->req, dd->in_sg); 314 - if (!dd->nb_in_sg) 315 - goto exit_err; 269 + dd->dma_size = length; 316 270 317 - nb_dma_sg_in = dma_map_sg(dd->dev, dd->in_sg, dd->nb_in_sg, 318 - DMA_TO_DEVICE); 319 - if (!nb_dma_sg_in) 320 - goto exit_err; 271 + if (!(dd->flags & AES_FLAGS_FAST)) { 272 + dma_sync_single_for_device(dd->dev, dma_addr_in, length, 273 + DMA_TO_DEVICE); 274 + } 321 275 322 - in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, dd->in_sg, 323 - nb_dma_sg_in, DMA_MEM_TO_DEV, 276 + if (dd->flags & AES_FLAGS_CFB8) { 277 + dd->dma_lch_in.dma_conf.dst_addr_width = 278 + DMA_SLAVE_BUSWIDTH_1_BYTE; 279 + dd->dma_lch_out.dma_conf.src_addr_width = 280 + DMA_SLAVE_BUSWIDTH_1_BYTE; 281 + } else if (dd->flags & AES_FLAGS_CFB16) { 282 + dd->dma_lch_in.dma_conf.dst_addr_width = 283 + DMA_SLAVE_BUSWIDTH_2_BYTES; 284 + dd->dma_lch_out.dma_conf.src_addr_width = 285 + DMA_SLAVE_BUSWIDTH_2_BYTES; 286 + } else { 287 + dd->dma_lch_in.dma_conf.dst_addr_width = 288 + DMA_SLAVE_BUSWIDTH_4_BYTES; 289 + dd->dma_lch_out.dma_conf.src_addr_width = 290 + DMA_SLAVE_BUSWIDTH_4_BYTES; 291 + } 292 + 293 + if (dd->flags & (AES_FLAGS_CFB8 | AES_FLAGS_CFB16 | 294 + AES_FLAGS_CFB32 | AES_FLAGS_CFB64)) { 295 + dd->dma_lch_in.dma_conf.src_maxburst = 1; 296 + dd->dma_lch_in.dma_conf.dst_maxburst = 1; 297 + dd->dma_lch_out.dma_conf.src_maxburst = 1; 298 + dd->dma_lch_out.dma_conf.dst_maxburst = 1; 299 + } else { 300 + dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size; 301 + dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size; 302 + dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size; 303 + dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size; 304 + } 305 + 306 + dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf); 307 + dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf); 308 + 309 + dd->flags |= AES_FLAGS_DMA; 310 + 311 + sg_init_table(&sg[0], 1); 312 + sg_dma_address(&sg[0]) = dma_addr_in; 313 + sg_dma_len(&sg[0]) = length; 314 + 315 + sg_init_table(&sg[1], 1); 316 + sg_dma_address(&sg[1]) = dma_addr_out; 317 + sg_dma_len(&sg[1]) = length; 318 + 319 + in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0], 320 + 1, 
DMA_MEM_TO_DEV, 324 321 DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 325 - 326 322 if (!in_desc) 327 - goto unmap_in; 323 + return -EINVAL; 328 324 329 - /* callback not needed */ 330 - 331 - dd->nb_out_sg = atmel_aes_sg_length(dd->req, dd->out_sg); 332 - if (!dd->nb_out_sg) 333 - goto unmap_in; 334 - 335 - nb_dma_sg_out = dma_map_sg(dd->dev, dd->out_sg, dd->nb_out_sg, 336 - DMA_FROM_DEVICE); 337 - if (!nb_dma_sg_out) 338 - goto unmap_out; 339 - 340 - out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, dd->out_sg, 341 - nb_dma_sg_out, DMA_DEV_TO_MEM, 325 + out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1], 326 + 1, DMA_DEV_TO_MEM, 342 327 DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 343 - 344 328 if (!out_desc) 345 - goto unmap_out; 329 + return -EINVAL; 346 330 347 331 out_desc->callback = atmel_aes_dma_callback; 348 332 out_desc->callback_param = dd; 349 - 350 - dd->total -= dd->req->nbytes; 351 333 352 334 dmaengine_submit(out_desc); 353 335 dma_async_issue_pending(dd->dma_lch_out.chan); ··· 383 311 dma_async_issue_pending(dd->dma_lch_in.chan); 384 312 385 313 return 0; 386 - 387 - unmap_out: 388 - dma_unmap_sg(dd->dev, dd->out_sg, dd->nb_out_sg, 389 - DMA_FROM_DEVICE); 390 - unmap_in: 391 - dma_unmap_sg(dd->dev, dd->in_sg, dd->nb_in_sg, 392 - DMA_TO_DEVICE); 393 - exit_err: 394 - return -EINVAL; 395 314 } 396 315 397 316 static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd) ··· 415 352 416 353 static int atmel_aes_crypt_dma_start(struct atmel_aes_dev *dd) 417 354 { 418 - int err; 355 + int err, fast = 0, in, out; 356 + size_t count; 357 + dma_addr_t addr_in, addr_out; 419 358 420 - if (dd->flags & AES_FLAGS_CFB8) { 421 - dd->dma_lch_in.dma_conf.dst_addr_width = 422 - DMA_SLAVE_BUSWIDTH_1_BYTE; 423 - dd->dma_lch_out.dma_conf.src_addr_width = 424 - DMA_SLAVE_BUSWIDTH_1_BYTE; 425 - } else if (dd->flags & AES_FLAGS_CFB16) { 426 - dd->dma_lch_in.dma_conf.dst_addr_width = 427 - DMA_SLAVE_BUSWIDTH_2_BYTES; 428 - dd->dma_lch_out.dma_conf.src_addr_width = 429 - DMA_SLAVE_BUSWIDTH_2_BYTES; 430 - } else { 431 - dd->dma_lch_in.dma_conf.dst_addr_width = 432 - DMA_SLAVE_BUSWIDTH_4_BYTES; 433 - dd->dma_lch_out.dma_conf.src_addr_width = 434 - DMA_SLAVE_BUSWIDTH_4_BYTES; 359 + if ((!dd->in_offset) && (!dd->out_offset)) { 360 + /* check for alignment */ 361 + in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) && 362 + IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size); 363 + out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) && 364 + IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size); 365 + fast = in && out; 366 + 367 + if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg)) 368 + fast = 0; 435 369 } 436 370 437 - dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf); 438 - dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf); 439 371 440 - dd->flags |= AES_FLAGS_DMA; 441 - err = atmel_aes_crypt_dma(dd); 372 + if (fast) { 373 + count = min(dd->total, sg_dma_len(dd->in_sg)); 374 + count = min(count, sg_dma_len(dd->out_sg)); 375 + 376 + err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 377 + if (!err) { 378 + dev_err(dd->dev, "dma_map_sg() error\n"); 379 + return -EINVAL; 380 + } 381 + 382 + err = dma_map_sg(dd->dev, dd->out_sg, 1, 383 + DMA_FROM_DEVICE); 384 + if (!err) { 385 + dev_err(dd->dev, "dma_map_sg() error\n"); 386 + dma_unmap_sg(dd->dev, dd->in_sg, 1, 387 + DMA_TO_DEVICE); 388 + return -EINVAL; 389 + } 390 + 391 + addr_in = sg_dma_address(dd->in_sg); 392 + addr_out = sg_dma_address(dd->out_sg); 393 + 394 + dd->flags |= AES_FLAGS_FAST; 395 + 
396 + } else { 397 + /* use cache buffers */ 398 + count = atmel_aes_sg_copy(&dd->in_sg, &dd->in_offset, 399 + dd->buf_in, dd->buflen, dd->total, 0); 400 + 401 + addr_in = dd->dma_addr_in; 402 + addr_out = dd->dma_addr_out; 403 + 404 + dd->flags &= ~AES_FLAGS_FAST; 405 + } 406 + 407 + dd->total -= count; 408 + 409 + err = atmel_aes_crypt_dma(dd, addr_in, addr_out, count); 410 + 411 + if (err && (dd->flags & AES_FLAGS_FAST)) { 412 + dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 413 + dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE); 414 + } 442 415 443 416 return err; 444 417 } ··· 509 410 valmr |= AES_MR_CFBS_32b; 510 411 else if (dd->flags & AES_FLAGS_CFB64) 511 412 valmr |= AES_MR_CFBS_64b; 413 + else if (dd->flags & AES_FLAGS_CFB128) 414 + valmr |= AES_MR_CFBS_128b; 512 415 } else if (dd->flags & AES_FLAGS_OFB) { 513 416 valmr |= AES_MR_OPMOD_OFB; 514 417 } else if (dd->flags & AES_FLAGS_CTR) { ··· 524 423 525 424 if (dd->total > ATMEL_AES_DMA_THRESHOLD) { 526 425 valmr |= AES_MR_SMOD_IDATAR0; 527 - if (dd->flags & AES_FLAGS_DUALBUFF) 426 + if (dd->caps.has_dualbuff) 528 427 valmr |= AES_MR_DUALBUFF; 529 428 } else { 530 429 valmr |= AES_MR_SMOD_AUTO; ··· 578 477 /* assign new request to device */ 579 478 dd->req = req; 580 479 dd->total = req->nbytes; 480 + dd->in_offset = 0; 581 481 dd->in_sg = req->src; 482 + dd->out_offset = 0; 582 483 dd->out_sg = req->dst; 583 484 584 485 rctx = ablkcipher_request_ctx(req); ··· 609 506 static int atmel_aes_crypt_dma_stop(struct atmel_aes_dev *dd) 610 507 { 611 508 int err = -EINVAL; 509 + size_t count; 612 510 613 511 if (dd->flags & AES_FLAGS_DMA) { 614 - dma_unmap_sg(dd->dev, dd->out_sg, 615 - dd->nb_out_sg, DMA_FROM_DEVICE); 616 - dma_unmap_sg(dd->dev, dd->in_sg, 617 - dd->nb_in_sg, DMA_TO_DEVICE); 618 512 err = 0; 513 + if (dd->flags & AES_FLAGS_FAST) { 514 + dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE); 515 + dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 516 + } else { 517 + dma_sync_single_for_device(dd->dev, dd->dma_addr_out, 518 + dd->dma_size, DMA_FROM_DEVICE); 519 + 520 + /* copy data */ 521 + count = atmel_aes_sg_copy(&dd->out_sg, &dd->out_offset, 522 + dd->buf_out, dd->buflen, dd->dma_size, 1); 523 + if (count != dd->dma_size) { 524 + err = -EINVAL; 525 + pr_err("not all data converted: %u\n", count); 526 + } 527 + } 619 528 } 620 529 621 530 return err; 531 + } 532 + 533 + 534 + static int atmel_aes_buff_init(struct atmel_aes_dev *dd) 535 + { 536 + int err = -ENOMEM; 537 + 538 + dd->buf_in = (void *)__get_free_pages(GFP_KERNEL, 0); 539 + dd->buf_out = (void *)__get_free_pages(GFP_KERNEL, 0); 540 + dd->buflen = PAGE_SIZE; 541 + dd->buflen &= ~(AES_BLOCK_SIZE - 1); 542 + 543 + if (!dd->buf_in || !dd->buf_out) { 544 + dev_err(dd->dev, "unable to alloc pages.\n"); 545 + goto err_alloc; 546 + } 547 + 548 + /* MAP here */ 549 + dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in, 550 + dd->buflen, DMA_TO_DEVICE); 551 + if (dma_mapping_error(dd->dev, dd->dma_addr_in)) { 552 + dev_err(dd->dev, "dma %d bytes error\n", dd->buflen); 553 + err = -EINVAL; 554 + goto err_map_in; 555 + } 556 + 557 + dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out, 558 + dd->buflen, DMA_FROM_DEVICE); 559 + if (dma_mapping_error(dd->dev, dd->dma_addr_out)) { 560 + dev_err(dd->dev, "dma %d bytes error\n", dd->buflen); 561 + err = -EINVAL; 562 + goto err_map_out; 563 + } 564 + 565 + return 0; 566 + 567 + err_map_out: 568 + dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen, 569 + DMA_TO_DEVICE); 570 + err_map_in: 571 + 
free_page((unsigned long)dd->buf_out); 572 + free_page((unsigned long)dd->buf_in); 573 + err_alloc: 574 + if (err) 575 + pr_err("error: %d\n", err); 576 + return err; 577 + } 578 + 579 + static void atmel_aes_buff_cleanup(struct atmel_aes_dev *dd) 580 + { 581 + dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen, 582 + DMA_FROM_DEVICE); 583 + dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen, 584 + DMA_TO_DEVICE); 585 + free_page((unsigned long)dd->buf_out); 586 + free_page((unsigned long)dd->buf_in); 622 587 } 623 588 624 589 static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode) ··· 696 525 struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req); 697 526 struct atmel_aes_dev *dd; 698 527 699 - if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) { 700 - pr_err("request size is not exact amount of AES blocks\n"); 701 - return -EINVAL; 528 + if (mode & AES_FLAGS_CFB8) { 529 + if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) { 530 + pr_err("request size is not exact amount of CFB8 blocks\n"); 531 + return -EINVAL; 532 + } 533 + ctx->block_size = CFB8_BLOCK_SIZE; 534 + } else if (mode & AES_FLAGS_CFB16) { 535 + if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) { 536 + pr_err("request size is not exact amount of CFB16 blocks\n"); 537 + return -EINVAL; 538 + } 539 + ctx->block_size = CFB16_BLOCK_SIZE; 540 + } else if (mode & AES_FLAGS_CFB32) { 541 + if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) { 542 + pr_err("request size is not exact amount of CFB32 blocks\n"); 543 + return -EINVAL; 544 + } 545 + ctx->block_size = CFB32_BLOCK_SIZE; 546 + } else { 547 + if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) { 548 + pr_err("request size is not exact amount of AES blocks\n"); 549 + return -EINVAL; 550 + } 551 + ctx->block_size = AES_BLOCK_SIZE; 702 552 } 703 553 704 554 dd = atmel_aes_find_dev(ctx); ··· 743 551 } 744 552 } 745 553 746 - static int atmel_aes_dma_init(struct atmel_aes_dev *dd) 554 + static int atmel_aes_dma_init(struct atmel_aes_dev *dd, 555 + struct crypto_platform_data *pdata) 747 556 { 748 557 int err = -ENOMEM; 749 - struct aes_platform_data *pdata; 750 558 dma_cap_mask_t mask_in, mask_out; 751 - 752 - pdata = dd->dev->platform_data; 753 559 754 560 if (pdata && pdata->dma_slave->txdata.dma_dev && 755 561 pdata->dma_slave->rxdata.dma_dev) { ··· 758 568 759 569 dd->dma_lch_in.chan = dma_request_channel(mask_in, 760 570 atmel_aes_filter, &pdata->dma_slave->rxdata); 571 + 761 572 if (!dd->dma_lch_in.chan) 762 573 goto err_dma_in; 763 574 764 575 dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV; 765 576 dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base + 766 577 AES_IDATAR(0); 767 - dd->dma_lch_in.dma_conf.src_maxburst = 1; 768 - dd->dma_lch_in.dma_conf.dst_maxburst = 1; 578 + dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size; 579 + dd->dma_lch_in.dma_conf.src_addr_width = 580 + DMA_SLAVE_BUSWIDTH_4_BYTES; 581 + dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size; 582 + dd->dma_lch_in.dma_conf.dst_addr_width = 583 + DMA_SLAVE_BUSWIDTH_4_BYTES; 769 584 dd->dma_lch_in.dma_conf.device_fc = false; 770 585 771 586 dma_cap_zero(mask_out); 772 587 dma_cap_set(DMA_SLAVE, mask_out); 773 588 dd->dma_lch_out.chan = dma_request_channel(mask_out, 774 589 atmel_aes_filter, &pdata->dma_slave->txdata); 590 + 775 591 if (!dd->dma_lch_out.chan) 776 592 goto err_dma_out; 777 593 778 594 dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM; 779 595 dd->dma_lch_out.dma_conf.src_addr = dd->phys_base + 780 596 AES_ODATAR(0); 781 - 
dd->dma_lch_out.dma_conf.src_maxburst = 1; 782 - dd->dma_lch_out.dma_conf.dst_maxburst = 1; 597 + dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size; 598 + dd->dma_lch_out.dma_conf.src_addr_width = 599 + DMA_SLAVE_BUSWIDTH_4_BYTES; 600 + dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size; 601 + dd->dma_lch_out.dma_conf.dst_addr_width = 602 + DMA_SLAVE_BUSWIDTH_4_BYTES; 783 603 dd->dma_lch_out.dma_conf.device_fc = false; 784 604 785 605 return 0; ··· 865 665 static int atmel_aes_cfb_encrypt(struct ablkcipher_request *req) 866 666 { 867 667 return atmel_aes_crypt(req, 868 - AES_FLAGS_ENCRYPT | AES_FLAGS_CFB); 668 + AES_FLAGS_ENCRYPT | AES_FLAGS_CFB | AES_FLAGS_CFB128); 869 669 } 870 670 871 671 static int atmel_aes_cfb_decrypt(struct ablkcipher_request *req) 872 672 { 873 673 return atmel_aes_crypt(req, 874 - AES_FLAGS_CFB); 674 + AES_FLAGS_CFB | AES_FLAGS_CFB128); 875 675 } 876 676 877 677 static int atmel_aes_cfb64_encrypt(struct ablkcipher_request *req) ··· 953 753 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 954 754 .cra_blocksize = AES_BLOCK_SIZE, 955 755 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 956 - .cra_alignmask = 0x0, 756 + .cra_alignmask = 0xf, 957 757 .cra_type = &crypto_ablkcipher_type, 958 758 .cra_module = THIS_MODULE, 959 759 .cra_init = atmel_aes_cra_init, ··· 973 773 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 974 774 .cra_blocksize = AES_BLOCK_SIZE, 975 775 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 976 - .cra_alignmask = 0x0, 776 + .cra_alignmask = 0xf, 977 777 .cra_type = &crypto_ablkcipher_type, 978 778 .cra_module = THIS_MODULE, 979 779 .cra_init = atmel_aes_cra_init, ··· 994 794 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 995 795 .cra_blocksize = AES_BLOCK_SIZE, 996 796 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 997 - .cra_alignmask = 0x0, 797 + .cra_alignmask = 0xf, 998 798 .cra_type = &crypto_ablkcipher_type, 999 799 .cra_module = THIS_MODULE, 1000 800 .cra_init = atmel_aes_cra_init, ··· 1015 815 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1016 816 .cra_blocksize = AES_BLOCK_SIZE, 1017 817 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 1018 - .cra_alignmask = 0x0, 818 + .cra_alignmask = 0xf, 1019 819 .cra_type = &crypto_ablkcipher_type, 1020 820 .cra_module = THIS_MODULE, 1021 821 .cra_init = atmel_aes_cra_init, ··· 1036 836 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1037 837 .cra_blocksize = CFB32_BLOCK_SIZE, 1038 838 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 1039 - .cra_alignmask = 0x0, 839 + .cra_alignmask = 0x3, 1040 840 .cra_type = &crypto_ablkcipher_type, 1041 841 .cra_module = THIS_MODULE, 1042 842 .cra_init = atmel_aes_cra_init, ··· 1057 857 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1058 858 .cra_blocksize = CFB16_BLOCK_SIZE, 1059 859 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 1060 - .cra_alignmask = 0x0, 860 + .cra_alignmask = 0x1, 1061 861 .cra_type = &crypto_ablkcipher_type, 1062 862 .cra_module = THIS_MODULE, 1063 863 .cra_init = atmel_aes_cra_init, ··· 1099 899 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1100 900 .cra_blocksize = AES_BLOCK_SIZE, 1101 901 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 1102 - .cra_alignmask = 0x0, 902 + .cra_alignmask = 0xf, 1103 903 .cra_type = &crypto_ablkcipher_type, 1104 904 .cra_module = THIS_MODULE, 1105 905 .cra_init = atmel_aes_cra_init, ··· 1115 915 }, 1116 916 }; 1117 917 1118 - static struct crypto_alg aes_cfb64_alg[] = { 1119 - { 918 + static struct crypto_alg 
aes_cfb64_alg = { 1120 919 .cra_name = "cfb64(aes)", 1121 920 .cra_driver_name = "atmel-cfb64-aes", 1122 921 .cra_priority = 100, 1123 922 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1124 923 .cra_blocksize = CFB64_BLOCK_SIZE, 1125 924 .cra_ctxsize = sizeof(struct atmel_aes_ctx), 1126 - .cra_alignmask = 0x0, 925 + .cra_alignmask = 0x7, 1127 926 .cra_type = &crypto_ablkcipher_type, 1128 927 .cra_module = THIS_MODULE, 1129 928 .cra_init = atmel_aes_cra_init, ··· 1135 936 .encrypt = atmel_aes_cfb64_encrypt, 1136 937 .decrypt = atmel_aes_cfb64_decrypt, 1137 938 } 1138 - }, 1139 939 }; 1140 940 1141 941 static void atmel_aes_queue_task(unsigned long data) ··· 1167 969 err = dd->err ? : err; 1168 970 1169 971 if (dd->total && !err) { 1170 - err = atmel_aes_crypt_dma_start(dd); 972 + if (dd->flags & AES_FLAGS_FAST) { 973 + dd->in_sg = sg_next(dd->in_sg); 974 + dd->out_sg = sg_next(dd->out_sg); 975 + if (!dd->in_sg || !dd->out_sg) 976 + err = -EINVAL; 977 + } 978 + if (!err) 979 + err = atmel_aes_crypt_dma_start(dd); 1171 980 if (!err) 1172 981 return; /* DMA started. Not fininishing. */ 1173 982 } ··· 1208 1003 1209 1004 for (i = 0; i < ARRAY_SIZE(aes_algs); i++) 1210 1005 crypto_unregister_alg(&aes_algs[i]); 1211 - if (dd->hw_version >= 0x130) 1212 - crypto_unregister_alg(&aes_cfb64_alg[0]); 1006 + if (dd->caps.has_cfb64) 1007 + crypto_unregister_alg(&aes_cfb64_alg); 1213 1008 } 1214 1009 1215 1010 static int atmel_aes_register_algs(struct atmel_aes_dev *dd) ··· 1222 1017 goto err_aes_algs; 1223 1018 } 1224 1019 1225 - atmel_aes_hw_version_init(dd); 1226 - 1227 - if (dd->hw_version >= 0x130) { 1228 - err = crypto_register_alg(&aes_cfb64_alg[0]); 1020 + if (dd->caps.has_cfb64) { 1021 + err = crypto_register_alg(&aes_cfb64_alg); 1229 1022 if (err) 1230 1023 goto err_aes_cfb64_alg; 1231 1024 } ··· 1239 1036 return err; 1240 1037 } 1241 1038 1039 + static void atmel_aes_get_cap(struct atmel_aes_dev *dd) 1040 + { 1041 + dd->caps.has_dualbuff = 0; 1042 + dd->caps.has_cfb64 = 0; 1043 + dd->caps.max_burst_size = 1; 1044 + 1045 + /* keep only major version number */ 1046 + switch (dd->hw_version & 0xff0) { 1047 + case 0x130: 1048 + dd->caps.has_dualbuff = 1; 1049 + dd->caps.has_cfb64 = 1; 1050 + dd->caps.max_burst_size = 4; 1051 + break; 1052 + case 0x120: 1053 + break; 1054 + default: 1055 + dev_warn(dd->dev, 1056 + "Unmanaged aes version, set minimum capabilities\n"); 1057 + break; 1058 + } 1059 + } 1060 + 1242 1061 static int atmel_aes_probe(struct platform_device *pdev) 1243 1062 { 1244 1063 struct atmel_aes_dev *aes_dd; 1245 - struct aes_platform_data *pdata; 1064 + struct crypto_platform_data *pdata; 1246 1065 struct device *dev = &pdev->dev; 1247 1066 struct resource *aes_res; 1248 1067 unsigned long aes_phys_size; ··· 1324 1099 } 1325 1100 1326 1101 /* Initializing the clock */ 1327 - aes_dd->iclk = clk_get(&pdev->dev, NULL); 1102 + aes_dd->iclk = clk_get(&pdev->dev, "aes_clk"); 1328 1103 if (IS_ERR(aes_dd->iclk)) { 1329 1104 dev_err(dev, "clock intialization failed.\n"); 1330 1105 err = PTR_ERR(aes_dd->iclk); ··· 1338 1113 goto aes_io_err; 1339 1114 } 1340 1115 1341 - err = atmel_aes_dma_init(aes_dd); 1116 + atmel_aes_hw_version_init(aes_dd); 1117 + 1118 + atmel_aes_get_cap(aes_dd); 1119 + 1120 + err = atmel_aes_buff_init(aes_dd); 1121 + if (err) 1122 + goto err_aes_buff; 1123 + 1124 + err = atmel_aes_dma_init(aes_dd, pdata); 1342 1125 if (err) 1343 1126 goto err_aes_dma; 1344 1127 ··· 1368 1135 spin_unlock(&atmel_aes.lock); 1369 1136 atmel_aes_dma_cleanup(aes_dd); 1370 1137 
err_aes_dma: 1138 + atmel_aes_buff_cleanup(aes_dd); 1139 + err_aes_buff: 1371 1140 iounmap(aes_dd->io_base); 1372 1141 aes_io_err: 1373 1142 clk_put(aes_dd->iclk);
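The core of the atmel-aes rework is the fast/slow split in atmel_aes_crypt_dma_start(): scatterlist entries that are word-aligned and a whole number of blocks long are DMA-mapped in place, while everything else is staged through the page-sized bounce buffers that atmel_aes_buff_init() now maps once at probe time. A condensed restatement of the fast-path test (not the driver's own helper; it relies, as the driver does, on sg_dma_len() mirroring sg->length before mapping):

#include <linux/kernel.h>
#include <linux/scatterlist.h>

static bool aes_sg_is_dma_fast(struct scatterlist *in,
			       struct scatterlist *out,
			       unsigned int block_size)
{
	if (!IS_ALIGNED(in->offset, sizeof(u32)) ||
	    !IS_ALIGNED(in->length, block_size))
		return false;
	if (!IS_ALIGNED(out->offset, sizeof(u32)) ||
	    !IS_ALIGNED(out->length, block_size))
		return false;
	/* src and dst must pair up, one entry per DMA transfer */
	return sg_dma_len(in) == sg_dma_len(out);
}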
+6 -1
drivers/crypto/atmel-sha-regs.h
··· 14 14 #define SHA_MR_MODE_MANUAL 0x0 15 15 #define SHA_MR_MODE_AUTO 0x1 16 16 #define SHA_MR_MODE_PDC 0x2 17 - #define SHA_MR_DUALBUFF (1 << 3) 18 17 #define SHA_MR_PROCDLY (1 << 4) 19 18 #define SHA_MR_ALGO_SHA1 (0 << 8) 20 19 #define SHA_MR_ALGO_SHA256 (1 << 8) 20 + #define SHA_MR_ALGO_SHA384 (2 << 8) 21 + #define SHA_MR_ALGO_SHA512 (3 << 8) 22 + #define SHA_MR_ALGO_SHA224 (4 << 8) 23 + #define SHA_MR_DUALBUFF (1 << 16) 21 24 22 25 #define SHA_IER 0x10 23 26 #define SHA_IDR 0x14 ··· 35 32 #define SHA_ISR_URAT_ODR (0x1 << 12) 36 33 #define SHA_ISR_URAT_MR (0x2 << 12) 37 34 #define SHA_ISR_URAT_WO (0x5 << 12) 35 + 36 + #define SHA_HW_VERSION 0xFC 38 37 39 38 #define SHA_TPR 0x108 40 39 #define SHA_TCR 0x10C
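Besides the three new ALGO values, note the two layout changes: the algorithm field at bit 8 now spans three bits, and SHA_MR_DUALBUFF moves from bit 3 to bit 16. A sketch of composing a mode-register value from these definitions (the helper name is illustrative):

#include <linux/types.h>
#include "atmel-sha-regs.h"	/* the definitions shown above */

/* Auto mode, SHA-256, dual input buffer enabled. */
static u32 sha_mr_sha256_auto(void)
{
	return SHA_MR_MODE_AUTO | SHA_MR_ALGO_SHA256 | SHA_MR_DUALBUFF;
}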
+484 -98
drivers/crypto/atmel-sha.c
··· 38 38 #include <crypto/sha.h> 39 39 #include <crypto/hash.h> 40 40 #include <crypto/internal/hash.h> 41 + #include <linux/platform_data/crypto-atmel.h> 41 42 #include "atmel-sha-regs.h" 42 43 43 44 /* SHA flags */ ··· 53 52 #define SHA_FLAGS_FINUP BIT(16) 54 53 #define SHA_FLAGS_SG BIT(17) 55 54 #define SHA_FLAGS_SHA1 BIT(18) 56 - #define SHA_FLAGS_SHA256 BIT(19) 57 - #define SHA_FLAGS_ERROR BIT(20) 58 - #define SHA_FLAGS_PAD BIT(21) 59 - 60 - #define SHA_FLAGS_DUALBUFF BIT(24) 55 + #define SHA_FLAGS_SHA224 BIT(19) 56 + #define SHA_FLAGS_SHA256 BIT(20) 57 + #define SHA_FLAGS_SHA384 BIT(21) 58 + #define SHA_FLAGS_SHA512 BIT(22) 59 + #define SHA_FLAGS_ERROR BIT(23) 60 + #define SHA_FLAGS_PAD BIT(24) 61 61 62 62 #define SHA_OP_UPDATE 1 63 63 #define SHA_OP_FINAL 2 ··· 67 65 68 66 #define ATMEL_SHA_DMA_THRESHOLD 56 69 67 68 + struct atmel_sha_caps { 69 + bool has_dma; 70 + bool has_dualbuff; 71 + bool has_sha224; 72 + bool has_sha_384_512; 73 + }; 70 74 71 75 struct atmel_sha_dev; 72 76 ··· 81 73 unsigned long flags; 82 74 unsigned long op; 83 75 84 - u8 digest[SHA256_DIGEST_SIZE] __aligned(sizeof(u32)); 85 - size_t digcnt; 76 + u8 digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32)); 77 + u64 digcnt[2]; 86 78 size_t bufcnt; 87 79 size_t buflen; 88 80 dma_addr_t dma_addr; ··· 91 83 struct scatterlist *sg; 92 84 unsigned int offset; /* offset in current sg */ 93 85 unsigned int total; /* total request */ 86 + 87 + size_t block_size; 94 88 95 89 u8 buffer[0] __aligned(sizeof(u32)); 96 90 }; ··· 107 97 108 98 }; 109 99 110 - #define ATMEL_SHA_QUEUE_LENGTH 1 100 + #define ATMEL_SHA_QUEUE_LENGTH 50 101 + 102 + struct atmel_sha_dma { 103 + struct dma_chan *chan; 104 + struct dma_slave_config dma_conf; 105 + }; 111 106 112 107 struct atmel_sha_dev { 113 108 struct list_head list; ··· 129 114 unsigned long flags; 130 115 struct crypto_queue queue; 131 116 struct ahash_request *req; 117 + 118 + struct atmel_sha_dma dma_lch_in; 119 + 120 + struct atmel_sha_caps caps; 121 + 122 + u32 hw_version; 132 123 }; 133 124 134 125 struct atmel_sha_drv { ··· 156 135 u32 offset, u32 value) 157 136 { 158 137 writel_relaxed(value, dd->io_base + offset); 159 - } 160 - 161 - static void atmel_sha_dualbuff_test(struct atmel_sha_dev *dd) 162 - { 163 - atmel_sha_write(dd, SHA_MR, SHA_MR_DUALBUFF); 164 - 165 - if (atmel_sha_read(dd, SHA_MR) & SHA_MR_DUALBUFF) 166 - dd->flags |= SHA_FLAGS_DUALBUFF; 167 138 } 168 139 169 140 static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx) ··· 189 176 } 190 177 191 178 /* 192 - * The purpose of this padding is to ensure that the padded message 193 - * is a multiple of 512 bits. The bit "1" is appended at the end of 194 - * the message followed by "padlen-1" zero bits. Then a 64 bits block 195 - * equals to the message length in bits is appended. 179 + * The purpose of this padding is to ensure that the padded message is a 180 + * multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512). 181 + * The bit "1" is appended at the end of the message followed by 182 + * "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or 183 + * 128 bits block (SHA384/SHA512) equals to the message length in bits 184 + * is appended. 
196 185 * 197 - * padlen is calculated as followed: 186 + * For SHA1/SHA224/SHA256, padlen is calculated as followed: 198 187 * - if message length < 56 bytes then padlen = 56 - message length 199 188 * - else padlen = 64 + 56 - message length 189 + * 190 + * For SHA384/SHA512, padlen is calculated as followed: 191 + * - if message length < 112 bytes then padlen = 112 - message length 192 + * - else padlen = 128 + 112 - message length 200 193 */ 201 194 static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length) 202 195 { 203 196 unsigned int index, padlen; 204 - u64 bits; 205 - u64 size; 197 + u64 bits[2]; 198 + u64 size[2]; 206 199 207 - bits = (ctx->bufcnt + ctx->digcnt + length) << 3; 208 - size = cpu_to_be64(bits); 200 + size[0] = ctx->digcnt[0]; 201 + size[1] = ctx->digcnt[1]; 209 202 210 - index = ctx->bufcnt & 0x3f; 211 - padlen = (index < 56) ? (56 - index) : ((64+56) - index); 212 - *(ctx->buffer + ctx->bufcnt) = 0x80; 213 - memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1); 214 - memcpy(ctx->buffer + ctx->bufcnt + padlen, &size, 8); 215 - ctx->bufcnt += padlen + 8; 216 - ctx->flags |= SHA_FLAGS_PAD; 203 + size[0] += ctx->bufcnt; 204 + if (size[0] < ctx->bufcnt) 205 + size[1]++; 206 + 207 + size[0] += length; 208 + if (size[0] < length) 209 + size[1]++; 210 + 211 + bits[1] = cpu_to_be64(size[0] << 3); 212 + bits[0] = cpu_to_be64(size[1] << 3 | size[0] >> 61); 213 + 214 + if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) { 215 + index = ctx->bufcnt & 0x7f; 216 + padlen = (index < 112) ? (112 - index) : ((128+112) - index); 217 + *(ctx->buffer + ctx->bufcnt) = 0x80; 218 + memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1); 219 + memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16); 220 + ctx->bufcnt += padlen + 16; 221 + ctx->flags |= SHA_FLAGS_PAD; 222 + } else { 223 + index = ctx->bufcnt & 0x3f; 224 + padlen = (index < 56) ? 
(56 - index) : ((64+56) - index); 225 + *(ctx->buffer + ctx->bufcnt) = 0x80; 226 + memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1); 227 + memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8); 228 + ctx->bufcnt += padlen + 8; 229 + ctx->flags |= SHA_FLAGS_PAD; 230 + } 217 231 } 218 232 219 233 static int atmel_sha_init(struct ahash_request *req) ··· 271 231 dev_dbg(dd->dev, "init: digest size: %d\n", 272 232 crypto_ahash_digestsize(tfm)); 273 233 274 - if (crypto_ahash_digestsize(tfm) == SHA1_DIGEST_SIZE) 234 + switch (crypto_ahash_digestsize(tfm)) { 235 + case SHA1_DIGEST_SIZE: 275 236 ctx->flags |= SHA_FLAGS_SHA1; 276 - else if (crypto_ahash_digestsize(tfm) == SHA256_DIGEST_SIZE) 237 + ctx->block_size = SHA1_BLOCK_SIZE; 238 + break; 239 + case SHA224_DIGEST_SIZE: 240 + ctx->flags |= SHA_FLAGS_SHA224; 241 + ctx->block_size = SHA224_BLOCK_SIZE; 242 + break; 243 + case SHA256_DIGEST_SIZE: 277 244 ctx->flags |= SHA_FLAGS_SHA256; 245 + ctx->block_size = SHA256_BLOCK_SIZE; 246 + break; 247 + case SHA384_DIGEST_SIZE: 248 + ctx->flags |= SHA_FLAGS_SHA384; 249 + ctx->block_size = SHA384_BLOCK_SIZE; 250 + break; 251 + case SHA512_DIGEST_SIZE: 252 + ctx->flags |= SHA_FLAGS_SHA512; 253 + ctx->block_size = SHA512_BLOCK_SIZE; 254 + break; 255 + default: 256 + return -EINVAL; 257 + break; 258 + } 278 259 279 260 ctx->bufcnt = 0; 280 - ctx->digcnt = 0; 261 + ctx->digcnt[0] = 0; 262 + ctx->digcnt[1] = 0; 281 263 ctx->buflen = SHA_BUFFER_LEN; 282 264 283 265 return 0; ··· 311 249 u32 valcr = 0, valmr = SHA_MR_MODE_AUTO; 312 250 313 251 if (likely(dma)) { 314 - atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE); 252 + if (!dd->caps.has_dma) 253 + atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE); 315 254 valmr = SHA_MR_MODE_PDC; 316 - if (dd->flags & SHA_FLAGS_DUALBUFF) 317 - valmr = SHA_MR_DUALBUFF; 255 + if (dd->caps.has_dualbuff) 256 + valmr |= SHA_MR_DUALBUFF; 318 257 } else { 319 258 atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY); 320 259 } 321 260 322 - if (ctx->flags & SHA_FLAGS_SHA256) 261 + if (ctx->flags & SHA_FLAGS_SHA1) 262 + valmr |= SHA_MR_ALGO_SHA1; 263 + else if (ctx->flags & SHA_FLAGS_SHA224) 264 + valmr |= SHA_MR_ALGO_SHA224; 265 + else if (ctx->flags & SHA_FLAGS_SHA256) 323 266 valmr |= SHA_MR_ALGO_SHA256; 267 + else if (ctx->flags & SHA_FLAGS_SHA384) 268 + valmr |= SHA_MR_ALGO_SHA384; 269 + else if (ctx->flags & SHA_FLAGS_SHA512) 270 + valmr |= SHA_MR_ALGO_SHA512; 324 271 325 272 /* Setting CR_FIRST only for the first iteration */ 326 - if (!ctx->digcnt) 273 + if (!(ctx->digcnt[0] || ctx->digcnt[1])) 327 274 valcr = SHA_CR_FIRST; 328 275 329 276 atmel_sha_write(dd, SHA_CR, valcr); ··· 346 275 int count, len32; 347 276 const u32 *buffer = (const u32 *)buf; 348 277 349 - dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n", 350 - ctx->digcnt, length, final); 278 + dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n", 279 + ctx->digcnt[1], ctx->digcnt[0], length, final); 351 280 352 281 atmel_sha_write_ctrl(dd, 0); 353 282 354 283 /* should be non-zero before next lines to disable clocks later */ 355 - ctx->digcnt += length; 284 + ctx->digcnt[0] += length; 285 + if (ctx->digcnt[0] < length) 286 + ctx->digcnt[1]++; 356 287 357 288 if (final) 358 289 dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */ ··· 375 302 struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req); 376 303 int len32; 377 304 378 - dev_dbg(dd->dev, "xmit_pdc: digcnt: %d, length: %d, final: %d\n", 379 - ctx->digcnt, length1, final); 305 + dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, 
length: %d, final: %d\n", 306 + ctx->digcnt[1], ctx->digcnt[0], length1, final); 380 307 381 308 len32 = DIV_ROUND_UP(length1, sizeof(u32)); 382 309 atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTDIS); ··· 390 317 atmel_sha_write_ctrl(dd, 1); 391 318 392 319 /* should be non-zero before next lines to disable clocks later */ 393 - ctx->digcnt += length1; 320 + ctx->digcnt[0] += length1; 321 + if (ctx->digcnt[0] < length1) 322 + ctx->digcnt[1]++; 394 323 395 324 if (final) 396 325 dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */ ··· 405 330 return -EINPROGRESS; 406 331 } 407 332 333 + static void atmel_sha_dma_callback(void *data) 334 + { 335 + struct atmel_sha_dev *dd = data; 336 + 337 + /* dma_lch_in - completed - wait DATRDY */ 338 + atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY); 339 + } 340 + 341 + static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1, 342 + size_t length1, dma_addr_t dma_addr2, size_t length2, int final) 343 + { 344 + struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req); 345 + struct dma_async_tx_descriptor *in_desc; 346 + struct scatterlist sg[2]; 347 + 348 + dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n", 349 + ctx->digcnt[1], ctx->digcnt[0], length1, final); 350 + 351 + if (ctx->flags & (SHA_FLAGS_SHA1 | SHA_FLAGS_SHA224 | 352 + SHA_FLAGS_SHA256)) { 353 + dd->dma_lch_in.dma_conf.src_maxburst = 16; 354 + dd->dma_lch_in.dma_conf.dst_maxburst = 16; 355 + } else { 356 + dd->dma_lch_in.dma_conf.src_maxburst = 32; 357 + dd->dma_lch_in.dma_conf.dst_maxburst = 32; 358 + } 359 + 360 + dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf); 361 + 362 + if (length2) { 363 + sg_init_table(sg, 2); 364 + sg_dma_address(&sg[0]) = dma_addr1; 365 + sg_dma_len(&sg[0]) = length1; 366 + sg_dma_address(&sg[1]) = dma_addr2; 367 + sg_dma_len(&sg[1]) = length2; 368 + in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 2, 369 + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 370 + } else { 371 + sg_init_table(sg, 1); 372 + sg_dma_address(&sg[0]) = dma_addr1; 373 + sg_dma_len(&sg[0]) = length1; 374 + in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 1, 375 + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 376 + } 377 + if (!in_desc) 378 + return -EINVAL; 379 + 380 + in_desc->callback = atmel_sha_dma_callback; 381 + in_desc->callback_param = dd; 382 + 383 + atmel_sha_write_ctrl(dd, 1); 384 + 385 + /* should be non-zero before next lines to disable clocks later */ 386 + ctx->digcnt[0] += length1; 387 + if (ctx->digcnt[0] < length1) 388 + ctx->digcnt[1]++; 389 + 390 + if (final) 391 + dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */ 392 + 393 + dd->flags |= SHA_FLAGS_DMA_ACTIVE; 394 + 395 + /* Start DMA transfer */ 396 + dmaengine_submit(in_desc); 397 + dma_async_issue_pending(dd->dma_lch_in.chan); 398 + 399 + return -EINPROGRESS; 400 + } 401 + 402 + static int atmel_sha_xmit_start(struct atmel_sha_dev *dd, dma_addr_t dma_addr1, 403 + size_t length1, dma_addr_t dma_addr2, size_t length2, int final) 404 + { 405 + if (dd->caps.has_dma) 406 + return atmel_sha_xmit_dma(dd, dma_addr1, length1, 407 + dma_addr2, length2, final); 408 + else 409 + return atmel_sha_xmit_pdc(dd, dma_addr1, length1, 410 + dma_addr2, length2, final); 411 + } 412 + 408 413 static int atmel_sha_update_cpu(struct atmel_sha_dev *dd) 409 414 { 410 415 struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req); ··· 492 337 493 338 atmel_sha_append_sg(ctx); 494 339 atmel_sha_fill_padding(ctx, 0); 495 - 496 340 bufcnt = 
ctx->bufcnt; 497 341 ctx->bufcnt = 0; 498 342 ··· 503 349 size_t length, int final) 504 350 { 505 351 ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer, 506 - ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE); 352 + ctx->buflen + ctx->block_size, DMA_TO_DEVICE); 507 353 if (dma_mapping_error(dd->dev, ctx->dma_addr)) { 508 354 dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen + 509 - SHA1_BLOCK_SIZE); 355 + ctx->block_size); 510 356 return -EINVAL; 511 357 } 512 358 513 359 ctx->flags &= ~SHA_FLAGS_SG; 514 360 515 361 /* next call does not fail... so no unmap in the case of error */ 516 - return atmel_sha_xmit_pdc(dd, ctx->dma_addr, length, 0, 0, final); 362 + return atmel_sha_xmit_start(dd, ctx->dma_addr, length, 0, 0, final); 517 363 } 518 364 519 365 static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd) ··· 526 372 527 373 final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total; 528 374 529 - dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n", 530 - ctx->bufcnt, ctx->digcnt, final); 375 + dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n", 376 + ctx->bufcnt, ctx->digcnt[1], ctx->digcnt[0], final); 531 377 532 378 if (final) 533 379 atmel_sha_fill_padding(ctx, 0); ··· 554 400 if (ctx->bufcnt || ctx->offset) 555 401 return atmel_sha_update_dma_slow(dd); 556 402 557 - dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n", 558 - ctx->digcnt, ctx->bufcnt, ctx->total); 403 + dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n", 404 + ctx->digcnt[1], ctx->digcnt[0], ctx->bufcnt, ctx->total); 559 405 560 406 sg = ctx->sg; 561 407 562 408 if (!IS_ALIGNED(sg->offset, sizeof(u32))) 563 409 return atmel_sha_update_dma_slow(dd); 564 410 565 - if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, SHA1_BLOCK_SIZE)) 566 - /* size is not SHA1_BLOCK_SIZE aligned */ 411 + if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->block_size)) 412 + /* size is not ctx->block_size aligned */ 567 413 return atmel_sha_update_dma_slow(dd); 568 414 569 415 length = min(ctx->total, sg->length); 570 416 571 417 if (sg_is_last(sg)) { 572 418 if (!(ctx->flags & SHA_FLAGS_FINUP)) { 573 - /* not last sg must be SHA1_BLOCK_SIZE aligned */ 574 - tail = length & (SHA1_BLOCK_SIZE - 1); 419 + /* not last sg must be ctx->block_size aligned */ 420 + tail = length & (ctx->block_size - 1); 575 421 length -= tail; 576 - if (length == 0) { 577 - /* offset where to start slow */ 578 - ctx->offset = length; 579 - return atmel_sha_update_dma_slow(dd); 580 - } 581 422 } 582 423 } 583 424 ··· 583 434 584 435 /* Add padding */ 585 436 if (final) { 586 - tail = length & (SHA1_BLOCK_SIZE - 1); 437 + tail = length & (ctx->block_size - 1); 587 438 length -= tail; 588 439 ctx->total += tail; 589 440 ctx->offset = length; /* offset where to start slow */ ··· 594 445 atmel_sha_fill_padding(ctx, length); 595 446 596 447 ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer, 597 - ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE); 448 + ctx->buflen + ctx->block_size, DMA_TO_DEVICE); 598 449 if (dma_mapping_error(dd->dev, ctx->dma_addr)) { 599 450 dev_err(dd->dev, "dma %u bytes error\n", 600 - ctx->buflen + SHA1_BLOCK_SIZE); 451 + ctx->buflen + ctx->block_size); 601 452 return -EINVAL; 602 453 } 603 454 ··· 605 456 ctx->flags &= ~SHA_FLAGS_SG; 606 457 count = ctx->bufcnt; 607 458 ctx->bufcnt = 0; 608 - return atmel_sha_xmit_pdc(dd, ctx->dma_addr, count, 0, 459 + return atmel_sha_xmit_start(dd, ctx->dma_addr, count, 0, 609 460 0, final); 610 461 } else { 611 462 ctx->sg = sg; ··· 619 470 620 471 count 
= ctx->bufcnt; 621 472 ctx->bufcnt = 0; 622 - return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg), 473 + return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg), 623 474 length, ctx->dma_addr, count, final); 624 475 } 625 476 } ··· 632 483 ctx->flags |= SHA_FLAGS_SG; 633 484 634 485 /* next call does not fail... so no unmap in the case of error */ 635 - return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg), length, 0, 486 + return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg), length, 0, 636 487 0, final); 637 488 } 638 489 ··· 647 498 if (ctx->sg) 648 499 ctx->offset = 0; 649 500 } 650 - if (ctx->flags & SHA_FLAGS_PAD) 501 + if (ctx->flags & SHA_FLAGS_PAD) { 651 502 dma_unmap_single(dd->dev, ctx->dma_addr, 652 - ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE); 503 + ctx->buflen + ctx->block_size, DMA_TO_DEVICE); 504 + } 653 505 } else { 654 506 dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen + 655 - SHA1_BLOCK_SIZE, DMA_TO_DEVICE); 507 + ctx->block_size, DMA_TO_DEVICE); 656 508 } 657 509 658 510 return 0; ··· 665 515 struct atmel_sha_reqctx *ctx = ahash_request_ctx(req); 666 516 int err; 667 517 668 - dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n", 669 - ctx->total, ctx->digcnt, (ctx->flags & SHA_FLAGS_FINUP) != 0); 518 + dev_dbg(dd->dev, "update_req: total: %u, digcnt: 0x%llx 0x%llx\n", 519 + ctx->total, ctx->digcnt[1], ctx->digcnt[0]); 670 520 671 521 if (ctx->flags & SHA_FLAGS_CPU) 672 522 err = atmel_sha_update_cpu(dd); ··· 674 524 err = atmel_sha_update_dma_start(dd); 675 525 676 526 /* wait for dma completion before can take more data */ 677 - dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n", 678 - err, ctx->digcnt); 527 + dev_dbg(dd->dev, "update: err: %d, digcnt: 0x%llx 0%llx\n", 528 + err, ctx->digcnt[1], ctx->digcnt[0]); 679 529 680 530 return err; 681 531 } ··· 712 562 u32 *hash = (u32 *)ctx->digest; 713 563 int i; 714 564 715 - if (likely(ctx->flags & SHA_FLAGS_SHA1)) 565 + if (ctx->flags & SHA_FLAGS_SHA1) 716 566 for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++) 717 567 hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i)); 718 - else 568 + else if (ctx->flags & SHA_FLAGS_SHA224) 569 + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(u32); i++) 570 + hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i)); 571 + else if (ctx->flags & SHA_FLAGS_SHA256) 719 572 for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(u32); i++) 573 + hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i)); 574 + else if (ctx->flags & SHA_FLAGS_SHA384) 575 + for (i = 0; i < SHA384_DIGEST_SIZE / sizeof(u32); i++) 576 + hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i)); 577 + else 578 + for (i = 0; i < SHA512_DIGEST_SIZE / sizeof(u32); i++) 720 579 hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i)); 721 580 } 722 581 ··· 736 577 if (!req->result) 737 578 return; 738 579 739 - if (likely(ctx->flags & SHA_FLAGS_SHA1)) 580 + if (ctx->flags & SHA_FLAGS_SHA1) 740 581 memcpy(req->result, ctx->digest, SHA1_DIGEST_SIZE); 741 - else 582 + else if (ctx->flags & SHA_FLAGS_SHA224) 583 + memcpy(req->result, ctx->digest, SHA224_DIGEST_SIZE); 584 + else if (ctx->flags & SHA_FLAGS_SHA256) 742 585 memcpy(req->result, ctx->digest, SHA256_DIGEST_SIZE); 586 + else if (ctx->flags & SHA_FLAGS_SHA384) 587 + memcpy(req->result, ctx->digest, SHA384_DIGEST_SIZE); 588 + else 589 + memcpy(req->result, ctx->digest, SHA512_DIGEST_SIZE); 743 590 } 744 591 745 592 static int atmel_sha_finish(struct ahash_request *req) ··· 754 589 struct atmel_sha_dev *dd = ctx->dd; 755 590 int err = 0; 756 591 757 - if (ctx->digcnt) 
592 + if (ctx->digcnt[0] || ctx->digcnt[1]) 758 593 atmel_sha_copy_ready_hash(req); 759 594 760 - dev_dbg(dd->dev, "digcnt: %d, bufcnt: %d\n", ctx->digcnt, 761 - ctx->bufcnt); 595 + dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1], 596 + ctx->digcnt[0], ctx->bufcnt); 762 597 763 598 return err; 764 599 } ··· 793 628 { 794 629 clk_prepare_enable(dd->iclk); 795 630 796 - if (SHA_FLAGS_INIT & dd->flags) { 631 + if (!(SHA_FLAGS_INIT & dd->flags)) { 797 632 atmel_sha_write(dd, SHA_CR, SHA_CR_SWRST); 798 - atmel_sha_dualbuff_test(dd); 799 633 dd->flags |= SHA_FLAGS_INIT; 800 634 dd->err = 0; 801 635 } 802 636 803 637 return 0; 638 + } 639 + 640 + static inline unsigned int atmel_sha_get_version(struct atmel_sha_dev *dd) 641 + { 642 + return atmel_sha_read(dd, SHA_HW_VERSION) & 0x00000fff; 643 + } 644 + 645 + static void atmel_sha_hw_version_init(struct atmel_sha_dev *dd) 646 + { 647 + atmel_sha_hw_init(dd); 648 + 649 + dd->hw_version = atmel_sha_get_version(dd); 650 + 651 + dev_info(dd->dev, 652 + "version: 0x%x\n", dd->hw_version); 653 + 654 + clk_disable_unprepare(dd->iclk); 804 655 } 805 656 806 657 static int atmel_sha_handle_queue(struct atmel_sha_dev *dd, ··· 863 682 864 683 if (ctx->op == SHA_OP_UPDATE) { 865 684 err = atmel_sha_update_req(dd); 866 - if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP)) { 685 + if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP)) 867 686 /* no final() after finup() */ 868 687 err = atmel_sha_final_req(dd); 869 - } 870 688 } else if (ctx->op == SHA_OP_FINAL) { 871 689 err = atmel_sha_final_req(dd); 872 690 } ··· 988 808 } 989 809 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), 990 810 sizeof(struct atmel_sha_reqctx) + 991 - SHA_BUFFER_LEN + SHA256_BLOCK_SIZE); 811 + SHA_BUFFER_LEN + SHA512_BLOCK_SIZE); 992 812 993 813 return 0; 994 814 } ··· 1006 826 tctx->fallback = NULL; 1007 827 } 1008 828 1009 - static struct ahash_alg sha_algs[] = { 829 + static struct ahash_alg sha_1_256_algs[] = { 1010 830 { 1011 831 .init = atmel_sha_init, 1012 832 .update = atmel_sha_update, ··· 1047 867 .cra_blocksize = SHA256_BLOCK_SIZE, 1048 868 .cra_ctxsize = sizeof(struct atmel_sha_ctx), 1049 869 .cra_alignmask = 0, 870 + .cra_module = THIS_MODULE, 871 + .cra_init = atmel_sha_cra_init, 872 + .cra_exit = atmel_sha_cra_exit, 873 + } 874 + } 875 + }, 876 + }; 877 + 878 + static struct ahash_alg sha_224_alg = { 879 + .init = atmel_sha_init, 880 + .update = atmel_sha_update, 881 + .final = atmel_sha_final, 882 + .finup = atmel_sha_finup, 883 + .digest = atmel_sha_digest, 884 + .halg = { 885 + .digestsize = SHA224_DIGEST_SIZE, 886 + .base = { 887 + .cra_name = "sha224", 888 + .cra_driver_name = "atmel-sha224", 889 + .cra_priority = 100, 890 + .cra_flags = CRYPTO_ALG_ASYNC | 891 + CRYPTO_ALG_NEED_FALLBACK, 892 + .cra_blocksize = SHA224_BLOCK_SIZE, 893 + .cra_ctxsize = sizeof(struct atmel_sha_ctx), 894 + .cra_alignmask = 0, 895 + .cra_module = THIS_MODULE, 896 + .cra_init = atmel_sha_cra_init, 897 + .cra_exit = atmel_sha_cra_exit, 898 + } 899 + } 900 + }; 901 + 902 + static struct ahash_alg sha_384_512_algs[] = { 903 + { 904 + .init = atmel_sha_init, 905 + .update = atmel_sha_update, 906 + .final = atmel_sha_final, 907 + .finup = atmel_sha_finup, 908 + .digest = atmel_sha_digest, 909 + .halg = { 910 + .digestsize = SHA384_DIGEST_SIZE, 911 + .base = { 912 + .cra_name = "sha384", 913 + .cra_driver_name = "atmel-sha384", 914 + .cra_priority = 100, 915 + .cra_flags = CRYPTO_ALG_ASYNC | 916 + CRYPTO_ALG_NEED_FALLBACK, 917 + .cra_blocksize = 
SHA384_BLOCK_SIZE, 918 + .cra_ctxsize = sizeof(struct atmel_sha_ctx), 919 + .cra_alignmask = 0x3, 920 + .cra_module = THIS_MODULE, 921 + .cra_init = atmel_sha_cra_init, 922 + .cra_exit = atmel_sha_cra_exit, 923 + } 924 + } 925 + }, 926 + { 927 + .init = atmel_sha_init, 928 + .update = atmel_sha_update, 929 + .final = atmel_sha_final, 930 + .finup = atmel_sha_finup, 931 + .digest = atmel_sha_digest, 932 + .halg = { 933 + .digestsize = SHA512_DIGEST_SIZE, 934 + .base = { 935 + .cra_name = "sha512", 936 + .cra_driver_name = "atmel-sha512", 937 + .cra_priority = 100, 938 + .cra_flags = CRYPTO_ALG_ASYNC | 939 + CRYPTO_ALG_NEED_FALLBACK, 940 + .cra_blocksize = SHA512_BLOCK_SIZE, 941 + .cra_ctxsize = sizeof(struct atmel_sha_ctx), 942 + .cra_alignmask = 0x3, 1050 943 .cra_module = THIS_MODULE, 1051 944 .cra_init = atmel_sha_cra_init, 1052 945 .cra_exit = atmel_sha_cra_exit, ··· 1194 941 { 1195 942 int i; 1196 943 1197 - for (i = 0; i < ARRAY_SIZE(sha_algs); i++) 1198 - crypto_unregister_ahash(&sha_algs[i]); 944 + for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++) 945 + crypto_unregister_ahash(&sha_1_256_algs[i]); 946 + 947 + if (dd->caps.has_sha224) 948 + crypto_unregister_ahash(&sha_224_alg); 949 + 950 + if (dd->caps.has_sha_384_512) { 951 + for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++) 952 + crypto_unregister_ahash(&sha_384_512_algs[i]); 953 + } 1199 954 } 1200 955 1201 956 static int atmel_sha_register_algs(struct atmel_sha_dev *dd) 1202 957 { 1203 958 int err, i, j; 1204 959 1205 - for (i = 0; i < ARRAY_SIZE(sha_algs); i++) { 1206 - err = crypto_register_ahash(&sha_algs[i]); 960 + for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++) { 961 + err = crypto_register_ahash(&sha_1_256_algs[i]); 1207 962 if (err) 1208 - goto err_sha_algs; 963 + goto err_sha_1_256_algs; 964 + } 965 + 966 + if (dd->caps.has_sha224) { 967 + err = crypto_register_ahash(&sha_224_alg); 968 + if (err) 969 + goto err_sha_224_algs; 970 + } 971 + 972 + if (dd->caps.has_sha_384_512) { 973 + for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++) { 974 + err = crypto_register_ahash(&sha_384_512_algs[i]); 975 + if (err) 976 + goto err_sha_384_512_algs; 977 + } 1209 978 } 1210 979 1211 980 return 0; 1212 981 1213 - err_sha_algs: 982 + err_sha_384_512_algs: 1214 983 for (j = 0; j < i; j++) 1215 - crypto_unregister_ahash(&sha_algs[j]); 984 + crypto_unregister_ahash(&sha_384_512_algs[j]); 985 + crypto_unregister_ahash(&sha_224_alg); 986 + err_sha_224_algs: 987 + i = ARRAY_SIZE(sha_1_256_algs); 988 + err_sha_1_256_algs: 989 + for (j = 0; j < i; j++) 990 + crypto_unregister_ahash(&sha_1_256_algs[j]); 1216 991 1217 992 return err; 993 + } 994 + 995 + static bool atmel_sha_filter(struct dma_chan *chan, void *slave) 996 + { 997 + struct at_dma_slave *sl = slave; 998 + 999 + if (sl && sl->dma_dev == chan->device->dev) { 1000 + chan->private = sl; 1001 + return true; 1002 + } else { 1003 + return false; 1004 + } 1005 + } 1006 + 1007 + static int atmel_sha_dma_init(struct atmel_sha_dev *dd, 1008 + struct crypto_platform_data *pdata) 1009 + { 1010 + int err = -ENOMEM; 1011 + dma_cap_mask_t mask_in; 1012 + 1013 + if (pdata && pdata->dma_slave->rxdata.dma_dev) { 1014 + /* Try to grab DMA channel */ 1015 + dma_cap_zero(mask_in); 1016 + dma_cap_set(DMA_SLAVE, mask_in); 1017 + 1018 + dd->dma_lch_in.chan = dma_request_channel(mask_in, 1019 + atmel_sha_filter, &pdata->dma_slave->rxdata); 1020 + 1021 + if (!dd->dma_lch_in.chan) 1022 + return err; 1023 + 1024 + dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV; 1025 + dd->dma_lch_in.dma_conf.dst_addr = 
dd->phys_base + 1026 + SHA_REG_DIN(0); 1027 + dd->dma_lch_in.dma_conf.src_maxburst = 1; 1028 + dd->dma_lch_in.dma_conf.src_addr_width = 1029 + DMA_SLAVE_BUSWIDTH_4_BYTES; 1030 + dd->dma_lch_in.dma_conf.dst_maxburst = 1; 1031 + dd->dma_lch_in.dma_conf.dst_addr_width = 1032 + DMA_SLAVE_BUSWIDTH_4_BYTES; 1033 + dd->dma_lch_in.dma_conf.device_fc = false; 1034 + 1035 + return 0; 1036 + } 1037 + 1038 + return -ENODEV; 1039 + } 1040 + 1041 + static void atmel_sha_dma_cleanup(struct atmel_sha_dev *dd) 1042 + { 1043 + dma_release_channel(dd->dma_lch_in.chan); 1044 + } 1045 + 1046 + static void atmel_sha_get_cap(struct atmel_sha_dev *dd) 1047 + { 1048 + 1049 + dd->caps.has_dma = 0; 1050 + dd->caps.has_dualbuff = 0; 1051 + dd->caps.has_sha224 = 0; 1052 + dd->caps.has_sha_384_512 = 0; 1053 + 1054 + /* keep only major version number */ 1055 + switch (dd->hw_version & 0xff0) { 1056 + case 0x410: 1057 + dd->caps.has_dma = 1; 1058 + dd->caps.has_dualbuff = 1; 1059 + dd->caps.has_sha224 = 1; 1060 + dd->caps.has_sha_384_512 = 1; 1061 + break; 1062 + case 0x400: 1063 + dd->caps.has_dma = 1; 1064 + dd->caps.has_dualbuff = 1; 1065 + dd->caps.has_sha224 = 1; 1066 + break; 1067 + case 0x320: 1068 + break; 1069 + default: 1070 + dev_warn(dd->dev, 1071 + "Unmanaged sha version, set minimum capabilities\n"); 1072 + break; 1073 + } 1218 1074 } 1219 1075 1220 1076 static int atmel_sha_probe(struct platform_device *pdev) 1221 1077 { 1222 1078 struct atmel_sha_dev *sha_dd; 1079 + struct crypto_platform_data *pdata; 1223 1080 struct device *dev = &pdev->dev; 1224 1081 struct resource *sha_res; 1225 1082 unsigned long sha_phys_size; ··· 1381 1018 } 1382 1019 1383 1020 /* Initializing the clock */ 1384 - sha_dd->iclk = clk_get(&pdev->dev, NULL); 1021 + sha_dd->iclk = clk_get(&pdev->dev, "sha_clk"); 1385 1022 if (IS_ERR(sha_dd->iclk)) { 1386 1023 dev_err(dev, "clock intialization failed.\n"); 1387 1024 err = PTR_ERR(sha_dd->iclk); ··· 1393 1030 dev_err(dev, "can't ioremap\n"); 1394 1031 err = -ENOMEM; 1395 1032 goto sha_io_err; 1033 + } 1034 + 1035 + atmel_sha_hw_version_init(sha_dd); 1036 + 1037 + atmel_sha_get_cap(sha_dd); 1038 + 1039 + if (sha_dd->caps.has_dma) { 1040 + pdata = pdev->dev.platform_data; 1041 + if (!pdata) { 1042 + dev_err(&pdev->dev, "platform data not available\n"); 1043 + err = -ENXIO; 1044 + goto err_pdata; 1045 + } 1046 + err = atmel_sha_dma_init(sha_dd, pdata); 1047 + if (err) 1048 + goto err_sha_dma; 1396 1049 } 1397 1050 1398 1051 spin_lock(&atmel_sha.lock); ··· 1427 1048 spin_lock(&atmel_sha.lock); 1428 1049 list_del(&sha_dd->list); 1429 1050 spin_unlock(&atmel_sha.lock); 1051 + if (sha_dd->caps.has_dma) 1052 + atmel_sha_dma_cleanup(sha_dd); 1053 + err_sha_dma: 1054 + err_pdata: 1430 1055 iounmap(sha_dd->io_base); 1431 1056 sha_io_err: 1432 1057 clk_put(sha_dd->iclk); ··· 1461 1078 1462 1079 tasklet_kill(&sha_dd->done_task); 1463 1080 1081 + if (sha_dd->caps.has_dma) 1082 + atmel_sha_dma_cleanup(sha_dd); 1083 + 1464 1084 iounmap(sha_dd->io_base); 1465 1085 1466 1086 clk_put(sha_dd->iclk); ··· 1488 1102 1489 1103 module_platform_driver(atmel_sha_driver); 1490 1104 1491 - MODULE_DESCRIPTION("Atmel SHA1/SHA256 hw acceleration support."); 1105 + MODULE_DESCRIPTION("Atmel SHA (1/256/224/384/512) hw acceleration support."); 1492 1106 MODULE_LICENSE("GPL v2"); 1493 1107 MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");
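Note on the atmel-sha changes above: both atmel_sha_copy_hash() and atmel_sha_copy_ready_hash() now select the digest length from the same SHA_FLAGS_* bits. A minimal sketch of a helper that would collapse the two if/else chains (the flag and size macros are real; the helper itself is hypothetical, not part of the patch):

    static unsigned int atmel_sha_digest_size(unsigned long flags)
    {
        /* mirrors the flag order used by the copy paths above */
        if (flags & SHA_FLAGS_SHA1)
            return SHA1_DIGEST_SIZE;
        if (flags & SHA_FLAGS_SHA224)
            return SHA224_DIGEST_SIZE;
        if (flags & SHA_FLAGS_SHA256)
            return SHA256_DIGEST_SIZE;
        if (flags & SHA_FLAGS_SHA384)
            return SHA384_DIGEST_SIZE;
        return SHA512_DIGEST_SIZE;    /* only SHA-512 remains */
    }

With such a helper each copy path reduces to one loop or memcpy sized by the return value; the request context is already sized for the largest case via the SHA512_BLOCK_SIZE change above.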
+2
drivers/crypto/atmel-tdes-regs.h
··· 69 69 #define TDES_XTEARNDR_XTEA_RNDS_MASK (0x3F << 0) 70 70 #define TDES_XTEARNDR_XTEA_RNDS_OFFSET 0 71 71 72 + #define TDES_HW_VERSION 0xFC 73 + 72 74 #define TDES_RPR 0x100 73 75 #define TDES_RCR 0x104 74 76 #define TDES_TPR 0x108
+342 -54
drivers/crypto/atmel-tdes.c
··· 38 38 #include <crypto/des.h> 39 39 #include <crypto/hash.h> 40 40 #include <crypto/internal/hash.h> 41 + #include <linux/platform_data/crypto-atmel.h> 41 42 #include "atmel-tdes-regs.h" 42 43 43 44 /* TDES flags */ 44 - #define TDES_FLAGS_MODE_MASK 0x007f 45 + #define TDES_FLAGS_MODE_MASK 0x00ff 45 46 #define TDES_FLAGS_ENCRYPT BIT(0) 46 47 #define TDES_FLAGS_CBC BIT(1) 47 48 #define TDES_FLAGS_CFB BIT(2) 48 49 #define TDES_FLAGS_CFB8 BIT(3) 49 50 #define TDES_FLAGS_CFB16 BIT(4) 50 51 #define TDES_FLAGS_CFB32 BIT(5) 51 - #define TDES_FLAGS_OFB BIT(6) 52 + #define TDES_FLAGS_CFB64 BIT(6) 53 + #define TDES_FLAGS_OFB BIT(7) 52 54 53 55 #define TDES_FLAGS_INIT BIT(16) 54 56 #define TDES_FLAGS_FAST BIT(17) 55 57 #define TDES_FLAGS_BUSY BIT(18) 58 + #define TDES_FLAGS_DMA BIT(19) 56 59 57 - #define ATMEL_TDES_QUEUE_LENGTH 1 60 + #define ATMEL_TDES_QUEUE_LENGTH 50 58 61 59 62 #define CFB8_BLOCK_SIZE 1 60 63 #define CFB16_BLOCK_SIZE 2 61 64 #define CFB32_BLOCK_SIZE 4 62 - #define CFB64_BLOCK_SIZE 8 63 65 66 + struct atmel_tdes_caps { 67 + bool has_dma; 68 + u32 has_cfb_3keys; 69 + }; 64 70 65 71 struct atmel_tdes_dev; 66 72 ··· 76 70 int keylen; 77 71 u32 key[3*DES_KEY_SIZE / sizeof(u32)]; 78 72 unsigned long flags; 73 + 74 + u16 block_size; 79 75 }; 80 76 81 77 struct atmel_tdes_reqctx { 82 78 unsigned long mode; 79 + }; 80 + 81 + struct atmel_tdes_dma { 82 + struct dma_chan *chan; 83 + struct dma_slave_config dma_conf; 83 84 }; 84 85 85 86 struct atmel_tdes_dev { ··· 112 99 size_t total; 113 100 114 101 struct scatterlist *in_sg; 102 + unsigned int nb_in_sg; 115 103 size_t in_offset; 116 104 struct scatterlist *out_sg; 105 + unsigned int nb_out_sg; 117 106 size_t out_offset; 118 107 119 108 size_t buflen; ··· 124 109 void *buf_in; 125 110 int dma_in; 126 111 dma_addr_t dma_addr_in; 112 + struct atmel_tdes_dma dma_lch_in; 127 113 128 114 void *buf_out; 129 115 int dma_out; 130 116 dma_addr_t dma_addr_out; 117 + struct atmel_tdes_dma dma_lch_out; 118 + 119 + struct atmel_tdes_caps caps; 120 + 121 + u32 hw_version; 131 122 }; 132 123 133 124 struct atmel_tdes_drv { ··· 228 207 return 0; 229 208 } 230 209 210 + static inline unsigned int atmel_tdes_get_version(struct atmel_tdes_dev *dd) 211 + { 212 + return atmel_tdes_read(dd, TDES_HW_VERSION) & 0x00000fff; 213 + } 214 + 215 + static void atmel_tdes_hw_version_init(struct atmel_tdes_dev *dd) 216 + { 217 + atmel_tdes_hw_init(dd); 218 + 219 + dd->hw_version = atmel_tdes_get_version(dd); 220 + 221 + dev_info(dd->dev, 222 + "version: 0x%x\n", dd->hw_version); 223 + 224 + clk_disable_unprepare(dd->iclk); 225 + } 226 + 227 + static void atmel_tdes_dma_callback(void *data) 228 + { 229 + struct atmel_tdes_dev *dd = data; 230 + 231 + /* dma_lch_out - completed */ 232 + tasklet_schedule(&dd->done_task); 233 + } 234 + 231 235 static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd) 232 236 { 233 237 int err; ··· 263 217 if (err) 264 218 return err; 265 219 266 - atmel_tdes_write(dd, TDES_PTCR, TDES_PTCR_TXTDIS|TDES_PTCR_RXTDIS); 220 + if (!dd->caps.has_dma) 221 + atmel_tdes_write(dd, TDES_PTCR, 222 + TDES_PTCR_TXTDIS | TDES_PTCR_RXTDIS); 267 223 268 224 /* MR register must be set before IV registers */ 269 225 if (dd->ctx->keylen > (DES_KEY_SIZE << 1)) { ··· 289 241 valmr |= TDES_MR_CFBS_16b; 290 242 else if (dd->flags & TDES_FLAGS_CFB32) 291 243 valmr |= TDES_MR_CFBS_32b; 244 + else if (dd->flags & TDES_FLAGS_CFB64) 245 + valmr |= TDES_MR_CFBS_64b; 292 246 } else if (dd->flags & TDES_FLAGS_OFB) { 293 247 valmr |= TDES_MR_OPMOD_OFB; 294 248 } ··· 312 
262 return 0; 313 263 } 314 264 315 - static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd) 265 + static int atmel_tdes_crypt_pdc_stop(struct atmel_tdes_dev *dd) 316 266 { 317 267 int err = 0; 318 268 size_t count; ··· 338 288 return err; 339 289 } 340 290 341 - static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd) 291 + static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd) 342 292 { 343 293 int err = -ENOMEM; 344 294 ··· 383 333 return err; 384 334 } 385 335 386 - static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd) 336 + static void atmel_tdes_buff_cleanup(struct atmel_tdes_dev *dd) 387 337 { 388 338 dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen, 389 339 DMA_FROM_DEVICE); ··· 393 343 free_page((unsigned long)dd->buf_in); 394 344 } 395 345 396 - static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in, 346 + static int atmel_tdes_crypt_pdc(struct crypto_tfm *tfm, dma_addr_t dma_addr_in, 397 347 dma_addr_t dma_addr_out, int length) 398 348 { 399 349 struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm); ··· 429 379 return 0; 430 380 } 431 381 432 - static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd) 382 + static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in, 383 + dma_addr_t dma_addr_out, int length) 384 + { 385 + struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm); 386 + struct atmel_tdes_dev *dd = ctx->dd; 387 + struct scatterlist sg[2]; 388 + struct dma_async_tx_descriptor *in_desc, *out_desc; 389 + 390 + dd->dma_size = length; 391 + 392 + if (!(dd->flags & TDES_FLAGS_FAST)) { 393 + dma_sync_single_for_device(dd->dev, dma_addr_in, length, 394 + DMA_TO_DEVICE); 395 + } 396 + 397 + if (dd->flags & TDES_FLAGS_CFB8) { 398 + dd->dma_lch_in.dma_conf.dst_addr_width = 399 + DMA_SLAVE_BUSWIDTH_1_BYTE; 400 + dd->dma_lch_out.dma_conf.src_addr_width = 401 + DMA_SLAVE_BUSWIDTH_1_BYTE; 402 + } else if (dd->flags & TDES_FLAGS_CFB16) { 403 + dd->dma_lch_in.dma_conf.dst_addr_width = 404 + DMA_SLAVE_BUSWIDTH_2_BYTES; 405 + dd->dma_lch_out.dma_conf.src_addr_width = 406 + DMA_SLAVE_BUSWIDTH_2_BYTES; 407 + } else { 408 + dd->dma_lch_in.dma_conf.dst_addr_width = 409 + DMA_SLAVE_BUSWIDTH_4_BYTES; 410 + dd->dma_lch_out.dma_conf.src_addr_width = 411 + DMA_SLAVE_BUSWIDTH_4_BYTES; 412 + } 413 + 414 + dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf); 415 + dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf); 416 + 417 + dd->flags |= TDES_FLAGS_DMA; 418 + 419 + sg_init_table(&sg[0], 1); 420 + sg_dma_address(&sg[0]) = dma_addr_in; 421 + sg_dma_len(&sg[0]) = length; 422 + 423 + sg_init_table(&sg[1], 1); 424 + sg_dma_address(&sg[1]) = dma_addr_out; 425 + sg_dma_len(&sg[1]) = length; 426 + 427 + in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0], 428 + 1, DMA_MEM_TO_DEV, 429 + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 430 + if (!in_desc) 431 + return -EINVAL; 432 + 433 + out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1], 434 + 1, DMA_DEV_TO_MEM, 435 + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); 436 + if (!out_desc) 437 + return -EINVAL; 438 + 439 + out_desc->callback = atmel_tdes_dma_callback; 440 + out_desc->callback_param = dd; 441 + 442 + dmaengine_submit(out_desc); 443 + dma_async_issue_pending(dd->dma_lch_out.chan); 444 + 445 + dmaengine_submit(in_desc); 446 + dma_async_issue_pending(dd->dma_lch_in.chan); 447 + 448 + return 0; 449 + } 450 + 451 + static int atmel_tdes_crypt_start(struct atmel_tdes_dev *dd) 433 452 { 434 453 struct crypto_tfm *tfm = crypto_ablkcipher_tfm( 435 
454 crypto_ablkcipher_reqtfm(dd->req)); ··· 506 387 size_t count; 507 388 dma_addr_t addr_in, addr_out; 508 389 509 - if (sg_is_last(dd->in_sg) && sg_is_last(dd->out_sg)) { 390 + if ((!dd->in_offset) && (!dd->out_offset)) { 510 391 /* check for alignment */ 511 - in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)); 512 - out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)); 513 - 392 + in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) && 393 + IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size); 394 + out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) && 395 + IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size); 514 396 fast = in && out; 397 + 398 + if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg)) 399 + fast = 0; 515 400 } 401 + 516 402 517 403 if (fast) { 518 404 count = min(dd->total, sg_dma_len(dd->in_sg)); 519 405 count = min(count, sg_dma_len(dd->out_sg)); 520 - 521 - if (count != dd->total) { 522 - pr_err("request length != buffer length\n"); 523 - return -EINVAL; 524 - } 525 406 526 407 err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 527 408 if (!err) { ··· 552 433 addr_out = dd->dma_addr_out; 553 434 554 435 dd->flags &= ~TDES_FLAGS_FAST; 555 - 556 436 } 557 437 558 438 dd->total -= count; 559 439 560 - err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count); 561 - if (err) { 440 + if (dd->caps.has_dma) 441 + err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count); 442 + else 443 + err = atmel_tdes_crypt_pdc(tfm, addr_in, addr_out, count); 444 + 445 + if (err && (dd->flags & TDES_FLAGS_FAST)) { 562 446 dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 563 447 dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE); 564 448 } 565 449 566 450 return err; 567 451 } 568 - 569 452 570 453 static void atmel_tdes_finish_req(struct atmel_tdes_dev *dd, int err) 571 454 { ··· 627 506 628 507 err = atmel_tdes_write_ctrl(dd); 629 508 if (!err) 630 - err = atmel_tdes_crypt_dma_start(dd); 509 + err = atmel_tdes_crypt_start(dd); 631 510 if (err) { 632 511 /* des_task will not finish it, so do it here */ 633 512 atmel_tdes_finish_req(dd, err); ··· 637 516 return ret; 638 517 } 639 518 519 + static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd) 520 + { 521 + int err = -EINVAL; 522 + size_t count; 523 + 524 + if (dd->flags & TDES_FLAGS_DMA) { 525 + err = 0; 526 + if (dd->flags & TDES_FLAGS_FAST) { 527 + dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE); 528 + dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE); 529 + } else { 530 + dma_sync_single_for_device(dd->dev, dd->dma_addr_out, 531 + dd->dma_size, DMA_FROM_DEVICE); 532 + 533 + /* copy data */ 534 + count = atmel_tdes_sg_copy(&dd->out_sg, &dd->out_offset, 535 + dd->buf_out, dd->buflen, dd->dma_size, 1); 536 + if (count != dd->dma_size) { 537 + err = -EINVAL; 538 + pr_err("not all data converted: %u\n", count); 539 + } 540 + } 541 + } 542 + return err; 543 + } 640 544 641 545 static int atmel_tdes_crypt(struct ablkcipher_request *req, unsigned long mode) 642 546 { 643 547 struct atmel_tdes_ctx *ctx = crypto_ablkcipher_ctx( 644 548 crypto_ablkcipher_reqtfm(req)); 645 549 struct atmel_tdes_reqctx *rctx = ablkcipher_request_ctx(req); 646 - struct atmel_tdes_dev *dd; 647 550 648 551 if (mode & TDES_FLAGS_CFB8) { 649 552 if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) { 650 553 pr_err("request size is not exact amount of CFB8 blocks\n"); 651 554 return -EINVAL; 652 555 } 556 + ctx->block_size = CFB8_BLOCK_SIZE; 653 557 } else if (mode & TDES_FLAGS_CFB16) { 654 558 if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) { 
655 559 pr_err("request size is not exact amount of CFB16 blocks\n"); 656 560 return -EINVAL; 657 561 } 562 + ctx->block_size = CFB16_BLOCK_SIZE; 658 563 } else if (mode & TDES_FLAGS_CFB32) { 659 564 if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) { 660 565 pr_err("request size is not exact amount of CFB32 blocks\n"); 661 566 return -EINVAL; 662 567 } 663 - } else if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) { 664 - pr_err("request size is not exact amount of DES blocks\n"); 665 - return -EINVAL; 568 + ctx->block_size = CFB32_BLOCK_SIZE; 569 + } else { 570 + if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) { 571 + pr_err("request size is not exact amount of DES blocks\n"); 572 + return -EINVAL; 573 + } 574 + ctx->block_size = DES_BLOCK_SIZE; 666 575 } 667 - 668 - dd = atmel_tdes_find_dev(ctx); 669 - if (!dd) 670 - return -ENODEV; 671 576 672 577 rctx->mode = mode; 673 578 674 - return atmel_tdes_handle_queue(dd, req); 579 + return atmel_tdes_handle_queue(ctx->dd, req); 580 + } 581 + 582 + static bool atmel_tdes_filter(struct dma_chan *chan, void *slave) 583 + { 584 + struct at_dma_slave *sl = slave; 585 + 586 + if (sl && sl->dma_dev == chan->device->dev) { 587 + chan->private = sl; 588 + return true; 589 + } else { 590 + return false; 591 + } 592 + } 593 + 594 + static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd, 595 + struct crypto_platform_data *pdata) 596 + { 597 + int err = -ENOMEM; 598 + dma_cap_mask_t mask_in, mask_out; 599 + 600 + if (pdata && pdata->dma_slave->txdata.dma_dev && 601 + pdata->dma_slave->rxdata.dma_dev) { 602 + 603 + /* Try to grab 2 DMA channels */ 604 + dma_cap_zero(mask_in); 605 + dma_cap_set(DMA_SLAVE, mask_in); 606 + 607 + dd->dma_lch_in.chan = dma_request_channel(mask_in, 608 + atmel_tdes_filter, &pdata->dma_slave->rxdata); 609 + 610 + if (!dd->dma_lch_in.chan) 611 + goto err_dma_in; 612 + 613 + dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV; 614 + dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base + 615 + TDES_IDATA1R; 616 + dd->dma_lch_in.dma_conf.src_maxburst = 1; 617 + dd->dma_lch_in.dma_conf.src_addr_width = 618 + DMA_SLAVE_BUSWIDTH_4_BYTES; 619 + dd->dma_lch_in.dma_conf.dst_maxburst = 1; 620 + dd->dma_lch_in.dma_conf.dst_addr_width = 621 + DMA_SLAVE_BUSWIDTH_4_BYTES; 622 + dd->dma_lch_in.dma_conf.device_fc = false; 623 + 624 + dma_cap_zero(mask_out); 625 + dma_cap_set(DMA_SLAVE, mask_out); 626 + dd->dma_lch_out.chan = dma_request_channel(mask_out, 627 + atmel_tdes_filter, &pdata->dma_slave->txdata); 628 + 629 + if (!dd->dma_lch_out.chan) 630 + goto err_dma_out; 631 + 632 + dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM; 633 + dd->dma_lch_out.dma_conf.src_addr = dd->phys_base + 634 + TDES_ODATA1R; 635 + dd->dma_lch_out.dma_conf.src_maxburst = 1; 636 + dd->dma_lch_out.dma_conf.src_addr_width = 637 + DMA_SLAVE_BUSWIDTH_4_BYTES; 638 + dd->dma_lch_out.dma_conf.dst_maxburst = 1; 639 + dd->dma_lch_out.dma_conf.dst_addr_width = 640 + DMA_SLAVE_BUSWIDTH_4_BYTES; 641 + dd->dma_lch_out.dma_conf.device_fc = false; 642 + 643 + return 0; 644 + } else { 645 + return -ENODEV; 646 + } 647 + 648 + err_dma_out: 649 + dma_release_channel(dd->dma_lch_in.chan); 650 + err_dma_in: 651 + return err; 652 + } 653 + 654 + static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd) 655 + { 656 + dma_release_channel(dd->dma_lch_in.chan); 657 + dma_release_channel(dd->dma_lch_out.chan); 675 658 } 676 659 677 660 static int atmel_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key, ··· 815 590 /* 816 591 * HW bug in cfb 3-keys mode. 
817 592 */ 818 - if (strstr(alg_name, "cfb") && (keylen != 2*DES_KEY_SIZE)) { 593 + if (!ctx->dd->caps.has_cfb_3keys && strstr(alg_name, "cfb") 594 + && (keylen != 2*DES_KEY_SIZE)) { 819 595 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 820 596 return -EINVAL; 821 597 } else if ((keylen != 2*DES_KEY_SIZE) && (keylen != 3*DES_KEY_SIZE)) { ··· 904 678 905 679 static int atmel_tdes_cra_init(struct crypto_tfm *tfm) 906 680 { 681 + struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm); 682 + struct atmel_tdes_dev *dd; 683 + 907 684 tfm->crt_ablkcipher.reqsize = sizeof(struct atmel_tdes_reqctx); 685 + 686 + dd = atmel_tdes_find_dev(ctx); 687 + if (!dd) 688 + return -ENODEV; 908 689 909 690 return 0; 910 691 } ··· 928 695 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 929 696 .cra_blocksize = DES_BLOCK_SIZE, 930 697 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 931 - .cra_alignmask = 0, 698 + .cra_alignmask = 0x7, 932 699 .cra_type = &crypto_ablkcipher_type, 933 700 .cra_module = THIS_MODULE, 934 701 .cra_init = atmel_tdes_cra_init, ··· 948 715 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 949 716 .cra_blocksize = DES_BLOCK_SIZE, 950 717 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 951 - .cra_alignmask = 0, 718 + .cra_alignmask = 0x7, 952 719 .cra_type = &crypto_ablkcipher_type, 953 720 .cra_module = THIS_MODULE, 954 721 .cra_init = atmel_tdes_cra_init, ··· 969 736 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 970 737 .cra_blocksize = DES_BLOCK_SIZE, 971 738 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 972 - .cra_alignmask = 0, 739 + .cra_alignmask = 0x7, 973 740 .cra_type = &crypto_ablkcipher_type, 974 741 .cra_module = THIS_MODULE, 975 742 .cra_init = atmel_tdes_cra_init, ··· 1011 778 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1012 779 .cra_blocksize = CFB16_BLOCK_SIZE, 1013 780 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1014 - .cra_alignmask = 0, 781 + .cra_alignmask = 0x1, 1015 782 .cra_type = &crypto_ablkcipher_type, 1016 783 .cra_module = THIS_MODULE, 1017 784 .cra_init = atmel_tdes_cra_init, ··· 1032 799 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1033 800 .cra_blocksize = CFB32_BLOCK_SIZE, 1034 801 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1035 - .cra_alignmask = 0, 802 + .cra_alignmask = 0x3, 1036 803 .cra_type = &crypto_ablkcipher_type, 1037 804 .cra_module = THIS_MODULE, 1038 805 .cra_init = atmel_tdes_cra_init, ··· 1053 820 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1054 821 .cra_blocksize = DES_BLOCK_SIZE, 1055 822 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1056 - .cra_alignmask = 0, 823 + .cra_alignmask = 0x7, 1057 824 .cra_type = &crypto_ablkcipher_type, 1058 825 .cra_module = THIS_MODULE, 1059 826 .cra_init = atmel_tdes_cra_init, ··· 1074 841 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1075 842 .cra_blocksize = DES_BLOCK_SIZE, 1076 843 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1077 - .cra_alignmask = 0, 844 + .cra_alignmask = 0x7, 1078 845 .cra_type = &crypto_ablkcipher_type, 1079 846 .cra_module = THIS_MODULE, 1080 847 .cra_init = atmel_tdes_cra_init, ··· 1094 861 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1095 862 .cra_blocksize = DES_BLOCK_SIZE, 1096 863 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1097 - .cra_alignmask = 0, 864 + .cra_alignmask = 0x7, 1098 865 .cra_type = &crypto_ablkcipher_type, 1099 866 .cra_module = THIS_MODULE, 1100 867 .cra_init = atmel_tdes_cra_init, ··· 1115 882 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | 
CRYPTO_ALG_ASYNC, 1116 883 .cra_blocksize = DES_BLOCK_SIZE, 1117 884 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1118 - .cra_alignmask = 0, 885 + .cra_alignmask = 0x7, 1119 886 .cra_type = &crypto_ablkcipher_type, 1120 887 .cra_module = THIS_MODULE, 1121 888 .cra_init = atmel_tdes_cra_init, ··· 1157 924 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1158 925 .cra_blocksize = CFB16_BLOCK_SIZE, 1159 926 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1160 - .cra_alignmask = 0, 927 + .cra_alignmask = 0x1, 1161 928 .cra_type = &crypto_ablkcipher_type, 1162 929 .cra_module = THIS_MODULE, 1163 930 .cra_init = atmel_tdes_cra_init, ··· 1178 945 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1179 946 .cra_blocksize = CFB32_BLOCK_SIZE, 1180 947 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1181 - .cra_alignmask = 0, 948 + .cra_alignmask = 0x3, 1182 949 .cra_type = &crypto_ablkcipher_type, 1183 950 .cra_module = THIS_MODULE, 1184 951 .cra_init = atmel_tdes_cra_init, ··· 1199 966 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, 1200 967 .cra_blocksize = DES_BLOCK_SIZE, 1201 968 .cra_ctxsize = sizeof(struct atmel_tdes_ctx), 1202 - .cra_alignmask = 0, 969 + .cra_alignmask = 0x7, 1203 970 .cra_type = &crypto_ablkcipher_type, 1204 971 .cra_module = THIS_MODULE, 1205 972 .cra_init = atmel_tdes_cra_init, ··· 1227 994 struct atmel_tdes_dev *dd = (struct atmel_tdes_dev *) data; 1228 995 int err; 1229 996 1230 - err = atmel_tdes_crypt_dma_stop(dd); 997 + if (!(dd->flags & TDES_FLAGS_DMA)) 998 + err = atmel_tdes_crypt_pdc_stop(dd); 999 + else 1000 + err = atmel_tdes_crypt_dma_stop(dd); 1231 1001 1232 1002 err = dd->err ? : err; 1233 1003 1234 1004 if (dd->total && !err) { 1235 - err = atmel_tdes_crypt_dma_start(dd); 1005 + if (dd->flags & TDES_FLAGS_FAST) { 1006 + dd->in_sg = sg_next(dd->in_sg); 1007 + dd->out_sg = sg_next(dd->out_sg); 1008 + if (!dd->in_sg || !dd->out_sg) 1009 + err = -EINVAL; 1010 + } 1236 1011 if (!err) 1237 - return; 1012 + err = atmel_tdes_crypt_start(dd); 1013 + if (!err) 1014 + return; /* DMA started. Not fininishing. 
*/ 1238 1015 } 1239 1016 1240 1017 atmel_tdes_finish_req(dd, err); ··· 1296 1053 return err; 1297 1054 } 1298 1055 1056 + static void atmel_tdes_get_cap(struct atmel_tdes_dev *dd) 1057 + { 1058 + 1059 + dd->caps.has_dma = 0; 1060 + dd->caps.has_cfb_3keys = 0; 1061 + 1062 + /* keep only major version number */ 1063 + switch (dd->hw_version & 0xf00) { 1064 + case 0x700: 1065 + dd->caps.has_dma = 1; 1066 + dd->caps.has_cfb_3keys = 1; 1067 + break; 1068 + case 0x600: 1069 + break; 1070 + default: 1071 + dev_warn(dd->dev, 1072 + "Unmanaged tdes version, set minimum capabilities\n"); 1073 + break; 1074 + } 1075 + } 1076 + 1299 1077 static int atmel_tdes_probe(struct platform_device *pdev) 1300 1078 { 1301 1079 struct atmel_tdes_dev *tdes_dd; 1080 + struct crypto_platform_data *pdata; 1302 1081 struct device *dev = &pdev->dev; 1303 1082 struct resource *tdes_res; 1304 1083 unsigned long tdes_phys_size; ··· 1374 1109 } 1375 1110 1376 1111 /* Initializing the clock */ 1377 - tdes_dd->iclk = clk_get(&pdev->dev, NULL); 1112 + tdes_dd->iclk = clk_get(&pdev->dev, "tdes_clk"); 1378 1113 if (IS_ERR(tdes_dd->iclk)) { 1379 1114 dev_err(dev, "clock intialization failed.\n"); 1380 1115 err = PTR_ERR(tdes_dd->iclk); ··· 1388 1123 goto tdes_io_err; 1389 1124 } 1390 1125 1391 - err = atmel_tdes_dma_init(tdes_dd); 1126 + atmel_tdes_hw_version_init(tdes_dd); 1127 + 1128 + atmel_tdes_get_cap(tdes_dd); 1129 + 1130 + err = atmel_tdes_buff_init(tdes_dd); 1392 1131 if (err) 1393 - goto err_tdes_dma; 1132 + goto err_tdes_buff; 1133 + 1134 + if (tdes_dd->caps.has_dma) { 1135 + pdata = pdev->dev.platform_data; 1136 + if (!pdata) { 1137 + dev_err(&pdev->dev, "platform data not available\n"); 1138 + err = -ENXIO; 1139 + goto err_pdata; 1140 + } 1141 + err = atmel_tdes_dma_init(tdes_dd, pdata); 1142 + if (err) 1143 + goto err_tdes_dma; 1144 + } 1394 1145 1395 1146 spin_lock(&atmel_tdes.lock); 1396 1147 list_add_tail(&tdes_dd->list, &atmel_tdes.dev_list); ··· 1424 1143 spin_lock(&atmel_tdes.lock); 1425 1144 list_del(&tdes_dd->list); 1426 1145 spin_unlock(&atmel_tdes.lock); 1427 - atmel_tdes_dma_cleanup(tdes_dd); 1146 + if (tdes_dd->caps.has_dma) 1147 + atmel_tdes_dma_cleanup(tdes_dd); 1428 1148 err_tdes_dma: 1149 + err_pdata: 1150 + atmel_tdes_buff_cleanup(tdes_dd); 1151 + err_tdes_buff: 1429 1152 iounmap(tdes_dd->io_base); 1430 1153 tdes_io_err: 1431 1154 clk_put(tdes_dd->iclk); ··· 1463 1178 tasklet_kill(&tdes_dd->done_task); 1464 1179 tasklet_kill(&tdes_dd->queue_task); 1465 1180 1466 - atmel_tdes_dma_cleanup(tdes_dd); 1181 + if (tdes_dd->caps.has_dma) 1182 + atmel_tdes_dma_cleanup(tdes_dd); 1183 + 1184 + atmel_tdes_buff_cleanup(tdes_dd); 1467 1185 1468 1186 iounmap(tdes_dd->io_base); 1469 1187
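Note on the atmel-tdes rework above: two transfer back ends now coexist, the IP-internal PDC on 0x6xx hardware and a dmaengine slave channel when the 0x7xx capability (caps.has_dma) is set. A hedged sketch of the dispatch this creates; the two callees and their signatures come from the diff, the wrapper is illustrative only:

    static int atmel_tdes_xfer(struct atmel_tdes_dev *dd, struct crypto_tfm *tfm,
                               dma_addr_t in, dma_addr_t out, int len)
    {
        if (dd->caps.has_dma)                            /* v0x7xx: dmaengine */
            return atmel_tdes_crypt_dma(tfm, in, out, len);
        return atmel_tdes_crypt_pdc(tfm, in, out, len);  /* v0x6xx: PDC */
    }

The completion side mirrors the split: atmel_tdes_done_task() picks atmel_tdes_crypt_pdc_stop() or atmel_tdes_crypt_dma_stop() from TDES_FLAGS_DMA, and the dmaengine path narrows the slave bus width to 1 or 2 bytes for CFB8/CFB16 so the partial-block modes keep working.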
+3 -3
drivers/crypto/bfin_crc.c
··· 151 151 struct bfin_crypto_crc_reqctx *ctx = ahash_request_ctx(req); 152 152 struct bfin_crypto_crc *crc; 153 153 154 - dev_dbg(crc->dev, "crc_init\n"); 154 + dev_dbg(ctx->crc->dev, "crc_init\n"); 155 155 spin_lock_bh(&crc_list.lock); 156 156 list_for_each_entry(crc, &crc_list.dev_list, list) { 157 157 crc_ctx->crc = crc; ··· 160 160 spin_unlock_bh(&crc_list.lock); 161 161 162 162 if (sg_count(req->src) > CRC_MAX_DMA_DESC) { 163 - dev_dbg(crc->dev, "init: requested sg list is too big > %d\n", 163 + dev_dbg(ctx->crc->dev, "init: requested sg list is too big > %d\n", 164 164 CRC_MAX_DMA_DESC); 165 165 return -EINVAL; 166 166 } ··· 175 175 /* init crc results */ 176 176 put_unaligned_le32(crc_ctx->key, req->result); 177 177 178 - dev_dbg(crc->dev, "init: digest size: %d\n", 178 + dev_dbg(ctx->crc->dev, "init: digest size: %d\n", 179 179 crypto_ahash_digestsize(tfm)); 180 180 181 181 return bfin_crypto_crc_init_hw(crc, crc_ctx->key);
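The three bfin_crc.c hunks above are one fix repeated: crc is only the cursor of the list walk, so the first dev_dbg() dereferenced it before anything had been assigned. The shape of the fix, reduced to two lines (illustrative, not the driver's code):

    struct bfin_crypto_crc *crc;            /* list cursor - not yet valid here */

    /* dev_dbg(crc->dev, "crc_init\n");        before: reads an uninitialized pointer */
    dev_dbg(ctx->crc->dev, "crc_init\n");   /* after: goes through the bound context */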
+1 -1
drivers/crypto/caam/Kconfig
··· 78 78 tristate "Register hash algorithm implementations with Crypto API" 79 79 depends on CRYPTO_DEV_FSL_CAAM 80 80 default y 81 - select CRYPTO_AHASH 81 + select CRYPTO_HASH 82 82 help 83 83 Selecting this will offload ahash for users of the 84 84 scatterlist crypto API to the SEC4 via job ring.
+6
drivers/crypto/caam/caamalg.c
··· 1693 1693 .name = "authenc(hmac(sha224),cbc(aes))", 1694 1694 .driver_name = "authenc-hmac-sha224-cbc-aes-caam", 1695 1695 .blocksize = AES_BLOCK_SIZE, 1696 + .type = CRYPTO_ALG_TYPE_AEAD, 1696 1697 .template_aead = { 1697 1698 .setkey = aead_setkey, 1698 1699 .setauthsize = aead_setauthsize, ··· 1733 1732 .name = "authenc(hmac(sha384),cbc(aes))", 1734 1733 .driver_name = "authenc-hmac-sha384-cbc-aes-caam", 1735 1734 .blocksize = AES_BLOCK_SIZE, 1735 + .type = CRYPTO_ALG_TYPE_AEAD, 1736 1736 .template_aead = { 1737 1737 .setkey = aead_setkey, 1738 1738 .setauthsize = aead_setauthsize, ··· 1812 1810 .name = "authenc(hmac(sha224),cbc(des3_ede))", 1813 1811 .driver_name = "authenc-hmac-sha224-cbc-des3_ede-caam", 1814 1812 .blocksize = DES3_EDE_BLOCK_SIZE, 1813 + .type = CRYPTO_ALG_TYPE_AEAD, 1815 1814 .template_aead = { 1816 1815 .setkey = aead_setkey, 1817 1816 .setauthsize = aead_setauthsize, ··· 1852 1849 .name = "authenc(hmac(sha384),cbc(des3_ede))", 1853 1850 .driver_name = "authenc-hmac-sha384-cbc-des3_ede-caam", 1854 1851 .blocksize = DES3_EDE_BLOCK_SIZE, 1852 + .type = CRYPTO_ALG_TYPE_AEAD, 1855 1853 .template_aead = { 1856 1854 .setkey = aead_setkey, 1857 1855 .setauthsize = aead_setauthsize, ··· 1930 1926 .name = "authenc(hmac(sha224),cbc(des))", 1931 1927 .driver_name = "authenc-hmac-sha224-cbc-des-caam", 1932 1928 .blocksize = DES_BLOCK_SIZE, 1929 + .type = CRYPTO_ALG_TYPE_AEAD, 1933 1930 .template_aead = { 1934 1931 .setkey = aead_setkey, 1935 1932 .setauthsize = aead_setauthsize, ··· 1970 1965 .name = "authenc(hmac(sha384),cbc(des))", 1971 1966 .driver_name = "authenc-hmac-sha384-cbc-des-caam", 1972 1967 .blocksize = DES_BLOCK_SIZE, 1968 + .type = CRYPTO_ALG_TYPE_AEAD, 1973 1969 .template_aead = { 1974 1970 .setkey = aead_setkey, 1975 1971 .setauthsize = aead_setauthsize,
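All six caamalg.c hunks add the same missing field: the sha224/sha384 authenc templates had no .type, so it defaulted to 0. Registration code along the lines of the driver's caam_alg_alloc() switches on that field, so a zero value matched neither case and the algorithm was set up incorrectly. Condensed sketch (field names follow the template struct used above; the body is not verbatim):

    switch (template->type) {
    case CRYPTO_ALG_TYPE_ABLKCIPHER:
        alg->cra_type = &crypto_ablkcipher_type;
        alg->cra_ablkcipher = template->template_ablkcipher;
        break;
    case CRYPTO_ALG_TYPE_AEAD:          /* the value these hunks add */
        alg->cra_type = &crypto_aead_type;
        alg->cra_aead = template->template_aead;
        break;
    }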
+2 -2
drivers/crypto/caam/caamhash.c
··· 411 411 return 0; 412 412 } 413 413 414 - static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in, 414 + static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in, 415 415 u32 keylen) 416 416 { 417 417 return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len, ··· 420 420 } 421 421 422 422 /* Digest hash size if it is too large */ 423 - static u32 hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in, 423 + static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in, 424 424 u32 *keylen, u8 *key_out, u32 digestsize) 425 425 { 426 426 struct device *jrdev = ctx->jrdev;
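The u32 to int changes above are about sign preservation: these helpers pass through negative errno values, and with an unsigned return type the usual error check can never fire. In miniature (hypothetical caller; the callee signature is from the diff):

    static int check_split_key(struct caam_hash_ctx *ctx, const u8 *key, u32 len)
    {
        int ret = gen_split_hash_key(ctx, key, len);

        /* with the old u32 return, (ret < 0) was always false and
         * -EIO and friends were silently treated as success */
        if (ret < 0)
            return ret;
        return 0;
    }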
+3
drivers/crypto/caam/ctrl.c
··· 304 304 caam_remove(pdev); 305 305 return ret; 306 306 } 307 + 308 + /* Enable RDB bit so that RNG works faster */ 309 + setbits32(&topregs->ctrl.scfgr, SCFGR_RDBENABLE); 307 310 } 308 311 309 312 /* NOTE: RTIC detection ought to go here, around Si time */
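setbits32() here is the CAAM read-modify-write accessor: it ORs the mask into the memory-mapped register without touching the other bits. What the call amounts to, as a sketch (the real helper is a macro in the driver's regs.h built on rd_reg32()/wr_reg32()):

    static inline void caam_enable_rdb(struct caam_ctrl __iomem *ctrl)
    {
        /* read SCFGR, set the RDB bit, write it back */
        wr_reg32(&ctrl->scfgr, rd_reg32(&ctrl->scfgr) | SCFGR_RDBENABLE);
    }

The scfgr field and SCFGR_RDBENABLE itself are introduced by the regs.h hunk further down.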
+5 -5
drivers/crypto/caam/error.c
··· 36 36 37 37 static void report_ccb_status(u32 status, char *outstr) 38 38 { 39 - char *cha_id_list[] = { 39 + static const char * const cha_id_list[] = { 40 40 "", 41 41 "AES", 42 42 "DES", ··· 51 51 "ZUCE", 52 52 "ZUCA", 53 53 }; 54 - char *err_id_list[] = { 54 + static const char * const err_id_list[] = { 55 55 "No error.", 56 56 "Mode error.", 57 57 "Data size error.", ··· 69 69 "Invalid CHA combination was selected", 70 70 "Invalid CHA selected.", 71 71 }; 72 - char *rng_err_id_list[] = { 72 + static const char * const rng_err_id_list[] = { 73 73 "", 74 74 "", 75 75 "", ··· 117 117 118 118 static void report_deco_status(u32 status, char *outstr) 119 119 { 120 - const struct { 120 + static const struct { 121 121 u8 value; 122 122 char *error_text; 123 123 } desc_error_list[] = { ··· 245 245 246 246 char *caam_jr_strstatus(char *outstr, u32 status) 247 247 { 248 - struct stat_src { 248 + static const struct stat_src { 249 249 void (*report_ssed)(u32 status, char *outstr); 250 250 char *error; 251 251 } status_src[] = {
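The error.c hunks change only storage class, not content: without static, each of those string tables was rebuilt on the kernel stack on every call into the error reporter; with static const (and const pointers) they are emitted once into .rodata. The difference in miniature (illustrative):

    void report_example(void)
    {
        char *per_call[] = { "AES", "DES" };                  /* before: stack copy per call */
        static const char * const once[] = { "AES", "DES" };  /* after: one read-only table */

        (void)per_call;
        (void)once;
    }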
+1
drivers/crypto/caam/intern.h
··· 41 41 /* Private sub-storage for a single JobR */ 42 42 struct caam_drv_private_jr { 43 43 struct device *parentdev; /* points back to controller dev */ 44 + struct platform_device *jr_pdev;/* points to platform device for JR */ 44 45 int ridx; 45 46 struct caam_job_ring __iomem *rregs; /* JobR's register space */ 46 47 struct tasklet_struct irqtask;
+4
drivers/crypto/caam/jr.c
··· 407 407 dma_free_coherent(dev, sizeof(struct jr_outentry) * JOBR_DEPTH, 408 408 jrp->outring, outbusaddr); 409 409 kfree(jrp->entinfo); 410 + of_device_unregister(jrp->jr_pdev); 410 411 411 412 return ret; 412 413 } ··· 455 454 kfree(jrpriv); 456 455 return -EINVAL; 457 456 } 457 + 458 + jrpriv->jr_pdev = jr_pdev; 458 459 jrdev = &jr_pdev->dev; 459 460 dev_set_drvdata(jrdev, jrpriv); 460 461 ctrlpriv->jrdev[ring] = jrdev; ··· 475 472 /* Now do the platform independent part */ 476 473 error = caam_jr_init(jrdev); /* now turn on hardware */ 477 474 if (error) { 475 + of_device_unregister(jr_pdev); 478 476 kfree(jrpriv); 479 477 return error; 480 478 }
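Together with the jr_pdev pointer added to intern.h above, the jr.c hunks close a leak pair: the job-ring platform device created at probe was unregistered neither when caam_jr_init() failed nor at ring shutdown. Caching the pointer is what makes the shutdown half possible, since by then only the private struct is reachable. Flow condensed from the diff (names are the driver's):

    jrpriv->jr_pdev = jr_pdev;              /* probe: remember the child device */
    error = caam_jr_init(jrdev);
    if (error) {
        of_device_unregister(jr_pdev);      /* probe error path: undo creation */
        kfree(jrpriv);
        return error;
    }
    /* ... and later, in caam_jr_shutdown(): */
    of_device_unregister(jrp->jr_pdev);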
+1 -1
drivers/crypto/caam/key_gen.c
··· 44 44 [06] 0x64260028 fifostr: class2 mdsplit-jdk len=40 45 45 @0xffe04000 46 46 */ 47 - u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len, 47 + int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len, 48 48 int split_key_pad_len, const u8 *key_in, u32 keylen, 49 49 u32 alg_op) 50 50 {
+1 -1
drivers/crypto/caam/key_gen.h
··· 12 12 13 13 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context); 14 14 15 - u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len, 15 + int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len, 16 16 int split_key_pad_len, const u8 *key_in, u32 keylen, 17 17 u32 alg_op);
+3 -1
drivers/crypto/caam/regs.h
··· 252 252 /* Read/Writable */ 253 253 u32 rsvd1; 254 254 u32 mcr; /* MCFG Master Config Register */ 255 - u32 rsvd2[2]; 255 + u32 rsvd2; 256 + u32 scfgr; /* SCFGR, Security Config Register */ 256 257 257 258 /* Bus Access Configuration Section 010-11f */ 258 259 /* Read/Writable */ ··· 300 299 #define MCFGR_WDFAIL 0x20000000 /* DECO watchdog force-fail */ 301 300 #define MCFGR_DMA_RESET 0x10000000 302 301 #define MCFGR_LONG_PTR 0x00010000 /* Use >32-bit desc addressing */ 302 + #define SCFGR_RDBENABLE 0x00000400 303 303 304 304 /* AXI read cache control */ 305 305 #define MCFGR_ARCACHE_SHIFT 12
+2 -13
drivers/crypto/omap-aes.c
··· 636 636 637 637 pr_debug("err: %d\n", err); 638 638 639 - pm_runtime_put_sync(dd->dev); 639 + pm_runtime_put(dd->dev); 640 640 dd->flags &= ~FLAGS_BUSY; 641 641 642 642 req->base.complete(&req->base, err); ··· 1248 1248 }, 1249 1249 }; 1250 1250 1251 - static int __init omap_aes_mod_init(void) 1252 - { 1253 - return platform_driver_register(&omap_aes_driver); 1254 - } 1255 - 1256 - static void __exit omap_aes_mod_exit(void) 1257 - { 1258 - platform_driver_unregister(&omap_aes_driver); 1259 - } 1260 - 1261 - module_init(omap_aes_mod_init); 1262 - module_exit(omap_aes_mod_exit); 1251 + module_platform_driver(omap_aes_driver); 1263 1252 1264 1253 MODULE_DESCRIPTION("OMAP AES hw acceleration support."); 1265 1254 MODULE_LICENSE("GPL v2");
+2 -13
drivers/crypto/omap-sham.c
··· 923 923 dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) | 924 924 BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY)); 925 925 926 - pm_runtime_put_sync(dd->dev); 926 + pm_runtime_put(dd->dev); 927 927 928 928 if (req->base.complete) 929 929 req->base.complete(&req->base, err); ··· 1813 1813 }, 1814 1814 }; 1815 1815 1816 - static int __init omap_sham_mod_init(void) 1817 - { 1818 - return platform_driver_register(&omap_sham_driver); 1819 - } 1820 - 1821 - static void __exit omap_sham_mod_exit(void) 1822 - { 1823 - platform_driver_unregister(&omap_sham_driver); 1824 - } 1825 - 1826 - module_init(omap_sham_mod_init); 1827 - module_exit(omap_sham_mod_exit); 1816 + module_platform_driver(omap_sham_driver); 1828 1817 1829 1818 MODULE_DESCRIPTION("OMAP SHA1/MD5 hw acceleration support."); 1830 1819 MODULE_LICENSE("GPL v2");
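The omap-aes.c and omap-sham.c hunks are twins. module_platform_driver() is pure shorthand; it expands (via module_driver()) to exactly the init/exit pair it replaces, shown here for omap_aes_driver:

    static int __init omap_aes_driver_init(void)
    {
        return platform_driver_register(&omap_aes_driver);
    }
    module_init(omap_aes_driver_init);

    static void __exit omap_aes_driver_exit(void)
    {
        platform_driver_unregister(&omap_aes_driver);
    }
    module_exit(omap_aes_driver_exit);

The pm_runtime_put_sync() to pm_runtime_put() switch in both drivers means completing a request queues the power-down asynchronously instead of blocking on it.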
+1 -3
drivers/crypto/picoxcell_crypto.c
··· 1688 1688 { .compatible = "picochip,spacc-l2" }, 1689 1689 {} 1690 1690 }; 1691 - #else /* CONFIG_OF */ 1692 - #define spacc_of_id_table NULL 1693 1691 #endif /* CONFIG_OF */ 1694 1692 1695 1693 static bool spacc_is_compatible(struct platform_device *pdev, ··· 1872 1874 #ifdef CONFIG_PM 1873 1875 .pm = &spacc_pm_ops, 1874 1876 #endif /* CONFIG_PM */ 1875 - .of_match_table = spacc_of_id_table, 1877 + .of_match_table = of_match_ptr(spacc_of_id_table), 1876 1878 }, 1877 1879 .id_table = spacc_id_table, 1878 1880 };
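of_match_ptr() subsumes the removed #else stub: with CONFIG_OF it passes the table through, without it the expression becomes NULL and the (then undefined) spacc_of_id_table symbol is never referenced. From <linux/of.h>, modulo whitespace:

    #ifdef CONFIG_OF
    #define of_match_ptr(_ptr)  (_ptr)
    #else
    #define of_match_ptr(_ptr)  NULL
    #endif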
+1070
drivers/crypto/sahara.c
··· 1 + /* 2 + * Cryptographic API. 3 + * 4 + * Support for SAHARA cryptographic accelerator. 5 + * 6 + * Copyright (c) 2013 Vista Silicon S.L. 7 + * Author: Javier Martin <javier.martin@vista-silicon.com> 8 + * 9 + * This program is free software; you can redistribute it and/or modify 10 + * it under the terms of the GNU General Public License version 2 as published 11 + * by the Free Software Foundation. 12 + * 13 + * Based on omap-aes.c and tegra-aes.c 14 + */ 15 + 16 + #include <crypto/algapi.h> 17 + #include <crypto/aes.h> 18 + 19 + #include <linux/clk.h> 20 + #include <linux/crypto.h> 21 + #include <linux/interrupt.h> 22 + #include <linux/io.h> 23 + #include <linux/irq.h> 24 + #include <linux/kernel.h> 25 + #include <linux/module.h> 26 + #include <linux/of.h> 27 + #include <linux/platform_device.h> 28 + 29 + #define SAHARA_NAME "sahara" 30 + #define SAHARA_VERSION_3 3 31 + #define SAHARA_TIMEOUT_MS 1000 32 + #define SAHARA_MAX_HW_DESC 2 33 + #define SAHARA_MAX_HW_LINK 20 34 + 35 + #define FLAGS_MODE_MASK 0x000f 36 + #define FLAGS_ENCRYPT BIT(0) 37 + #define FLAGS_CBC BIT(1) 38 + #define FLAGS_NEW_KEY BIT(3) 39 + #define FLAGS_BUSY 4 40 + 41 + #define SAHARA_HDR_BASE 0x00800000 42 + #define SAHARA_HDR_SKHA_ALG_AES 0 43 + #define SAHARA_HDR_SKHA_OP_ENC (1 << 2) 44 + #define SAHARA_HDR_SKHA_MODE_ECB (0 << 3) 45 + #define SAHARA_HDR_SKHA_MODE_CBC (1 << 3) 46 + #define SAHARA_HDR_FORM_DATA (5 << 16) 47 + #define SAHARA_HDR_FORM_KEY (8 << 16) 48 + #define SAHARA_HDR_LLO (1 << 24) 49 + #define SAHARA_HDR_CHA_SKHA (1 << 28) 50 + #define SAHARA_HDR_CHA_MDHA (2 << 28) 51 + #define SAHARA_HDR_PARITY_BIT (1 << 31) 52 + 53 + /* SAHARA can only process one request at a time */ 54 + #define SAHARA_QUEUE_LENGTH 1 55 + 56 + #define SAHARA_REG_VERSION 0x00 57 + #define SAHARA_REG_DAR 0x04 58 + #define SAHARA_REG_CONTROL 0x08 59 + #define SAHARA_CONTROL_SET_THROTTLE(x) (((x) & 0xff) << 24) 60 + #define SAHARA_CONTROL_SET_MAXBURST(x) (((x) & 0xff) << 16) 61 + #define SAHARA_CONTROL_RNG_AUTORSD (1 << 7) 62 + #define SAHARA_CONTROL_ENABLE_INT (1 << 4) 63 + #define SAHARA_REG_CMD 0x0C 64 + #define SAHARA_CMD_RESET (1 << 0) 65 + #define SAHARA_CMD_CLEAR_INT (1 << 8) 66 + #define SAHARA_CMD_CLEAR_ERR (1 << 9) 67 + #define SAHARA_CMD_SINGLE_STEP (1 << 10) 68 + #define SAHARA_CMD_MODE_BATCH (1 << 16) 69 + #define SAHARA_CMD_MODE_DEBUG (1 << 18) 70 + #define SAHARA_REG_STATUS 0x10 71 + #define SAHARA_STATUS_GET_STATE(x) ((x) & 0x7) 72 + #define SAHARA_STATE_IDLE 0 73 + #define SAHARA_STATE_BUSY 1 74 + #define SAHARA_STATE_ERR 2 75 + #define SAHARA_STATE_FAULT 3 76 + #define SAHARA_STATE_COMPLETE 4 77 + #define SAHARA_STATE_COMP_FLAG (1 << 2) 78 + #define SAHARA_STATUS_DAR_FULL (1 << 3) 79 + #define SAHARA_STATUS_ERROR (1 << 4) 80 + #define SAHARA_STATUS_SECURE (1 << 5) 81 + #define SAHARA_STATUS_FAIL (1 << 6) 82 + #define SAHARA_STATUS_INIT (1 << 7) 83 + #define SAHARA_STATUS_RNG_RESEED (1 << 8) 84 + #define SAHARA_STATUS_ACTIVE_RNG (1 << 9) 85 + #define SAHARA_STATUS_ACTIVE_MDHA (1 << 10) 86 + #define SAHARA_STATUS_ACTIVE_SKHA (1 << 11) 87 + #define SAHARA_STATUS_MODE_BATCH (1 << 16) 88 + #define SAHARA_STATUS_MODE_DEDICATED (1 << 17) 89 + #define SAHARA_STATUS_MODE_DEBUG (1 << 18) 90 + #define SAHARA_STATUS_GET_ISTATE(x) (((x) >> 24) & 0xff) 91 + #define SAHARA_REG_ERRSTATUS 0x14 92 + #define SAHARA_ERRSTATUS_GET_SOURCE(x) ((x) & 0xf) 93 + #define SAHARA_ERRSOURCE_CHA 14 94 + #define SAHARA_ERRSOURCE_DMA 15 95 + #define SAHARA_ERRSTATUS_DMA_DIR (1 << 8) 96 + #define SAHARA_ERRSTATUS_GET_DMASZ(x)(((x) >> 9) & 
0x3) 97 + #define SAHARA_ERRSTATUS_GET_DMASRC(x) (((x) >> 13) & 0x7) 98 + #define SAHARA_ERRSTATUS_GET_CHASRC(x) (((x) >> 16) & 0xfff) 99 + #define SAHARA_ERRSTATUS_GET_CHAERR(x) (((x) >> 28) & 0x3) 100 + #define SAHARA_REG_FADDR 0x18 101 + #define SAHARA_REG_CDAR 0x1C 102 + #define SAHARA_REG_IDAR 0x20 103 + 104 + struct sahara_hw_desc { 105 + u32 hdr; 106 + u32 len1; 107 + dma_addr_t p1; 108 + u32 len2; 109 + dma_addr_t p2; 110 + dma_addr_t next; 111 + }; 112 + 113 + struct sahara_hw_link { 114 + u32 len; 115 + dma_addr_t p; 116 + dma_addr_t next; 117 + }; 118 + 119 + struct sahara_ctx { 120 + struct sahara_dev *dev; 121 + unsigned long flags; 122 + int keylen; 123 + u8 key[AES_KEYSIZE_128]; 124 + struct crypto_ablkcipher *fallback; 125 + }; 126 + 127 + struct sahara_aes_reqctx { 128 + unsigned long mode; 129 + }; 130 + 131 + struct sahara_dev { 132 + struct device *device; 133 + void __iomem *regs_base; 134 + struct clk *clk_ipg; 135 + struct clk *clk_ahb; 136 + 137 + struct sahara_ctx *ctx; 138 + spinlock_t lock; 139 + struct crypto_queue queue; 140 + unsigned long flags; 141 + 142 + struct tasklet_struct done_task; 143 + struct tasklet_struct queue_task; 144 + 145 + struct sahara_hw_desc *hw_desc[SAHARA_MAX_HW_DESC]; 146 + dma_addr_t hw_phys_desc[SAHARA_MAX_HW_DESC]; 147 + 148 + u8 *key_base; 149 + dma_addr_t key_phys_base; 150 + 151 + u8 *iv_base; 152 + dma_addr_t iv_phys_base; 153 + 154 + struct sahara_hw_link *hw_link[SAHARA_MAX_HW_LINK]; 155 + dma_addr_t hw_phys_link[SAHARA_MAX_HW_LINK]; 156 + 157 + struct ablkcipher_request *req; 158 + size_t total; 159 + struct scatterlist *in_sg; 160 + unsigned int nb_in_sg; 161 + struct scatterlist *out_sg; 162 + unsigned int nb_out_sg; 163 + 164 + u32 error; 165 + struct timer_list watchdog; 166 + }; 167 + 168 + static struct sahara_dev *dev_ptr; 169 + 170 + static inline void sahara_write(struct sahara_dev *dev, u32 data, u32 reg) 171 + { 172 + writel(data, dev->regs_base + reg); 173 + } 174 + 175 + static inline unsigned int sahara_read(struct sahara_dev *dev, u32 reg) 176 + { 177 + return readl(dev->regs_base + reg); 178 + } 179 + 180 + static u32 sahara_aes_key_hdr(struct sahara_dev *dev) 181 + { 182 + u32 hdr = SAHARA_HDR_BASE | SAHARA_HDR_SKHA_ALG_AES | 183 + SAHARA_HDR_FORM_KEY | SAHARA_HDR_LLO | 184 + SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT; 185 + 186 + if (dev->flags & FLAGS_CBC) { 187 + hdr |= SAHARA_HDR_SKHA_MODE_CBC; 188 + hdr ^= SAHARA_HDR_PARITY_BIT; 189 + } 190 + 191 + if (dev->flags & FLAGS_ENCRYPT) { 192 + hdr |= SAHARA_HDR_SKHA_OP_ENC; 193 + hdr ^= SAHARA_HDR_PARITY_BIT; 194 + } 195 + 196 + return hdr; 197 + } 198 + 199 + static u32 sahara_aes_data_link_hdr(struct sahara_dev *dev) 200 + { 201 + return SAHARA_HDR_BASE | SAHARA_HDR_FORM_DATA | 202 + SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT; 203 + } 204 + 205 + static int sahara_sg_length(struct scatterlist *sg, 206 + unsigned int total) 207 + { 208 + int sg_nb; 209 + unsigned int len; 210 + struct scatterlist *sg_list; 211 + 212 + sg_nb = 0; 213 + sg_list = sg; 214 + 215 + while (total) { 216 + len = min(sg_list->length, total); 217 + 218 + sg_nb++; 219 + total -= len; 220 + 221 + sg_list = sg_next(sg_list); 222 + if (!sg_list) 223 + total = 0; 224 + } 225 + 226 + return sg_nb; 227 + } 228 + 229 + static char *sahara_err_src[16] = { 230 + "No error", 231 + "Header error", 232 + "Descriptor length error", 233 + "Descriptor length or pointer error", 234 + "Link length error", 235 + "Link pointer error", 236 + "Input buffer error", 237 + "Output buffer error", 238 + 
"Output buffer starvation", 239 + "Internal state fault", 240 + "General descriptor problem", 241 + "Reserved", 242 + "Descriptor address error", 243 + "Link address error", 244 + "CHA error", 245 + "DMA error" 246 + }; 247 + 248 + static char *sahara_err_dmasize[4] = { 249 + "Byte transfer", 250 + "Half-word transfer", 251 + "Word transfer", 252 + "Reserved" 253 + }; 254 + 255 + static char *sahara_err_dmasrc[8] = { 256 + "No error", 257 + "AHB bus error", 258 + "Internal IP bus error", 259 + "Parity error", 260 + "DMA crosses 256 byte boundary", 261 + "DMA is busy", 262 + "Reserved", 263 + "DMA HW error" 264 + }; 265 + 266 + static char *sahara_cha_errsrc[12] = { 267 + "Input buffer non-empty", 268 + "Illegal address", 269 + "Illegal mode", 270 + "Illegal data size", 271 + "Illegal key size", 272 + "Write during processing", 273 + "CTX read during processing", 274 + "HW error", 275 + "Input buffer disabled/underflow", 276 + "Output buffer disabled/overflow", 277 + "DES key parity error", 278 + "Reserved" 279 + }; 280 + 281 + static char *sahara_cha_err[4] = { "No error", "SKHA", "MDHA", "RNG" }; 282 + 283 + static void sahara_decode_error(struct sahara_dev *dev, unsigned int error) 284 + { 285 + u8 source = SAHARA_ERRSTATUS_GET_SOURCE(error); 286 + u16 chasrc = ffs(SAHARA_ERRSTATUS_GET_CHASRC(error)); 287 + 288 + dev_err(dev->device, "%s: Error Register = 0x%08x\n", __func__, error); 289 + 290 + dev_err(dev->device, " - %s.\n", sahara_err_src[source]); 291 + 292 + if (source == SAHARA_ERRSOURCE_DMA) { 293 + if (error & SAHARA_ERRSTATUS_DMA_DIR) 294 + dev_err(dev->device, " * DMA read.\n"); 295 + else 296 + dev_err(dev->device, " * DMA write.\n"); 297 + 298 + dev_err(dev->device, " * %s.\n", 299 + sahara_err_dmasize[SAHARA_ERRSTATUS_GET_DMASZ(error)]); 300 + dev_err(dev->device, " * %s.\n", 301 + sahara_err_dmasrc[SAHARA_ERRSTATUS_GET_DMASRC(error)]); 302 + } else if (source == SAHARA_ERRSOURCE_CHA) { 303 + dev_err(dev->device, " * %s.\n", 304 + sahara_cha_errsrc[chasrc]); 305 + dev_err(dev->device, " * %s.\n", 306 + sahara_cha_err[SAHARA_ERRSTATUS_GET_CHAERR(error)]); 307 + } 308 + dev_err(dev->device, "\n"); 309 + } 310 + 311 + static char *sahara_state[4] = { "Idle", "Busy", "Error", "HW Fault" }; 312 + 313 + static void sahara_decode_status(struct sahara_dev *dev, unsigned int status) 314 + { 315 + u8 state; 316 + 317 + if (!IS_ENABLED(DEBUG)) 318 + return; 319 + 320 + state = SAHARA_STATUS_GET_STATE(status); 321 + 322 + dev_dbg(dev->device, "%s: Status Register = 0x%08x\n", 323 + __func__, status); 324 + 325 + dev_dbg(dev->device, " - State = %d:\n", state); 326 + if (state & SAHARA_STATE_COMP_FLAG) 327 + dev_dbg(dev->device, " * Descriptor completed. 
IRQ pending.\n"); 328 + 329 + dev_dbg(dev->device, " * %s.\n", 330 + sahara_state[state & ~SAHARA_STATE_COMP_FLAG]); 331 + 332 + if (status & SAHARA_STATUS_DAR_FULL) 333 + dev_dbg(dev->device, " - DAR Full.\n"); 334 + if (status & SAHARA_STATUS_ERROR) 335 + dev_dbg(dev->device, " - Error.\n"); 336 + if (status & SAHARA_STATUS_SECURE) 337 + dev_dbg(dev->device, " - Secure.\n"); 338 + if (status & SAHARA_STATUS_FAIL) 339 + dev_dbg(dev->device, " - Fail.\n"); 340 + if (status & SAHARA_STATUS_RNG_RESEED) 341 + dev_dbg(dev->device, " - RNG Reseed Request.\n"); 342 + if (status & SAHARA_STATUS_ACTIVE_RNG) 343 + dev_dbg(dev->device, " - RNG Active.\n"); 344 + if (status & SAHARA_STATUS_ACTIVE_MDHA) 345 + dev_dbg(dev->device, " - MDHA Active.\n"); 346 + if (status & SAHARA_STATUS_ACTIVE_SKHA) 347 + dev_dbg(dev->device, " - SKHA Active.\n"); 348 + 349 + if (status & SAHARA_STATUS_MODE_BATCH) 350 + dev_dbg(dev->device, " - Batch Mode.\n"); 351 + else if (status & SAHARA_STATUS_MODE_DEDICATED) 352 + dev_dbg(dev->device, " - Dedicated Mode.\n"); 353 + else if (status & SAHARA_STATUS_MODE_DEBUG) 354 + dev_dbg(dev->device, " - Debug Mode.\n"); 355 + 356 + dev_dbg(dev->device, " - Internal state = 0x%02x\n", 357 + SAHARA_STATUS_GET_ISTATE(status)); 358 + 359 + dev_dbg(dev->device, "Current DAR: 0x%08x\n", 360 + sahara_read(dev, SAHARA_REG_CDAR)); 361 + dev_dbg(dev->device, "Initial DAR: 0x%08x\n\n", 362 + sahara_read(dev, SAHARA_REG_IDAR)); 363 + } 364 + 365 + static void sahara_dump_descriptors(struct sahara_dev *dev) 366 + { 367 + int i; 368 + 369 + if (!IS_ENABLED(DEBUG)) 370 + return; 371 + 372 + for (i = 0; i < SAHARA_MAX_HW_DESC; i++) { 373 + dev_dbg(dev->device, "Descriptor (%d) (0x%08x):\n", 374 + i, dev->hw_phys_desc[i]); 375 + dev_dbg(dev->device, "\thdr = 0x%08x\n", dev->hw_desc[i]->hdr); 376 + dev_dbg(dev->device, "\tlen1 = %u\n", dev->hw_desc[i]->len1); 377 + dev_dbg(dev->device, "\tp1 = 0x%08x\n", dev->hw_desc[i]->p1); 378 + dev_dbg(dev->device, "\tlen2 = %u\n", dev->hw_desc[i]->len2); 379 + dev_dbg(dev->device, "\tp2 = 0x%08x\n", dev->hw_desc[i]->p2); 380 + dev_dbg(dev->device, "\tnext = 0x%08x\n", 381 + dev->hw_desc[i]->next); 382 + } 383 + dev_dbg(dev->device, "\n"); 384 + } 385 + 386 + static void sahara_dump_links(struct sahara_dev *dev) 387 + { 388 + int i; 389 + 390 + if (!IS_ENABLED(DEBUG)) 391 + return; 392 + 393 + for (i = 0; i < SAHARA_MAX_HW_LINK; i++) { 394 + dev_dbg(dev->device, "Link (%d) (0x%08x):\n", 395 + i, dev->hw_phys_link[i]); 396 + dev_dbg(dev->device, "\tlen = %u\n", dev->hw_link[i]->len); 397 + dev_dbg(dev->device, "\tp = 0x%08x\n", dev->hw_link[i]->p); 398 + dev_dbg(dev->device, "\tnext = 0x%08x\n", 399 + dev->hw_link[i]->next); 400 + } 401 + dev_dbg(dev->device, "\n"); 402 + } 403 + 404 + static void sahara_aes_done_task(unsigned long data) 405 + { 406 + struct sahara_dev *dev = (struct sahara_dev *)data; 407 + 408 + dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg, 409 + DMA_TO_DEVICE); 410 + dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg, 411 + DMA_FROM_DEVICE); 412 + 413 + spin_lock(&dev->lock); 414 + clear_bit(FLAGS_BUSY, &dev->flags); 415 + spin_unlock(&dev->lock); 416 + 417 + dev->req->base.complete(&dev->req->base, dev->error); 418 + } 419 + 420 + void sahara_watchdog(unsigned long data) 421 + { 422 + struct sahara_dev *dev = (struct sahara_dev *)data; 423 + unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS); 424 + unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS); 425 + 426 + sahara_decode_status(dev, stat); 427 +
427 + 	sahara_decode_error(dev, err);
428 + 	dev->error = -ETIMEDOUT;
429 + 	sahara_aes_done_task(data);
430 + }
431 + 
432 + static int sahara_hw_descriptor_create(struct sahara_dev *dev)
433 + {
434 + 	struct sahara_ctx *ctx = dev->ctx;
435 + 	struct scatterlist *sg;
436 + 	int ret;
437 + 	int i, j;
438 + 
439 + 	/* Copy new key if necessary */
440 + 	if (ctx->flags & FLAGS_NEW_KEY) {
441 + 		memcpy(dev->key_base, ctx->key, ctx->keylen);
442 + 		ctx->flags &= ~FLAGS_NEW_KEY;
443 + 
444 + 		if (dev->flags & FLAGS_CBC) {
445 + 			dev->hw_desc[0]->len1 = AES_BLOCK_SIZE;
446 + 			dev->hw_desc[0]->p1 = dev->iv_phys_base;
447 + 		} else {
448 + 			dev->hw_desc[0]->len1 = 0;
449 + 			dev->hw_desc[0]->p1 = 0;
450 + 		}
451 + 		dev->hw_desc[0]->len2 = ctx->keylen;
452 + 		dev->hw_desc[0]->p2 = dev->key_phys_base;
453 + 		dev->hw_desc[0]->next = dev->hw_phys_desc[1];
454 + 	}
455 + 	dev->hw_desc[0]->hdr = sahara_aes_key_hdr(dev);
456 + 
457 + 	dev->nb_in_sg = sahara_sg_length(dev->in_sg, dev->total);
458 + 	dev->nb_out_sg = sahara_sg_length(dev->out_sg, dev->total);
459 + 	if ((dev->nb_in_sg + dev->nb_out_sg) > SAHARA_MAX_HW_LINK) {
460 + 		dev_err(dev->device, "not enough hw links (%d)\n",
461 + 			dev->nb_in_sg + dev->nb_out_sg);
462 + 		return -EINVAL;
463 + 	}
464 + 
465 + 	ret = dma_map_sg(dev->device, dev->in_sg, dev->nb_in_sg,
466 + 			 DMA_TO_DEVICE);
467 + 	if (ret != dev->nb_in_sg) {
468 + 		dev_err(dev->device, "couldn't map in sg\n");
469 + 		goto unmap_in;
470 + 	}
471 + 	ret = dma_map_sg(dev->device, dev->out_sg, dev->nb_out_sg,
472 + 			 DMA_FROM_DEVICE);
473 + 	if (ret != dev->nb_out_sg) {
474 + 		dev_err(dev->device, "couldn't map out sg\n");
475 + 		goto unmap_out;
476 + 	}
477 + 
478 + 	/* Create input links */
479 + 	dev->hw_desc[1]->p1 = dev->hw_phys_link[0];
480 + 	sg = dev->in_sg;
481 + 	for (i = 0; i < dev->nb_in_sg; i++) {
482 + 		dev->hw_link[i]->len = sg->length;
483 + 		dev->hw_link[i]->p = sg->dma_address;
484 + 		if (i == (dev->nb_in_sg - 1)) {
485 + 			dev->hw_link[i]->next = 0;
486 + 		} else {
487 + 			dev->hw_link[i]->next = dev->hw_phys_link[i + 1];
488 + 			sg = sg_next(sg);
489 + 		}
490 + 	}
491 + 
492 + 	/* Create output links */
493 + 	dev->hw_desc[1]->p2 = dev->hw_phys_link[i];
494 + 	sg = dev->out_sg;
495 + 	for (j = i; j < dev->nb_out_sg + i; j++) {
496 + 		dev->hw_link[j]->len = sg->length;
497 + 		dev->hw_link[j]->p = sg->dma_address;
498 + 		if (j == (dev->nb_out_sg + i - 1)) {
499 + 			dev->hw_link[j]->next = 0;
500 + 		} else {
501 + 			dev->hw_link[j]->next = dev->hw_phys_link[j + 1];
502 + 			sg = sg_next(sg);
503 + 		}
504 + 	}
505 + 
506 + 	/* Fill remaining fields of hw_desc[1] */
507 + 	dev->hw_desc[1]->hdr = sahara_aes_data_link_hdr(dev);
508 + 	dev->hw_desc[1]->len1 = dev->total;
509 + 	dev->hw_desc[1]->len2 = dev->total;
510 + 	dev->hw_desc[1]->next = 0;
511 + 
512 + 	sahara_dump_descriptors(dev);
513 + 	sahara_dump_links(dev);
514 + 
515 + 	/* Start processing descriptor chain. */
516 + 	mod_timer(&dev->watchdog,
517 + 		  jiffies + msecs_to_jiffies(SAHARA_TIMEOUT_MS));
518 + 	sahara_write(dev, dev->hw_phys_desc[0], SAHARA_REG_DAR);
519 + 
520 + 	return 0;
521 + 
522 + unmap_out:
523 + 	dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg,
524 + 		DMA_FROM_DEVICE);
525 + unmap_in:
526 + 	dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg,
527 + 		DMA_TO_DEVICE);
528 + 
529 + 	return -EINVAL;
530 + }
531 + 
532 + static void sahara_aes_queue_task(unsigned long data)
533 + {
534 + 	struct sahara_dev *dev = (struct sahara_dev *)data;
535 + 	struct crypto_async_request *async_req, *backlog;
536 + 	struct sahara_ctx *ctx;
537 + 	struct sahara_aes_reqctx *rctx;
538 + 	struct ablkcipher_request *req;
539 + 	int ret;
540 + 
541 + 	spin_lock(&dev->lock);
542 + 	backlog = crypto_get_backlog(&dev->queue);
543 + 	async_req = crypto_dequeue_request(&dev->queue);
544 + 	if (!async_req)
545 + 		clear_bit(FLAGS_BUSY, &dev->flags);
546 + 	spin_unlock(&dev->lock);
547 + 
548 + 	if (!async_req)
549 + 		return;
550 + 
551 + 	if (backlog)
552 + 		backlog->complete(backlog, -EINPROGRESS);
553 + 
554 + 	req = ablkcipher_request_cast(async_req);
555 + 
556 + 	/* Request is ready to be dispatched by the device */
557 + 	dev_dbg(dev->device,
558 + 		"dispatch request (nbytes=%d, src=%p, dst=%p)\n",
559 + 		req->nbytes, req->src, req->dst);
560 + 
561 + 	/* assign new request to device */
562 + 	dev->req = req;
563 + 	dev->total = req->nbytes;
564 + 	dev->in_sg = req->src;
565 + 	dev->out_sg = req->dst;
566 + 
567 + 	rctx = ablkcipher_request_ctx(req);
568 + 	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
569 + 	rctx->mode &= FLAGS_MODE_MASK;
570 + 	dev->flags = (dev->flags & ~FLAGS_MODE_MASK) | rctx->mode;
571 + 
572 + 	if ((dev->flags & FLAGS_CBC) && req->info)
573 + 		memcpy(dev->iv_base, req->info, AES_KEYSIZE_128);
574 + 
575 + 	/* assign new context to device */
576 + 	ctx->dev = dev;
577 + 	dev->ctx = ctx;
578 + 
579 + 	ret = sahara_hw_descriptor_create(dev);
580 + 	if (ret < 0) {
581 + 		spin_lock(&dev->lock);
582 + 		clear_bit(FLAGS_BUSY, &dev->flags);
583 + 		spin_unlock(&dev->lock);
584 + 		dev->req->base.complete(&dev->req->base, ret);
585 + 	}
586 + }
587 + 
588 + static int sahara_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
589 + 			     unsigned int keylen)
590 + {
591 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(tfm);
592 + 	int ret;
593 + 
594 + 	ctx->keylen = keylen;
595 + 
596 + 	/* SAHARA only supports 128bit keys */
597 + 	if (keylen == AES_KEYSIZE_128) {
598 + 		memcpy(ctx->key, key, keylen);
599 + 		ctx->flags |= FLAGS_NEW_KEY;
600 + 		return 0;
601 + 	}
602 + 
603 + 	if (keylen != AES_KEYSIZE_128 &&
604 + 	    keylen != AES_KEYSIZE_192 && keylen != AES_KEYSIZE_256)
605 + 		return -EINVAL;
606 + 
607 + 	/*
608 + 	 * The requested key size is not supported by HW, do a fallback.
609 + 	 */
610 + 	ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
611 + 	ctx->fallback->base.crt_flags |=
612 + 		(tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK);
613 + 
614 + 	ret = crypto_ablkcipher_setkey(ctx->fallback, key, keylen);
615 + 	if (ret) {
616 + 		struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm);
617 + 
618 + 		tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK;
619 + 		tfm_aux->crt_flags |=
620 + 			(ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK);
621 + 	}
622 + 	return ret;
623 + }
624 + 
625 + static int sahara_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
626 + {
627 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
628 + 		crypto_ablkcipher_reqtfm(req));
629 + 	struct sahara_aes_reqctx *rctx = ablkcipher_request_ctx(req);
630 + 	struct sahara_dev *dev = dev_ptr;
631 + 	int err = 0;
632 + 	int busy;
633 + 
634 + 	dev_dbg(dev->device, "nbytes: %d, enc: %d, cbc: %d\n",
635 + 		req->nbytes, !!(mode & FLAGS_ENCRYPT), !!(mode & FLAGS_CBC));
636 + 
637 + 	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
638 + 		dev_err(dev->device,
639 + 			"request size is not exact amount of AES blocks\n");
640 + 		return -EINVAL;
641 + 	}
642 + 
643 + 	ctx->dev = dev;
644 + 
645 + 	rctx->mode = mode;
646 + 	spin_lock_bh(&dev->lock);
647 + 	err = ablkcipher_enqueue_request(&dev->queue, req);
648 + 	busy = test_and_set_bit(FLAGS_BUSY, &dev->flags);
649 + 	spin_unlock_bh(&dev->lock);
650 + 
651 + 	if (!busy)
652 + 		tasklet_schedule(&dev->queue_task);
653 + 
654 + 	return err;
655 + }
656 + 
657 + static int sahara_aes_ecb_encrypt(struct ablkcipher_request *req)
658 + {
659 + 	struct crypto_tfm *tfm =
660 + 		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
661 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
662 + 		crypto_ablkcipher_reqtfm(req));
663 + 	int err;
664 + 
665 + 	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
666 + 		ablkcipher_request_set_tfm(req, ctx->fallback);
667 + 		err = crypto_ablkcipher_encrypt(req);
668 + 		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
669 + 		return err;
670 + 	}
671 + 
672 + 	return sahara_aes_crypt(req, FLAGS_ENCRYPT);
673 + }
674 + 
675 + static int sahara_aes_ecb_decrypt(struct ablkcipher_request *req)
676 + {
677 + 	struct crypto_tfm *tfm =
678 + 		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
679 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
680 + 		crypto_ablkcipher_reqtfm(req));
681 + 	int err;
682 + 
683 + 	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
684 + 		ablkcipher_request_set_tfm(req, ctx->fallback);
685 + 		err = crypto_ablkcipher_decrypt(req);
686 + 		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
687 + 		return err;
688 + 	}
689 + 
690 + 	return sahara_aes_crypt(req, 0);
691 + }
692 + 
693 + static int sahara_aes_cbc_encrypt(struct ablkcipher_request *req)
694 + {
695 + 	struct crypto_tfm *tfm =
696 + 		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
697 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
698 + 		crypto_ablkcipher_reqtfm(req));
699 + 	int err;
700 + 
701 + 	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
702 + 		ablkcipher_request_set_tfm(req, ctx->fallback);
703 + 		err = crypto_ablkcipher_encrypt(req);
704 + 		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
705 + 		return err;
706 + 	}
707 + 
708 + 	return sahara_aes_crypt(req, FLAGS_ENCRYPT | FLAGS_CBC);
709 + }
710 + 
711 + static int sahara_aes_cbc_decrypt(struct ablkcipher_request *req)
712 + {
713 + 	struct crypto_tfm *tfm =
714 + 		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
715 + 	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
716 + 		crypto_ablkcipher_reqtfm(req));
717 + 	int err;
718 + 
719 + 	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
720 + 		ablkcipher_request_set_tfm(req, ctx->fallback);
721 + 		err = crypto_ablkcipher_decrypt(req);
722 + 		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
723 + 		return err;
724 + 	}
725 + 
726 + 	return sahara_aes_crypt(req, FLAGS_CBC);
727 + }
728 + 
729 + static int sahara_aes_cra_init(struct crypto_tfm *tfm)
730 + {
731 + 	const char *name = tfm->__crt_alg->cra_name;
732 + 	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
733 + 
734 + 	ctx->fallback = crypto_alloc_ablkcipher(name, 0,
735 + 			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
736 + 	if (IS_ERR(ctx->fallback)) {
737 + 		pr_err("Error allocating fallback algo %s\n", name);
738 + 		return PTR_ERR(ctx->fallback);
739 + 	}
740 + 
741 + 	tfm->crt_ablkcipher.reqsize = sizeof(struct sahara_aes_reqctx);
742 + 
743 + 	return 0;
744 + }
745 + 
746 + static void sahara_aes_cra_exit(struct crypto_tfm *tfm)
747 + {
748 + 	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
749 + 
750 + 	if (ctx->fallback)
751 + 		crypto_free_ablkcipher(ctx->fallback);
752 + 	ctx->fallback = NULL;
753 + }
754 + 
755 + static struct crypto_alg aes_algs[] = {
756 + {
757 + 	.cra_name		= "ecb(aes)",
758 + 	.cra_driver_name	= "sahara-ecb-aes",
759 + 	.cra_priority		= 300,
760 + 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
761 + 				  CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
762 + 	.cra_blocksize		= AES_BLOCK_SIZE,
763 + 	.cra_ctxsize		= sizeof(struct sahara_ctx),
764 + 	.cra_alignmask		= 0x0,
765 + 	.cra_type		= &crypto_ablkcipher_type,
766 + 	.cra_module		= THIS_MODULE,
767 + 	.cra_init		= sahara_aes_cra_init,
768 + 	.cra_exit		= sahara_aes_cra_exit,
769 + 	.cra_u.ablkcipher = {
770 + 		.min_keysize	= AES_MIN_KEY_SIZE,
771 + 		.max_keysize	= AES_MAX_KEY_SIZE,
772 + 		.setkey		= sahara_aes_setkey,
773 + 		.encrypt	= sahara_aes_ecb_encrypt,
774 + 		.decrypt	= sahara_aes_ecb_decrypt,
775 + 	}
776 + }, {
777 + 	.cra_name		= "cbc(aes)",
778 + 	.cra_driver_name	= "sahara-cbc-aes",
779 + 	.cra_priority		= 300,
780 + 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
781 + 				  CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
782 + 	.cra_blocksize		= AES_BLOCK_SIZE,
783 + 	.cra_ctxsize		= sizeof(struct sahara_ctx),
784 + 	.cra_alignmask		= 0x0,
785 + 	.cra_type		= &crypto_ablkcipher_type,
786 + 	.cra_module		= THIS_MODULE,
787 + 	.cra_init		= sahara_aes_cra_init,
788 + 	.cra_exit		= sahara_aes_cra_exit,
789 + 	.cra_u.ablkcipher = {
790 + 		.min_keysize	= AES_MIN_KEY_SIZE,
791 + 		.max_keysize	= AES_MAX_KEY_SIZE,
792 + 		.ivsize		= AES_BLOCK_SIZE,
793 + 		.setkey		= sahara_aes_setkey,
794 + 		.encrypt	= sahara_aes_cbc_encrypt,
795 + 		.decrypt	= sahara_aes_cbc_decrypt,
796 + 	}
797 + }
798 + };
799 + 
800 + static irqreturn_t sahara_irq_handler(int irq, void *data)
801 + {
802 + 	struct sahara_dev *dev = (struct sahara_dev *)data;
803 + 	unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS);
804 + 	unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS);
805 + 
806 + 	del_timer(&dev->watchdog);
807 + 
808 + 	sahara_write(dev, SAHARA_CMD_CLEAR_INT | SAHARA_CMD_CLEAR_ERR,
809 + 		     SAHARA_REG_CMD);
810 + 
811 + 	sahara_decode_status(dev, stat);
812 + 
813 + 	if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_BUSY) {
814 + 		return IRQ_NONE;
815 + 	} else if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_COMPLETE) {
816 + 		dev->error = 0;
817 + 	} else {
818 + 		sahara_decode_error(dev, err);
819 + 		dev->error = -EINVAL;
820 + 	}
821 + 
822 + 	tasklet_schedule(&dev->done_task);
823 + 
824 + 	return IRQ_HANDLED;
825 + }
826 + 
827 + 
828 + static int sahara_register_algs(struct sahara_dev *dev)
829 + {
830 + 	int err, i, j;
831 + 
832 + 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
833 + 		INIT_LIST_HEAD(&aes_algs[i].cra_list);
834 + 		err = crypto_register_alg(&aes_algs[i]);
835 + 		if (err)
836 + 			goto err_aes_algs;
837 + 	}
838 + 
839 + 	return 0;
840 + 
841 + err_aes_algs:
842 + 	for (j = 0; j < i; j++)
843 + 		crypto_unregister_alg(&aes_algs[j]);
844 + 
845 + 	return err;
846 + }
847 + 
848 + static void sahara_unregister_algs(struct sahara_dev *dev)
849 + {
850 + 	int i;
851 + 
852 + 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
853 + 		crypto_unregister_alg(&aes_algs[i]);
854 + }
855 + 
856 + static struct platform_device_id sahara_platform_ids[] = {
857 + 	{ .name = "sahara-imx27" },
858 + 	{ /* sentinel */ }
859 + };
860 + MODULE_DEVICE_TABLE(platform, sahara_platform_ids);
861 + 
862 + static struct of_device_id sahara_dt_ids[] = {
863 + 	{ .compatible = "fsl,imx27-sahara" },
864 + 	{ /* sentinel */ }
865 + };
866 + MODULE_DEVICE_TABLE(of, sahara_dt_ids);
867 + 
868 + static int sahara_probe(struct platform_device *pdev)
869 + {
870 + 	struct sahara_dev *dev;
871 + 	struct resource *res;
872 + 	u32 version;
873 + 	int irq;
874 + 	int err;
875 + 	int i;
876 + 
877 + 	dev = devm_kzalloc(&pdev->dev, sizeof(struct sahara_dev), GFP_KERNEL);
878 + 	if (dev == NULL) {
879 + 		dev_err(&pdev->dev, "unable to alloc data struct.\n");
880 + 		return -ENOMEM;
881 + 	}
882 + 
883 + 	dev->device = &pdev->dev;
884 + 	platform_set_drvdata(pdev, dev);
885 + 
886 + 	/* Get the base address */
887 + 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
888 + 	if (!res) {
889 + 		dev_err(&pdev->dev, "failed to get memory region resource\n");
890 + 		return -ENODEV;
891 + 	}
892 + 
893 + 	if (devm_request_mem_region(&pdev->dev, res->start,
894 + 			resource_size(res), SAHARA_NAME) == NULL) {
895 + 		dev_err(&pdev->dev, "failed to request memory region\n");
896 + 		return -ENOENT;
897 + 	}
898 + 	dev->regs_base = devm_ioremap(&pdev->dev, res->start,
899 + 				      resource_size(res));
900 + 	if (!dev->regs_base) {
901 + 		dev_err(&pdev->dev, "failed to ioremap address region\n");
902 + 		return -ENOENT;
903 + 	}
904 + 
905 + 	/* Get the IRQ */
906 + 	irq = platform_get_irq(pdev, 0);
907 + 	if (irq < 0) {
908 + 		dev_err(&pdev->dev, "failed to get irq resource\n");
909 + 		return irq;
910 + 	}
911 + 
912 + 	if (devm_request_irq(&pdev->dev, irq, sahara_irq_handler,
913 + 		0, SAHARA_NAME, dev) < 0) {
914 + 		dev_err(&pdev->dev, "failed to request irq\n");
915 + 		return -ENOENT;
916 + 	}
917 + 
918 + 	/* clocks */
919 + 	dev->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
920 + 	if (IS_ERR(dev->clk_ipg)) {
921 + 		dev_err(&pdev->dev, "Could not get ipg clock\n");
922 + 		return PTR_ERR(dev->clk_ipg);
923 + 	}
924 + 
925 + 	dev->clk_ahb = devm_clk_get(&pdev->dev, "ahb");
926 + 	if (IS_ERR(dev->clk_ahb)) {
927 + 		dev_err(&pdev->dev, "Could not get ahb clock\n");
928 + 		return PTR_ERR(dev->clk_ahb);
929 + 	}
930 + 
931 + 	/* Allocate HW descriptors */
932 + 	dev->hw_desc[0] = dma_alloc_coherent(&pdev->dev,
933 + 			SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
934 + 			&dev->hw_phys_desc[0], GFP_KERNEL);
935 + 	if (!dev->hw_desc[0]) {
936 + 		dev_err(&pdev->dev, "Could not allocate hw descriptors\n");
937 + 		return -ENOMEM;
938 + 	}
939 + 	dev->hw_desc[1] = dev->hw_desc[0] + 1;
940 + 	dev->hw_phys_desc[1] = dev->hw_phys_desc[0] +
941 + 			sizeof(struct sahara_hw_desc);
942 + 
943 + 	/* Allocate space for iv and key */
944 + 	dev->key_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128,
945 + 				&dev->key_phys_base, GFP_KERNEL);
946 + 	if (!dev->key_base) {
947 + 		dev_err(&pdev->dev, "Could not allocate memory for key\n");
948 + 		err = -ENOMEM;
949 + 		goto err_key;
950 + 	}
951 + 	dev->iv_base = dev->key_base + AES_KEYSIZE_128;
952 + 	dev->iv_phys_base = dev->key_phys_base + AES_KEYSIZE_128;
953 + 
954 + 	/* Allocate space for HW links */
955 + 	dev->hw_link[0] = dma_alloc_coherent(&pdev->dev,
956 + 			SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
957 + 			&dev->hw_phys_link[0], GFP_KERNEL);
958 + 	if (!dev->hw_link[0]) {
959 + 		dev_err(&pdev->dev, "Could not allocate hw links\n");
960 + 		err = -ENOMEM;
961 + 		goto err_link;
962 + 	}
963 + 	for (i = 1; i < SAHARA_MAX_HW_LINK; i++) {
964 + 		dev->hw_phys_link[i] = dev->hw_phys_link[i - 1] +
965 + 					sizeof(struct sahara_hw_link);
966 + 		dev->hw_link[i] = dev->hw_link[i - 1] + 1;
967 + 	}
968 + 
969 + 	crypto_init_queue(&dev->queue, SAHARA_QUEUE_LENGTH);
970 + 
971 + 	dev_ptr = dev;
972 + 
973 + 	tasklet_init(&dev->queue_task, sahara_aes_queue_task,
974 + 		     (unsigned long)dev);
975 + 	tasklet_init(&dev->done_task, sahara_aes_done_task,
976 + 		     (unsigned long)dev);
977 + 
978 + 	init_timer(&dev->watchdog);
979 + 	dev->watchdog.function = &sahara_watchdog;
980 + 	dev->watchdog.data = (unsigned long)dev;
981 + 
982 + 	clk_prepare_enable(dev->clk_ipg);
983 + 	clk_prepare_enable(dev->clk_ahb);
984 + 
985 + 	version = sahara_read(dev, SAHARA_REG_VERSION);
986 + 	if (version != SAHARA_VERSION_3) {
987 + 		dev_err(&pdev->dev, "SAHARA version %d not supported\n",
988 + 			version);
989 + 		err = -ENODEV;
990 + 		goto err_algs;
991 + 	}
992 + 
993 + 	sahara_write(dev, SAHARA_CMD_RESET | SAHARA_CMD_MODE_BATCH,
994 + 		     SAHARA_REG_CMD);
995 + 	sahara_write(dev, SAHARA_CONTROL_SET_THROTTLE(0) |
996 + 			SAHARA_CONTROL_SET_MAXBURST(8) |
997 + 			SAHARA_CONTROL_RNG_AUTORSD |
998 + 			SAHARA_CONTROL_ENABLE_INT,
999 + 			SAHARA_REG_CONTROL);
1000 + 
1001 + 	err = sahara_register_algs(dev);
1002 + 	if (err)
1003 + 		goto err_algs;
1004 + 
1005 + 	dev_info(&pdev->dev, "SAHARA version %d initialized\n", version);
1006 + 
1007 + 	return 0;
1008 + 
1009 + err_algs:
1010 + 	dma_free_coherent(&pdev->dev,
1011 + 			SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
1012 + 			dev->hw_link[0], dev->hw_phys_link[0]);
1013 + 	clk_disable_unprepare(dev->clk_ipg);
1014 + 	clk_disable_unprepare(dev->clk_ahb);
1015 + 	dev_ptr = NULL;
1016 + err_link:
1017 + 	dma_free_coherent(&pdev->dev,
1018 + 			2 * AES_KEYSIZE_128,
1019 + 			dev->key_base, dev->key_phys_base);
1020 + err_key:
1021 + 	dma_free_coherent(&pdev->dev,
1022 + 			SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
1023 + 			dev->hw_desc[0], dev->hw_phys_desc[0]);
1024 + 
1025 + 	return err;
1026 + }
1027 + 
1028 + static int sahara_remove(struct platform_device *pdev)
1029 + {
1030 + 	struct sahara_dev *dev = platform_get_drvdata(pdev);
1031 + 
1032 + 	dma_free_coherent(&pdev->dev,
1033 + 			SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
1034 + 			dev->hw_link[0], dev->hw_phys_link[0]);
1035 + 	dma_free_coherent(&pdev->dev,
1036 + 			2 * AES_KEYSIZE_128,
1037 + 			dev->key_base, dev->key_phys_base);
1038 + 	dma_free_coherent(&pdev->dev,
1039 + 			SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
1040 + 			dev->hw_desc[0], dev->hw_phys_desc[0]);
1041 + 
1042 + 	tasklet_kill(&dev->done_task);
1043 + 	tasklet_kill(&dev->queue_task);
1044 + 
1045 + 	sahara_unregister_algs(dev);
1046 + 
1047 + 	clk_disable_unprepare(dev->clk_ipg);
1048 + 	clk_disable_unprepare(dev->clk_ahb);
1049 + 
1050 + 	dev_ptr = NULL;
1051 + 
1052 + 	return 0;
1053 + }
1054 + 
1055 + static struct platform_driver sahara_driver = {
1056 + 	.probe		= sahara_probe,
1057 + 	.remove		= sahara_remove,
1058 + 	.driver		= {
1059 + 		.name	= SAHARA_NAME,
1060 + 		.owner	= THIS_MODULE,
1061 + 		.of_match_table = of_match_ptr(sahara_dt_ids),
1062 + 	},
1063 + 	.id_table	= sahara_platform_ids,
1064 + };
1065 + 
1066 + module_platform_driver(sahara_driver);
1067 + 
1068 + MODULE_LICENSE("GPL");
1069 + MODULE_AUTHOR("Javier Martin <javier.martin@vista-silicon.com>");
1070 + MODULE_DESCRIPTION("SAHARA2 HW crypto accelerator");
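
For context: the two aes_algs entries register with cra_priority 300, so once this driver loads on an i.MX27, an in-kernel request for "ecb(aes)" or "cbc(aes)" should resolve to the SAHARA implementation rather than the generic software cipher. Below is a minimal sketch of such a consumer, not part of the commit; the function names and the fixed 128-bit key are assumptions made for the example, and completion-status handling is simplified.

	#include <linux/completion.h>
	#include <linux/crypto.h>
	#include <linux/err.h>
	#include <linux/scatterlist.h>

	static void example_done(struct crypto_async_request *req, int err)
	{
		complete(req->data);
	}

	/* Encrypt one buffer in place with the highest-priority "cbc(aes)";
	 * on i.MX27 that should now be "sahara-cbc-aes". */
	static int example_cbc_aes_encrypt(void *buf, unsigned int len,
					   const u8 key[16], u8 iv[16])
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct crypto_ablkcipher *tfm;
		struct ablkcipher_request *req;
		struct scatterlist sg;
		int ret;

		tfm = crypto_alloc_ablkcipher("cbc(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_ablkcipher_setkey(tfm, key, 16);
		if (ret)
			goto out_tfm;

		req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			ret = -ENOMEM;
			goto out_tfm;
		}

		/* len must be a whole number of AES blocks; sahara_aes_crypt()
		 * above rejects anything else with -EINVAL. */
		sg_init_one(&sg, buf, len);
		ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
						example_done, &done);
		ablkcipher_request_set_crypt(req, &sg, &sg, len, iv);

		ret = crypto_ablkcipher_encrypt(req);
		if (ret == -EINPROGRESS || ret == -EBUSY) {
			/* the driver completes asynchronously from its tasklet */
			wait_for_completion(&done);
			ret = 0;
		}

		ablkcipher_request_free(req);
	out_tfm:
		crypto_free_ablkcipher(tfm);
		return ret;
	}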
+3 -3
drivers/crypto/ux500/hash/hash_core.c
··· 938 938 	if (!ctx->device->dma.nents) {
939 939 		dev_err(device_data->dev, "[%s] "
940 940 				"ctx->device->dma.nents = 0", __func__);
941 + 		ret = ctx->device->dma.nents;
941 942 		goto out;
942 943 	}
943 944 
··· 946 945 	if (bytes_written != req->nbytes) {
947 946 		dev_err(device_data->dev, "[%s] "
948 947 				"hash_dma_write() failed!", __func__);
948 + 		ret = bytes_written;
949 949 		goto out;
950 950 	}
951 951 
··· 1369 1367 	/**
1370 1368 	 * Freed in final.
1371 1369 	 */
1372 - 	ctx->key = kmalloc(keylen, GFP_KERNEL);
1370 + 	ctx->key = kmemdup(key, keylen, GFP_KERNEL);
1373 1371 	if (!ctx->key) {
1374 1372 		pr_err(DEV_DBG_NAME " [%s] Failed to allocate ctx->key "
1375 1373 		       "for %d\n", __func__, alg);
1376 1374 		return -ENOMEM;
1377 1375 	}
1378 - 
1379 - 	memcpy(ctx->key, key, keylen);
1380 1376 	ctx->keylen = keylen;
1381 1377 
1382 1378 	return ret;
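
The first two hunks fix the same shape of bug: ret is initialised to 0 at the top of the function, and the error branches jumped to out without ever storing an error value, so callers saw success. A hypothetical reduction of the pattern, for illustration only:

	/* demo only -- helper and values are invented */
	static int demo_path(int nents)
	{
		int ret = 0;

		if (nents <= 0) {
			/* before the fix this was a bare "goto out",
			 * which returned the stale 0 (success) */
			ret = nents ? nents : -EINVAL;
			goto out;
		}

		/* ... further work that sets ret on failure ... */
	out:
		return ret;
	}

The third hunk is behaviour-neutral: kmemdup() simply collapses the kmalloc()+memcpy() pair into one call.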
+5
include/crypto/sha.h
··· 87 87 extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
88 88 			      unsigned int len);
89 89 
90 + extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
91 + 			      unsigned int len);
92 + 
93 + extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
94 + 			      unsigned int len);
90 95 #endif
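
These exports exist so the new SSSE3/AVX/AVX2 glue code can reuse the generic C transform as a fallback when SIMD registers are unavailable, e.g. in hard-IRQ context. A sketch of the pattern, modelled on the existing sha1 glue; the SIMD body is elided (it calls back into the generic code here only so the sketch stays compilable):

	#include <asm/i387.h>		/* irq_fpu_usable(), kernel_fpu_begin() */
	#include <crypto/internal/hash.h>
	#include <crypto/sha.h>

	static int sha256_simd_update(struct shash_desc *desc, const u8 *data,
				      unsigned int len)
	{
		int ret;

		if (!irq_fpu_usable())
			/* SIMD unusable: fall back to the generic C code,
			 * now reachable via <crypto/sha.h> */
			return crypto_sha256_update(desc, data, len);

		kernel_fpu_begin();
		/* the real glue feeds whole blocks to the SSSE3/AVX
		 * transform at this point */
		ret = crypto_sha256_update(desc, data, len);
		kernel_fpu_end();

		return ret;
	}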
-22
include/linux/platform_data/atmel-aes.h
··· 1 - #ifndef __LINUX_ATMEL_AES_H
2 - #define __LINUX_ATMEL_AES_H
3 - 
4 - #include <linux/platform_data/dma-atmel.h>
5 - 
6 - /**
7 -  * struct aes_dma_data - DMA data for AES
8 -  */
9 - struct aes_dma_data {
10 - 	struct at_dma_slave	txdata;
11 - 	struct at_dma_slave	rxdata;
12 - };
13 - 
14 - /**
15 -  * struct aes_platform_data - board-specific AES configuration
16 -  * @dma_slave: DMA slave interface to use in data transfers.
17 -  */
18 - struct aes_platform_data {
19 - 	struct aes_dma_data	*dma_slave;
20 - };
21 - 
22 - #endif /* __LINUX_ATMEL_AES_H */
+22
include/linux/platform_data/crypto-atmel.h
··· 1 + #ifndef __LINUX_CRYPTO_ATMEL_H
2 + #define __LINUX_CRYPTO_ATMEL_H
3 + 
4 + #include <linux/platform_data/dma-atmel.h>
5 + 
6 + /**
7 +  * struct crypto_dma_data - DMA data for AES/TDES/SHA
8 +  */
9 + struct crypto_dma_data {
10 + 	struct at_dma_slave	txdata;
11 + 	struct at_dma_slave	rxdata;
12 + };
13 + 
14 + /**
15 +  * struct crypto_platform_data - board-specific AES/TDES/SHA configuration
16 +  * @dma_slave: DMA slave interface to use in data transfers.
17 +  */
18 + struct crypto_platform_data {
19 + 	struct crypto_dma_data	*dma_slave;
20 + };
21 + 
22 + #endif /* __LINUX_CRYPTO_ATMEL_H */
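
The rename is mechanical: boards that previously instantiated aes_platform_data fill the shared structure once and can hand it to the AES, TDES and SHA drivers alike. A hypothetical board-file fragment, with placeholder DMA slave configuration values:

	#include <linux/platform_data/crypto-atmel.h>

	/* hypothetical board support code; .cfg values are placeholders */
	static struct crypto_dma_data board_crypto_dma = {
		.txdata = { .cfg = 0 },
		.rxdata = { .cfg = 0 },
	};

	static struct crypto_platform_data board_crypto_pdata = {
		.dma_slave = &board_crypto_dma,
	};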
-5
include/linux/timeriomem-rng.h
··· 8 8  * published by the Free Software Foundation.
9 9  */
10 10 
11 - #include <linux/completion.h>
12 - 
13 11 struct timeriomem_rng_data {
14 - 	struct completion	completion;
15 - 	unsigned int		present:1;
16 - 
17 12 	void __iomem		*address;
18 13 
19 14 	/* measures in usecs */
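
Dropping the completion and the present bit from the platform data is what enables multiple instances: that state was effectively driver-global bookkeeping, and it now lives in the driver's per-device structure. Registering two samplers then becomes two platform devices; the addresses and periods below are invented for the example:

	#include <linux/io.h>
	#include <linux/platform_device.h>
	#include <linux/timeriomem-rng.h>

	/* hypothetical board code: two independent RNG sampling windows */
	static struct timeriomem_rng_data board_rng0_data = {
		.period	= 1000000,	/* usecs between samples */
	};

	static struct timeriomem_rng_data board_rng1_data = {
		.period	= 500000,
	};

	static struct platform_device board_rng_devs[] = {
		{
			.name	= "timeriomem_rng",
			.id	= 0,
			.dev	= { .platform_data = &board_rng0_data },
		}, {
			.name	= "timeriomem_rng",
			.id	= 1,
			.dev	= { .platform_data = &board_rng1_data },
		},
	};

	static int __init board_rng_init(void)
	{
		/* .address must be an ioremap()ed, 4-byte aligned word;
		 * the physical addresses here are made up */
		board_rng0_data.address = ioremap(0x10000044, 4);
		board_rng1_data.address = ioremap(0x10000048, 4);
		if (!board_rng0_data.address || !board_rng1_data.address)
			return -ENOMEM;

		platform_device_register(&board_rng_devs[0]);
		platform_device_register(&board_rng_devs[1]);
		return 0;
	}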
+13
net/xfrm/xfrm_algo.c
··· 311 311 		.sadb_alg_maxbits = 128
312 312 	}
313 313 },
314 + {
315 + 	/* rfc4494 */
316 + 	.name = "cmac(aes)",
317 + 
318 + 	.uinfo = {
319 + 		.auth = {
320 + 			.icv_truncbits = 96,
321 + 			.icv_fullbits = 128,
322 + 		}
323 + 	},
324 + 
325 + 	.pfkey_supported = 0,
326 + },
314 327 };
315 328 
316 329 static struct xfrm_algo_desc ealg_list[] = {
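
With pfkey_supported set to 0, AES-CMAC-96 is reachable only through the xfrm netlink interface (with a sufficiently recent iproute2, something like: ip xfrm state add ... auth-trunc "cmac(aes)" <key> 96 ...); PF_KEY defines no identifier for it. On the kernel side the name resolves to the new generic CMAC template. A hedged sketch of computing an rfc4494 tag directly, using only the shash API of this era (function name and truncation handling are illustrative):

	#include <crypto/hash.h>
	#include <linux/err.h>
	#include <linux/string.h>

	/* rfc4494 keeps the leftmost 96 bits of the 128-bit CMAC, which is
	 * exactly what icv_truncbits = 96 above encodes. */
	static int example_cmac96(const u8 *key, unsigned int keylen,
				  const u8 *msg, unsigned int len, u8 tag[12])
	{
		struct crypto_shash *tfm;
		u8 full[16];
		int ret;

		tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_shash_setkey(tfm, key, keylen);
		if (!ret) {
			/* on-stack descriptor with the tfm's request context
			 * (SHASH_DESC_ON_STACK does not exist yet here) */
			struct {
				struct shash_desc shash;
				char ctx[crypto_shash_descsize(tfm)];
			} desc;

			desc.shash.tfm = tfm;
			desc.shash.flags = 0;
			ret = crypto_shash_digest(&desc.shash, msg, len, full);
		}

		crypto_free_shash(tfm);

		if (!ret)
			memcpy(tag, full, 12);	/* leftmost 96 bits */
		return ret;
	}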