Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux into next

Pull arm64 updates from Catalin Marinas:
- Optimised assembly string/memory routines (based on the AArch64
  Cortex Strings library contributed to glibc but re-licensed under
  GPLv2)
- Optimised crypto algorithms making use of the ARMv8 crypto extensions
  (together with kernel API for using FPSIMD instructions in interrupt
  context; a short usage sketch follows this list)
- Ftrace support
- CPU topology parsing from DT
- ESR_EL1 (Exception Syndrome Register) exposed to user space signal
  handlers for SIGSEGV/SIGBUS (useful to emulation tools like Qemu)
- 1GB section linear mapping if applicable
- Barriers usage clean-up
- Default pgprot clean-up
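
The FPSIMD-in-interrupt-context API mentioned in the crypto bullet is the kernel_neon_begin_partial()/kernel_neon_end() pair that the AES glue code further down in this diff relies on. A minimal sketch of the calling pattern; the helper below and its inline asm are illustrative only, not code from this pull:

#include <linux/types.h>
#include <asm/neon.h>

/*
 * Illustrative only: XOR one 16-byte block using NEON. Bracketing the
 * FPSIMD code with kernel_neon_begin_partial(n)/kernel_neon_end() makes
 * it legal in interrupt context; only the first n NEON registers are
 * saved and restored, which keeps the cost low.
 */
static void xor_block_neon(u8 *dst, const u8 *src)
{
	kernel_neon_begin_partial(2);		/* only v0 and v1 are touched */

	asm volatile("ld1	{v0.16b}, [%0]		\n"
		     "ld1	{v1.16b}, [%1]		\n"
		     "eor	v0.16b, v0.16b, v1.16b	\n"
		     "st1	{v0.16b}, [%0]		\n"
		     : : "r"(dst), "r"(src) : "v0", "v1", "memory");

	kernel_neon_end();
}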

Conflicts as per Catalin.

* tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (57 commits)
arm64: kernel: initialize broadcast hrtimer based clock event device
arm64: ftrace: Add system call tracepoint
arm64: ftrace: Add CALLER_ADDRx macros
arm64: ftrace: Add dynamic ftrace support
arm64: Add ftrace support
ftrace: Add arm64 support to recordmcount
arm64: Add 'notrace' attribute to unwind_frame() for ftrace
arm64: add __ASSEMBLY__ in asm/insn.h
arm64: Fix linker script entry point
arm64: lib: Implement optimized string length routines
arm64: lib: Implement optimized string compare routines
arm64: lib: Implement optimized memcmp routine
arm64: lib: Implement optimized memset routine
arm64: lib: Implement optimized memmove routine
arm64: lib: Implement optimized memcpy routine
arm64: defconfig: enable a few more common/useful options in defconfig
ftrace: Make CALLER_ADDRx macros more generic
arm64: Fix deadlock scenario with smp_send_stop()
arm64: Fix machine_shutdown() definition
arm64: Support arch_irq_work_raise() via self IPIs
...

+6391 -605
+1 -9
arch/arm/include/asm/ftrace.h
···
  #endif

- #define HAVE_ARCH_CALLER_ADDR
-
- #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
- #define CALLER_ADDR1 ((unsigned long)return_address(1))
- #define CALLER_ADDR2 ((unsigned long)return_address(2))
- #define CALLER_ADDR3 ((unsigned long)return_address(3))
- #define CALLER_ADDR4 ((unsigned long)return_address(4))
- #define CALLER_ADDR5 ((unsigned long)return_address(5))
- #define CALLER_ADDR6 ((unsigned long)return_address(6))
+ #define ftrace_return_addr(n) return_address(n)

  #endif /* ifndef __ASSEMBLY__ */
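
This hunk is the arch/arm side of the "ftrace: Make CALLER_ADDRx macros more generic" commit listed above: the architecture now supplies a single address-lookup hook instead of spelling out every CALLER_ADDRx macro itself. A rough sketch of the resulting shape; the generic-header half below is an assumed paraphrase of the pattern, not text from this pull:

/* arch header (the hunk above): route the hook to the arch unwinder */
#define ftrace_return_addr(n)	return_address(n)

/* generic header (assumed shape): fall back to the compiler builtin and
 * build the CALLER_ADDRx values on top of whichever hook is in effect */
#ifndef ftrace_return_addr
#define ftrace_return_addr(n)	__builtin_return_address(n)
#endif

#define CALLER_ADDR1	((unsigned long)ftrace_return_addr(1))
#define CALLER_ADDR2	((unsigned long)ftrace_return_addr(2))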
+12
arch/arm64/Kconfig
···
  select HAVE_ARCH_JUMP_LABEL
  select HAVE_ARCH_KGDB
  select HAVE_ARCH_TRACEHOOK
+ select HAVE_C_RECORDMCOUNT
  select HAVE_DEBUG_BUGVERBOSE
  select HAVE_DEBUG_KMEMLEAK
  select HAVE_DMA_API_DEBUG
  select HAVE_DMA_ATTRS
  select HAVE_DMA_CONTIGUOUS
+ select HAVE_DYNAMIC_FTRACE
  select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_TRACER
+ select HAVE_FUNCTION_GRAPH_TRACER
  select HAVE_GENERIC_DMA_COHERENT
  select HAVE_HW_BREAKPOINT if PERF_EVENTS
  select HAVE_MEMBLOCK
···
  select HAVE_PERF_EVENTS
  select HAVE_PERF_REGS
  select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_SYSCALL_TRACEPOINTS
  select IRQ_DOMAIN
  select MODULES_USE_ELF_RELA
  select NO_BOOTMEM
···
  config HAVE_ARCH_TRANSPARENT_HUGEPAGE
  	def_bool y

+ config ARCH_HAS_CACHE_LINE_SIZE
+ 	def_bool y
+
  source "mm/Kconfig"

  config XEN_DOM0
···
  source "security/Kconfig"

  source "crypto/Kconfig"
+ if CRYPTO
+ source "arch/arm64/crypto/Kconfig"
+ endif

  source "lib/Kconfig"
+1
arch/arm64/Makefile
···
  core-y += arch/arm64/kernel/ arch/arm64/mm/
  core-$(CONFIG_KVM) += arch/arm64/kvm/
  core-$(CONFIG_XEN) += arch/arm64/xen/
+ core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
  libs-y := arch/arm64/lib/ $(libs-y)
  libs-y += $(LIBGCC)
+25 -15
arch/arm64/configs/defconfig
···
  # CONFIG_LOCALVERSION_AUTO is not set
- # CONFIG_SWAP is not set
  CONFIG_SYSVIPC=y
  CONFIG_POSIX_MQUEUE=y
+ CONFIG_AUDIT=y
+ CONFIG_NO_HZ_IDLE=y
+ CONFIG_HIGH_RES_TIMERS=y
  CONFIG_BSD_PROCESS_ACCT=y
  CONFIG_BSD_PROCESS_ACCT_V3=y
- CONFIG_NO_HZ=y
- CONFIG_HIGH_RES_TIMERS=y
  CONFIG_IKCONFIG=y
  CONFIG_IKCONFIG_PROC=y
  CONFIG_LOG_BUF_SHIFT=14
···
  CONFIG_ARCH_XGENE=y
  CONFIG_SMP=y
  CONFIG_PREEMPT=y
+ CONFIG_TRANSPARENT_HUGEPAGE=y
  CONFIG_CMA=y
  CONFIG_CMDLINE="console=ttyAMA0"
  # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
···
  CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
  CONFIG_DEVTMPFS=y
  CONFIG_DMA_CMA=y
- CONFIG_SCSI=y
+ CONFIG_VIRTIO_BLK=y
  # CONFIG_SCSI_PROC_FS is not set
  CONFIG_BLK_DEV_SD=y
  # CONFIG_SCSI_LOWLEVEL is not set
···
  CONFIG_SMSC911X=y
  # CONFIG_WLAN is not set
  CONFIG_INPUT_EVDEV=y
- # CONFIG_SERIO_I8042 is not set
  # CONFIG_SERIO_SERPORT is not set
  CONFIG_LEGACY_PTY_COUNT=16
  CONFIG_SERIAL_8250=y
  CONFIG_SERIAL_8250_CONSOLE=y
- CONFIG_SERIAL_OF_PLATFORM=y
  CONFIG_SERIAL_AMBA_PL011=y
  CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+ CONFIG_SERIAL_OF_PLATFORM=y
  # CONFIG_HW_RANDOM is not set
  # CONFIG_HWMON is not set
  CONFIG_REGULATOR=y
  CONFIG_REGULATOR_FIXED_VOLTAGE=y
  CONFIG_FB=y
- # CONFIG_VGA_CONSOLE is not set
  CONFIG_FRAMEBUFFER_CONSOLE=y
  CONFIG_LOGO=y
  # CONFIG_LOGO_LINUX_MONO is not set
···
  CONFIG_USB_STORAGE=y
  CONFIG_MMC=y
  CONFIG_MMC_ARMMMCI=y
+ CONFIG_VIRTIO_MMIO=y
  # CONFIG_IOMMU_SUPPORT is not set
  CONFIG_EXT2_FS=y
  CONFIG_EXT3_FS=y
- CONFIG_EXT4_FS=y
  # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
  # CONFIG_EXT3_FS_XATTR is not set
+ CONFIG_EXT4_FS=y
  CONFIG_FUSE_FS=y
  CONFIG_CUSE=y
  CONFIG_VFAT_FS=y
  CONFIG_TMPFS=y
+ CONFIG_HUGETLBFS=y
  # CONFIG_MISC_FILESYSTEMS is not set
  CONFIG_NFS_FS=y
  CONFIG_ROOT_NFS=y
  CONFIG_NLS_CODEPAGE_437=y
  CONFIG_NLS_ISO8859_1=y
- CONFIG_MAGIC_SYSRQ=y
- CONFIG_DEBUG_FS=y
- CONFIG_DEBUG_KERNEL=y
- # CONFIG_SCHED_DEBUG is not set
+ CONFIG_VIRTUALIZATION=y
+ CONFIG_KVM=y
  CONFIG_DEBUG_INFO=y
+ CONFIG_DEBUG_FS=y
+ CONFIG_MAGIC_SYSRQ=y
+ CONFIG_DEBUG_KERNEL=y
+ CONFIG_LOCKUP_DETECTOR=y
+ # CONFIG_SCHED_DEBUG is not set
  # CONFIG_FTRACE is not set
- CONFIG_ATOMIC64_SELFTEST=y
- CONFIG_VIRTIO_MMIO=y
- CONFIG_VIRTIO_BLK=y
+ CONFIG_CRYPTO_ANSI_CPRNG=y
+ CONFIG_ARM64_CRYPTO=y
+ CONFIG_CRYPTO_SHA1_ARM64_CE=y
+ CONFIG_CRYPTO_SHA2_ARM64_CE=y
+ CONFIG_CRYPTO_GHASH_ARM64_CE=y
+ CONFIG_CRYPTO_AES_ARM64_CE=y
+ CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
+ CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
+ CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y
+53
arch/arm64/crypto/Kconfig
···
+
+ menuconfig ARM64_CRYPTO
+ 	bool "ARM64 Accelerated Cryptographic Algorithms"
+ 	depends on ARM64
+ 	help
+ 	  Say Y here to choose from a selection of cryptographic algorithms
+ 	  implemented using ARM64 specific CPU features or instructions.
+
+ if ARM64_CRYPTO
+
+ config CRYPTO_SHA1_ARM64_CE
+ 	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_HASH
+
+ config CRYPTO_SHA2_ARM64_CE
+ 	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_HASH
+
+ config CRYPTO_GHASH_ARM64_CE
+ 	tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_HASH
+
+ config CRYPTO_AES_ARM64_CE
+ 	tristate "AES core cipher using ARMv8 Crypto Extensions"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_ALGAPI
+ 	select CRYPTO_AES
+
+ config CRYPTO_AES_ARM64_CE_CCM
+ 	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_ALGAPI
+ 	select CRYPTO_AES
+ 	select CRYPTO_AEAD
+
+ config CRYPTO_AES_ARM64_CE_BLK
+ 	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_BLKCIPHER
+ 	select CRYPTO_AES
+ 	select CRYPTO_ABLK_HELPER
+
+ config CRYPTO_AES_ARM64_NEON_BLK
+ 	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+ 	depends on ARM64 && KERNEL_MODE_NEON
+ 	select CRYPTO_BLKCIPHER
+ 	select CRYPTO_AES
+ 	select CRYPTO_ABLK_HELPER
+
+ endif
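
These options only make sense on CPUs that actually implement the corresponding instructions, so the glue code in this series additionally gates registration at load time. A minimal sketch of that runtime check, reusing the elf_hwcap/HWCAP_AES test and the module_cpu_feature_match() helper that appear in the glue files below (the init function name here is made up):

#include <linux/module.h>
#include <linux/cpufeature.h>
#include <asm/hwcap.h>

static int __init example_ce_init(void)
{
	/* explicit capability check, as in aes-ce-ccm-glue.c below */
	if (!(elf_hwcap & HWCAP_AES))
		return -ENODEV;
	/* ... crypto_register_alg() calls would go here ... */
	return 0;
}

/*
 * Alternatively, as aes-ce-cipher.c does, let the cpufeature machinery
 * run the init only when the AES capability is present (and allow the
 * module to be auto-loaded on matching CPUs).
 */
module_cpu_feature_match(AES, example_ce_init);

MODULE_LICENSE("GPL v2");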
+38
arch/arm64/crypto/Makefile
···
+ #
+ # linux/arch/arm64/crypto/Makefile
+ #
+ # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License version 2 as
+ # published by the Free Software Foundation.
+ #
+
+ obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
+ sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
+
+ obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
+ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
+
+ obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
+ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+
+ obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
+ CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+
+ obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
+ aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
+
+ obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
+ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
+
+ obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
+ aes-neon-blk-y := aes-glue-neon.o aes-neon.o
+
+ AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE
+ AFLAGS_aes-neon.o := -DINTERLEAVE=4
+
+ CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
+
+ $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
+ 	$(call if_changed_dep,cc_o_c)
+222
arch/arm64/crypto/aes-ce-ccm-core.S
··· 1 + /* 2 + * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + 13 + .text 14 + .arch armv8-a+crypto 15 + 16 + /* 17 + * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, 18 + * u32 *macp, u8 const rk[], u32 rounds); 19 + */ 20 + ENTRY(ce_aes_ccm_auth_data) 21 + ldr w8, [x3] /* leftover from prev round? */ 22 + ld1 {v0.2d}, [x0] /* load mac */ 23 + cbz w8, 1f 24 + sub w8, w8, #16 25 + eor v1.16b, v1.16b, v1.16b 26 + 0: ldrb w7, [x1], #1 /* get 1 byte of input */ 27 + subs w2, w2, #1 28 + add w8, w8, #1 29 + ins v1.b[0], w7 30 + ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ 31 + beq 8f /* out of input? */ 32 + cbnz w8, 0b 33 + eor v0.16b, v0.16b, v1.16b 34 + 1: ld1 {v3.2d}, [x4] /* load first round key */ 35 + prfm pldl1strm, [x1] 36 + cmp w5, #12 /* which key size? */ 37 + add x6, x4, #16 38 + sub w7, w5, #2 /* modified # of rounds */ 39 + bmi 2f 40 + bne 5f 41 + mov v5.16b, v3.16b 42 + b 4f 43 + 2: mov v4.16b, v3.16b 44 + ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ 45 + 3: aese v0.16b, v4.16b 46 + aesmc v0.16b, v0.16b 47 + 4: ld1 {v3.2d}, [x6], #16 /* load next round key */ 48 + aese v0.16b, v5.16b 49 + aesmc v0.16b, v0.16b 50 + 5: ld1 {v4.2d}, [x6], #16 /* load next round key */ 51 + subs w7, w7, #3 52 + aese v0.16b, v3.16b 53 + aesmc v0.16b, v0.16b 54 + ld1 {v5.2d}, [x6], #16 /* load next round key */ 55 + bpl 3b 56 + aese v0.16b, v4.16b 57 + subs w2, w2, #16 /* last data? */ 58 + eor v0.16b, v0.16b, v5.16b /* final round */ 59 + bmi 6f 60 + ld1 {v1.16b}, [x1], #16 /* load next input block */ 61 + eor v0.16b, v0.16b, v1.16b /* xor with mac */ 62 + bne 1b 63 + 6: st1 {v0.2d}, [x0] /* store mac */ 64 + beq 10f 65 + adds w2, w2, #16 66 + beq 10f 67 + mov w8, w2 68 + 7: ldrb w7, [x1], #1 69 + umov w6, v0.b[0] 70 + eor w6, w6, w7 71 + strb w6, [x0], #1 72 + subs w2, w2, #1 73 + beq 10f 74 + ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ 75 + b 7b 76 + 8: mov w7, w8 77 + add w8, w8, #16 78 + 9: ext v1.16b, v1.16b, v1.16b, #1 79 + adds w7, w7, #1 80 + bne 9b 81 + eor v0.16b, v0.16b, v1.16b 82 + st1 {v0.2d}, [x0] 83 + 10: str w8, [x3] 84 + ret 85 + ENDPROC(ce_aes_ccm_auth_data) 86 + 87 + /* 88 + * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], 89 + * u32 rounds); 90 + */ 91 + ENTRY(ce_aes_ccm_final) 92 + ld1 {v3.2d}, [x2], #16 /* load first round key */ 93 + ld1 {v0.2d}, [x0] /* load mac */ 94 + cmp w3, #12 /* which key size? 
*/ 95 + sub w3, w3, #2 /* modified # of rounds */ 96 + ld1 {v1.2d}, [x1] /* load 1st ctriv */ 97 + bmi 0f 98 + bne 3f 99 + mov v5.16b, v3.16b 100 + b 2f 101 + 0: mov v4.16b, v3.16b 102 + 1: ld1 {v5.2d}, [x2], #16 /* load next round key */ 103 + aese v0.16b, v4.16b 104 + aese v1.16b, v4.16b 105 + aesmc v0.16b, v0.16b 106 + aesmc v1.16b, v1.16b 107 + 2: ld1 {v3.2d}, [x2], #16 /* load next round key */ 108 + aese v0.16b, v5.16b 109 + aese v1.16b, v5.16b 110 + aesmc v0.16b, v0.16b 111 + aesmc v1.16b, v1.16b 112 + 3: ld1 {v4.2d}, [x2], #16 /* load next round key */ 113 + subs w3, w3, #3 114 + aese v0.16b, v3.16b 115 + aese v1.16b, v3.16b 116 + aesmc v0.16b, v0.16b 117 + aesmc v1.16b, v1.16b 118 + bpl 1b 119 + aese v0.16b, v4.16b 120 + aese v1.16b, v4.16b 121 + /* final round key cancels out */ 122 + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ 123 + st1 {v0.2d}, [x0] /* store result */ 124 + ret 125 + ENDPROC(ce_aes_ccm_final) 126 + 127 + .macro aes_ccm_do_crypt,enc 128 + ldr x8, [x6, #8] /* load lower ctr */ 129 + ld1 {v0.2d}, [x5] /* load mac */ 130 + rev x8, x8 /* keep swabbed ctr in reg */ 131 + 0: /* outer loop */ 132 + ld1 {v1.1d}, [x6] /* load upper ctr */ 133 + prfm pldl1strm, [x1] 134 + add x8, x8, #1 135 + rev x9, x8 136 + cmp w4, #12 /* which key size? */ 137 + sub w7, w4, #2 /* get modified # of rounds */ 138 + ins v1.d[1], x9 /* no carry in lower ctr */ 139 + ld1 {v3.2d}, [x3] /* load first round key */ 140 + add x10, x3, #16 141 + bmi 1f 142 + bne 4f 143 + mov v5.16b, v3.16b 144 + b 3f 145 + 1: mov v4.16b, v3.16b 146 + ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ 147 + 2: /* inner loop: 3 rounds, 2x interleaved */ 148 + aese v0.16b, v4.16b 149 + aese v1.16b, v4.16b 150 + aesmc v0.16b, v0.16b 151 + aesmc v1.16b, v1.16b 152 + 3: ld1 {v3.2d}, [x10], #16 /* load next round key */ 153 + aese v0.16b, v5.16b 154 + aese v1.16b, v5.16b 155 + aesmc v0.16b, v0.16b 156 + aesmc v1.16b, v1.16b 157 + 4: ld1 {v4.2d}, [x10], #16 /* load next round key */ 158 + subs w7, w7, #3 159 + aese v0.16b, v3.16b 160 + aese v1.16b, v3.16b 161 + aesmc v0.16b, v0.16b 162 + aesmc v1.16b, v1.16b 163 + ld1 {v5.2d}, [x10], #16 /* load next round key */ 164 + bpl 2b 165 + aese v0.16b, v4.16b 166 + aese v1.16b, v4.16b 167 + subs w2, w2, #16 168 + bmi 6f /* partial block? 
*/ 169 + ld1 {v2.16b}, [x1], #16 /* load next input block */ 170 + .if \enc == 1 171 + eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ 172 + eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ 173 + .else 174 + eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ 175 + eor v1.16b, v2.16b, v5.16b /* final round enc */ 176 + .endif 177 + eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ 178 + st1 {v1.16b}, [x0], #16 /* write output block */ 179 + bne 0b 180 + rev x8, x8 181 + st1 {v0.2d}, [x5] /* store mac */ 182 + str x8, [x6, #8] /* store lsb end of ctr (BE) */ 183 + 5: ret 184 + 185 + 6: eor v0.16b, v0.16b, v5.16b /* final round mac */ 186 + eor v1.16b, v1.16b, v5.16b /* final round enc */ 187 + st1 {v0.2d}, [x5] /* store mac */ 188 + add w2, w2, #16 /* process partial tail block */ 189 + 7: ldrb w9, [x1], #1 /* get 1 byte of input */ 190 + umov w6, v1.b[0] /* get top crypted ctr byte */ 191 + umov w7, v0.b[0] /* get top mac byte */ 192 + .if \enc == 1 193 + eor w7, w7, w9 194 + eor w9, w9, w6 195 + .else 196 + eor w9, w9, w6 197 + eor w7, w7, w9 198 + .endif 199 + strb w9, [x0], #1 /* store out byte */ 200 + strb w7, [x5], #1 /* store mac byte */ 201 + subs w2, w2, #1 202 + beq 5b 203 + ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ 204 + ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ 205 + b 7b 206 + .endm 207 + 208 + /* 209 + * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, 210 + * u8 const rk[], u32 rounds, u8 mac[], 211 + * u8 ctr[]); 212 + * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, 213 + * u8 const rk[], u32 rounds, u8 mac[], 214 + * u8 ctr[]); 215 + */ 216 + ENTRY(ce_aes_ccm_encrypt) 217 + aes_ccm_do_crypt 1 218 + ENDPROC(ce_aes_ccm_encrypt) 219 + 220 + ENTRY(ce_aes_ccm_decrypt) 221 + aes_ccm_do_crypt 0 222 + ENDPROC(ce_aes_ccm_decrypt)
+297
arch/arm64/crypto/aes-ce-ccm-glue.c
··· 1 + /* 2 + * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/aes.h> 14 + #include <crypto/algapi.h> 15 + #include <crypto/scatterwalk.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + static int num_rounds(struct crypto_aes_ctx *ctx) 20 + { 21 + /* 22 + * # of rounds specified by AES: 23 + * 128 bit key 10 rounds 24 + * 192 bit key 12 rounds 25 + * 256 bit key 14 rounds 26 + * => n byte key => 6 + (n/4) rounds 27 + */ 28 + return 6 + ctx->key_length / 4; 29 + } 30 + 31 + asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, 32 + u32 *macp, u32 const rk[], u32 rounds); 33 + 34 + asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, 35 + u32 const rk[], u32 rounds, u8 mac[], 36 + u8 ctr[]); 37 + 38 + asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, 39 + u32 const rk[], u32 rounds, u8 mac[], 40 + u8 ctr[]); 41 + 42 + asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], 43 + u32 rounds); 44 + 45 + static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, 46 + unsigned int key_len) 47 + { 48 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); 49 + int ret; 50 + 51 + ret = crypto_aes_expand_key(ctx, in_key, key_len); 52 + if (!ret) 53 + return 0; 54 + 55 + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 56 + return -EINVAL; 57 + } 58 + 59 + static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) 60 + { 61 + if ((authsize & 1) || authsize < 4) 62 + return -EINVAL; 63 + return 0; 64 + } 65 + 66 + static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) 67 + { 68 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 69 + __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; 70 + u32 l = req->iv[0] + 1; 71 + 72 + /* verify that CCM dimension 'L' is set correctly in the IV */ 73 + if (l < 2 || l > 8) 74 + return -EINVAL; 75 + 76 + /* verify that msglen can in fact be represented in L bytes */ 77 + if (l < 4 && msglen >> (8 * l)) 78 + return -EOVERFLOW; 79 + 80 + /* 81 + * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi 82 + * uses a u32 type to represent msglen so the top 4 bytes are always 0. 
83 + */ 84 + n[0] = 0; 85 + n[1] = cpu_to_be32(msglen); 86 + 87 + memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); 88 + 89 + /* 90 + * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) 91 + * - bits 0..2 : max # of bytes required to represent msglen, minus 1 92 + * (already set by caller) 93 + * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) 94 + * - bit 6 : indicates presence of authenticate-only data 95 + */ 96 + maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; 97 + if (req->assoclen) 98 + maciv[0] |= 0x40; 99 + 100 + memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); 101 + return 0; 102 + } 103 + 104 + static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) 105 + { 106 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 107 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); 108 + struct __packed { __be16 l; __be32 h; u16 len; } ltag; 109 + struct scatter_walk walk; 110 + u32 len = req->assoclen; 111 + u32 macp = 0; 112 + 113 + /* prepend the AAD with a length tag */ 114 + if (len < 0xff00) { 115 + ltag.l = cpu_to_be16(len); 116 + ltag.len = 2; 117 + } else { 118 + ltag.l = cpu_to_be16(0xfffe); 119 + put_unaligned_be32(len, &ltag.h); 120 + ltag.len = 6; 121 + } 122 + 123 + ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc, 124 + num_rounds(ctx)); 125 + scatterwalk_start(&walk, req->assoc); 126 + 127 + do { 128 + u32 n = scatterwalk_clamp(&walk, len); 129 + u8 *p; 130 + 131 + if (!n) { 132 + scatterwalk_start(&walk, sg_next(walk.sg)); 133 + n = scatterwalk_clamp(&walk, len); 134 + } 135 + p = scatterwalk_map(&walk); 136 + ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, 137 + num_rounds(ctx)); 138 + len -= n; 139 + 140 + scatterwalk_unmap(p); 141 + scatterwalk_advance(&walk, n); 142 + scatterwalk_done(&walk, 0, len); 143 + } while (len); 144 + } 145 + 146 + static int ccm_encrypt(struct aead_request *req) 147 + { 148 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 149 + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); 150 + struct blkcipher_desc desc = { .info = req->iv }; 151 + struct blkcipher_walk walk; 152 + u8 __aligned(8) mac[AES_BLOCK_SIZE]; 153 + u8 buf[AES_BLOCK_SIZE]; 154 + u32 len = req->cryptlen; 155 + int err; 156 + 157 + err = ccm_init_mac(req, mac, len); 158 + if (err) 159 + return err; 160 + 161 + kernel_neon_begin_partial(6); 162 + 163 + if (req->assoclen) 164 + ccm_calculate_auth_mac(req, mac); 165 + 166 + /* preserve the original iv for the final round */ 167 + memcpy(buf, req->iv, AES_BLOCK_SIZE); 168 + 169 + blkcipher_walk_init(&walk, req->dst, req->src, len); 170 + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, 171 + AES_BLOCK_SIZE); 172 + 173 + while (walk.nbytes) { 174 + u32 tail = walk.nbytes % AES_BLOCK_SIZE; 175 + 176 + if (walk.nbytes == len) 177 + tail = 0; 178 + 179 + ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 180 + walk.nbytes - tail, ctx->key_enc, 181 + num_rounds(ctx), mac, walk.iv); 182 + 183 + len -= walk.nbytes - tail; 184 + err = blkcipher_walk_done(&desc, &walk, tail); 185 + } 186 + if (!err) 187 + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); 188 + 189 + kernel_neon_end(); 190 + 191 + if (err) 192 + return err; 193 + 194 + /* copy authtag to end of dst */ 195 + scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, 196 + crypto_aead_authsize(aead), 1); 197 + 198 + return 0; 199 + } 200 + 201 + static int ccm_decrypt(struct aead_request *req) 202 + { 203 + struct crypto_aead *aead = crypto_aead_reqtfm(req); 204 + struct crypto_aes_ctx *ctx 
= crypto_aead_ctx(aead); 205 + unsigned int authsize = crypto_aead_authsize(aead); 206 + struct blkcipher_desc desc = { .info = req->iv }; 207 + struct blkcipher_walk walk; 208 + u8 __aligned(8) mac[AES_BLOCK_SIZE]; 209 + u8 buf[AES_BLOCK_SIZE]; 210 + u32 len = req->cryptlen - authsize; 211 + int err; 212 + 213 + err = ccm_init_mac(req, mac, len); 214 + if (err) 215 + return err; 216 + 217 + kernel_neon_begin_partial(6); 218 + 219 + if (req->assoclen) 220 + ccm_calculate_auth_mac(req, mac); 221 + 222 + /* preserve the original iv for the final round */ 223 + memcpy(buf, req->iv, AES_BLOCK_SIZE); 224 + 225 + blkcipher_walk_init(&walk, req->dst, req->src, len); 226 + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, 227 + AES_BLOCK_SIZE); 228 + 229 + while (walk.nbytes) { 230 + u32 tail = walk.nbytes % AES_BLOCK_SIZE; 231 + 232 + if (walk.nbytes == len) 233 + tail = 0; 234 + 235 + ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 236 + walk.nbytes - tail, ctx->key_enc, 237 + num_rounds(ctx), mac, walk.iv); 238 + 239 + len -= walk.nbytes - tail; 240 + err = blkcipher_walk_done(&desc, &walk, tail); 241 + } 242 + if (!err) 243 + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); 244 + 245 + kernel_neon_end(); 246 + 247 + if (err) 248 + return err; 249 + 250 + /* compare calculated auth tag with the stored one */ 251 + scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, 252 + authsize, 0); 253 + 254 + if (memcmp(mac, buf, authsize)) 255 + return -EBADMSG; 256 + return 0; 257 + } 258 + 259 + static struct crypto_alg ccm_aes_alg = { 260 + .cra_name = "ccm(aes)", 261 + .cra_driver_name = "ccm-aes-ce", 262 + .cra_priority = 300, 263 + .cra_flags = CRYPTO_ALG_TYPE_AEAD, 264 + .cra_blocksize = 1, 265 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 266 + .cra_alignmask = 7, 267 + .cra_type = &crypto_aead_type, 268 + .cra_module = THIS_MODULE, 269 + .cra_aead = { 270 + .ivsize = AES_BLOCK_SIZE, 271 + .maxauthsize = AES_BLOCK_SIZE, 272 + .setkey = ccm_setkey, 273 + .setauthsize = ccm_setauthsize, 274 + .encrypt = ccm_encrypt, 275 + .decrypt = ccm_decrypt, 276 + } 277 + }; 278 + 279 + static int __init aes_mod_init(void) 280 + { 281 + if (!(elf_hwcap & HWCAP_AES)) 282 + return -ENODEV; 283 + return crypto_register_alg(&ccm_aes_alg); 284 + } 285 + 286 + static void __exit aes_mod_exit(void) 287 + { 288 + crypto_unregister_alg(&ccm_aes_alg); 289 + } 290 + 291 + module_init(aes_mod_init); 292 + module_exit(aes_mod_exit); 293 + 294 + MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); 295 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 296 + MODULE_LICENSE("GPL v2"); 297 + MODULE_ALIAS("ccm(aes)");
+155
arch/arm64/crypto/aes-ce-cipher.c
··· 1 + /* 2 + * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <crypto/aes.h> 13 + #include <linux/cpufeature.h> 14 + #include <linux/crypto.h> 15 + #include <linux/module.h> 16 + 17 + MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); 18 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 19 + MODULE_LICENSE("GPL v2"); 20 + 21 + struct aes_block { 22 + u8 b[AES_BLOCK_SIZE]; 23 + }; 24 + 25 + static int num_rounds(struct crypto_aes_ctx *ctx) 26 + { 27 + /* 28 + * # of rounds specified by AES: 29 + * 128 bit key 10 rounds 30 + * 192 bit key 12 rounds 31 + * 256 bit key 14 rounds 32 + * => n byte key => 6 + (n/4) rounds 33 + */ 34 + return 6 + ctx->key_length / 4; 35 + } 36 + 37 + static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) 38 + { 39 + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); 40 + struct aes_block *out = (struct aes_block *)dst; 41 + struct aes_block const *in = (struct aes_block *)src; 42 + void *dummy0; 43 + int dummy1; 44 + 45 + kernel_neon_begin_partial(4); 46 + 47 + __asm__(" ld1 {v0.16b}, %[in] ;" 48 + " ld1 {v1.2d}, [%[key]], #16 ;" 49 + " cmp %w[rounds], #10 ;" 50 + " bmi 0f ;" 51 + " bne 3f ;" 52 + " mov v3.16b, v1.16b ;" 53 + " b 2f ;" 54 + "0: mov v2.16b, v1.16b ;" 55 + " ld1 {v3.2d}, [%[key]], #16 ;" 56 + "1: aese v0.16b, v2.16b ;" 57 + " aesmc v0.16b, v0.16b ;" 58 + "2: ld1 {v1.2d}, [%[key]], #16 ;" 59 + " aese v0.16b, v3.16b ;" 60 + " aesmc v0.16b, v0.16b ;" 61 + "3: ld1 {v2.2d}, [%[key]], #16 ;" 62 + " subs %w[rounds], %w[rounds], #3 ;" 63 + " aese v0.16b, v1.16b ;" 64 + " aesmc v0.16b, v0.16b ;" 65 + " ld1 {v3.2d}, [%[key]], #16 ;" 66 + " bpl 1b ;" 67 + " aese v0.16b, v2.16b ;" 68 + " eor v0.16b, v0.16b, v3.16b ;" 69 + " st1 {v0.16b}, %[out] ;" 70 + 71 + : [out] "=Q"(*out), 72 + [key] "=r"(dummy0), 73 + [rounds] "=r"(dummy1) 74 + : [in] "Q"(*in), 75 + "1"(ctx->key_enc), 76 + "2"(num_rounds(ctx) - 2) 77 + : "cc"); 78 + 79 + kernel_neon_end(); 80 + } 81 + 82 + static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) 83 + { 84 + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); 85 + struct aes_block *out = (struct aes_block *)dst; 86 + struct aes_block const *in = (struct aes_block *)src; 87 + void *dummy0; 88 + int dummy1; 89 + 90 + kernel_neon_begin_partial(4); 91 + 92 + __asm__(" ld1 {v0.16b}, %[in] ;" 93 + " ld1 {v1.2d}, [%[key]], #16 ;" 94 + " cmp %w[rounds], #10 ;" 95 + " bmi 0f ;" 96 + " bne 3f ;" 97 + " mov v3.16b, v1.16b ;" 98 + " b 2f ;" 99 + "0: mov v2.16b, v1.16b ;" 100 + " ld1 {v3.2d}, [%[key]], #16 ;" 101 + "1: aesd v0.16b, v2.16b ;" 102 + " aesimc v0.16b, v0.16b ;" 103 + "2: ld1 {v1.2d}, [%[key]], #16 ;" 104 + " aesd v0.16b, v3.16b ;" 105 + " aesimc v0.16b, v0.16b ;" 106 + "3: ld1 {v2.2d}, [%[key]], #16 ;" 107 + " subs %w[rounds], %w[rounds], #3 ;" 108 + " aesd v0.16b, v1.16b ;" 109 + " aesimc v0.16b, v0.16b ;" 110 + " ld1 {v3.2d}, [%[key]], #16 ;" 111 + " bpl 1b ;" 112 + " aesd v0.16b, v2.16b ;" 113 + " eor v0.16b, v0.16b, v3.16b ;" 114 + " st1 {v0.16b}, %[out] ;" 115 + 116 + : [out] "=Q"(*out), 117 + [key] "=r"(dummy0), 118 + [rounds] "=r"(dummy1) 119 + : [in] "Q"(*in), 120 + "1"(ctx->key_dec), 121 + 
"2"(num_rounds(ctx) - 2) 122 + : "cc"); 123 + 124 + kernel_neon_end(); 125 + } 126 + 127 + static struct crypto_alg aes_alg = { 128 + .cra_name = "aes", 129 + .cra_driver_name = "aes-ce", 130 + .cra_priority = 300, 131 + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 132 + .cra_blocksize = AES_BLOCK_SIZE, 133 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 134 + .cra_module = THIS_MODULE, 135 + .cra_cipher = { 136 + .cia_min_keysize = AES_MIN_KEY_SIZE, 137 + .cia_max_keysize = AES_MAX_KEY_SIZE, 138 + .cia_setkey = crypto_aes_set_key, 139 + .cia_encrypt = aes_cipher_encrypt, 140 + .cia_decrypt = aes_cipher_decrypt 141 + } 142 + }; 143 + 144 + static int __init aes_mod_init(void) 145 + { 146 + return crypto_register_alg(&aes_alg); 147 + } 148 + 149 + static void __exit aes_mod_exit(void) 150 + { 151 + crypto_unregister_alg(&aes_alg); 152 + } 153 + 154 + module_cpu_feature_match(AES, aes_mod_init); 155 + module_exit(aes_mod_exit);
+133
arch/arm64/crypto/aes-ce.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with 3 + * Crypto Extensions 4 + * 5 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + 12 + #include <linux/linkage.h> 13 + 14 + #define AES_ENTRY(func) ENTRY(ce_ ## func) 15 + #define AES_ENDPROC(func) ENDPROC(ce_ ## func) 16 + 17 + .arch armv8-a+crypto 18 + 19 + /* preload all round keys */ 20 + .macro load_round_keys, rounds, rk 21 + cmp \rounds, #12 22 + blo 2222f /* 128 bits */ 23 + beq 1111f /* 192 bits */ 24 + ld1 {v17.16b-v18.16b}, [\rk], #32 25 + 1111: ld1 {v19.16b-v20.16b}, [\rk], #32 26 + 2222: ld1 {v21.16b-v24.16b}, [\rk], #64 27 + ld1 {v25.16b-v28.16b}, [\rk], #64 28 + ld1 {v29.16b-v31.16b}, [\rk] 29 + .endm 30 + 31 + /* prepare for encryption with key in rk[] */ 32 + .macro enc_prepare, rounds, rk, ignore 33 + load_round_keys \rounds, \rk 34 + .endm 35 + 36 + /* prepare for encryption (again) but with new key in rk[] */ 37 + .macro enc_switch_key, rounds, rk, ignore 38 + load_round_keys \rounds, \rk 39 + .endm 40 + 41 + /* prepare for decryption with key in rk[] */ 42 + .macro dec_prepare, rounds, rk, ignore 43 + load_round_keys \rounds, \rk 44 + .endm 45 + 46 + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 47 + aes\de \i0\().16b, \k\().16b 48 + .ifnb \i1 49 + aes\de \i1\().16b, \k\().16b 50 + .ifnb \i3 51 + aes\de \i2\().16b, \k\().16b 52 + aes\de \i3\().16b, \k\().16b 53 + .endif 54 + .endif 55 + aes\mc \i0\().16b, \i0\().16b 56 + .ifnb \i1 57 + aes\mc \i1\().16b, \i1\().16b 58 + .ifnb \i3 59 + aes\mc \i2\().16b, \i2\().16b 60 + aes\mc \i3\().16b, \i3\().16b 61 + .endif 62 + .endif 63 + .endm 64 + 65 + /* up to 4 interleaved encryption rounds with the same round key */ 66 + .macro round_Nx, enc, k, i0, i1, i2, i3 67 + .ifc \enc, e 68 + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 69 + .else 70 + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 71 + .endif 72 + .endm 73 + 74 + /* up to 4 interleaved final rounds */ 75 + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 76 + aes\de \i0\().16b, \k\().16b 77 + .ifnb \i1 78 + aes\de \i1\().16b, \k\().16b 79 + .ifnb \i3 80 + aes\de \i2\().16b, \k\().16b 81 + aes\de \i3\().16b, \k\().16b 82 + .endif 83 + .endif 84 + eor \i0\().16b, \i0\().16b, \k2\().16b 85 + .ifnb \i1 86 + eor \i1\().16b, \i1\().16b, \k2\().16b 87 + .ifnb \i3 88 + eor \i2\().16b, \i2\().16b, \k2\().16b 89 + eor \i3\().16b, \i3\().16b, \k2\().16b 90 + .endif 91 + .endif 92 + .endm 93 + 94 + /* up to 4 interleaved blocks */ 95 + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 96 + cmp \rounds, #12 97 + blo 2222f /* 128 bits */ 98 + beq 1111f /* 192 bits */ 99 + round_Nx \enc, v17, \i0, \i1, \i2, \i3 100 + round_Nx \enc, v18, \i0, \i1, \i2, \i3 101 + 1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 102 + round_Nx \enc, v20, \i0, \i1, \i2, \i3 103 + 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 104 + round_Nx \enc, \key, \i0, \i1, \i2, \i3 105 + .endr 106 + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 107 + .endm 108 + 109 + .macro encrypt_block, in, rounds, t0, t1, t2 110 + do_block_Nx e, \rounds, \in 111 + .endm 112 + 113 + .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 114 + do_block_Nx e, \rounds, \i0, \i1 115 + .endm 116 + 117 + .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 118 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 119 + .endm 120 + 121 + .macro 
decrypt_block, in, rounds, t0, t1, t2 122 + do_block_Nx d, \rounds, \in 123 + .endm 124 + 125 + .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 126 + do_block_Nx d, \rounds, \i0, \i1 127 + .endm 128 + 129 + .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 130 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 131 + .endm 132 + 133 + #include "aes-modes.S"
+446
arch/arm64/crypto/aes-glue.c
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/hwcap.h> 13 + #include <crypto/aes.h> 14 + #include <crypto/ablk_helper.h> 15 + #include <crypto/algapi.h> 16 + #include <linux/module.h> 17 + #include <linux/cpufeature.h> 18 + 19 + #ifdef USE_V8_CRYPTO_EXTENSIONS 20 + #define MODE "ce" 21 + #define PRIO 300 22 + #define aes_ecb_encrypt ce_aes_ecb_encrypt 23 + #define aes_ecb_decrypt ce_aes_ecb_decrypt 24 + #define aes_cbc_encrypt ce_aes_cbc_encrypt 25 + #define aes_cbc_decrypt ce_aes_cbc_decrypt 26 + #define aes_ctr_encrypt ce_aes_ctr_encrypt 27 + #define aes_xts_encrypt ce_aes_xts_encrypt 28 + #define aes_xts_decrypt ce_aes_xts_decrypt 29 + MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); 30 + #else 31 + #define MODE "neon" 32 + #define PRIO 200 33 + #define aes_ecb_encrypt neon_aes_ecb_encrypt 34 + #define aes_ecb_decrypt neon_aes_ecb_decrypt 35 + #define aes_cbc_encrypt neon_aes_cbc_encrypt 36 + #define aes_cbc_decrypt neon_aes_cbc_decrypt 37 + #define aes_ctr_encrypt neon_aes_ctr_encrypt 38 + #define aes_xts_encrypt neon_aes_xts_encrypt 39 + #define aes_xts_decrypt neon_aes_xts_decrypt 40 + MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); 41 + MODULE_ALIAS("ecb(aes)"); 42 + MODULE_ALIAS("cbc(aes)"); 43 + MODULE_ALIAS("ctr(aes)"); 44 + MODULE_ALIAS("xts(aes)"); 45 + #endif 46 + 47 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 48 + MODULE_LICENSE("GPL v2"); 49 + 50 + /* defined in aes-modes.S */ 51 + asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], 52 + int rounds, int blocks, int first); 53 + asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], 54 + int rounds, int blocks, int first); 55 + 56 + asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], 57 + int rounds, int blocks, u8 iv[], int first); 58 + asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], 59 + int rounds, int blocks, u8 iv[], int first); 60 + 61 + asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], 62 + int rounds, int blocks, u8 ctr[], int first); 63 + 64 + asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], 65 + int rounds, int blocks, u8 const rk2[], u8 iv[], 66 + int first); 67 + asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], 68 + int rounds, int blocks, u8 const rk2[], u8 iv[], 69 + int first); 70 + 71 + struct crypto_aes_xts_ctx { 72 + struct crypto_aes_ctx key1; 73 + struct crypto_aes_ctx __aligned(8) key2; 74 + }; 75 + 76 + static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, 77 + unsigned int key_len) 78 + { 79 + struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); 80 + int ret; 81 + 82 + ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); 83 + if (!ret) 84 + ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], 85 + key_len / 2); 86 + if (!ret) 87 + return 0; 88 + 89 + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 90 + return -EINVAL; 91 + } 92 + 93 + static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 94 + struct scatterlist *src, unsigned int nbytes) 95 + { 96 + struct crypto_aes_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); 97 + int err, first, rounds = 6 + ctx->key_length / 4; 98 + struct blkcipher_walk walk; 99 + unsigned int blocks; 100 + 101 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 102 + blkcipher_walk_init(&walk, dst, src, nbytes); 103 + err = blkcipher_walk_virt(desc, &walk); 104 + 105 + kernel_neon_begin(); 106 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 107 + aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 108 + (u8 *)ctx->key_enc, rounds, blocks, first); 109 + err = blkcipher_walk_done(desc, &walk, 0); 110 + } 111 + kernel_neon_end(); 112 + return err; 113 + } 114 + 115 + static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 116 + struct scatterlist *src, unsigned int nbytes) 117 + { 118 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 119 + int err, first, rounds = 6 + ctx->key_length / 4; 120 + struct blkcipher_walk walk; 121 + unsigned int blocks; 122 + 123 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 124 + blkcipher_walk_init(&walk, dst, src, nbytes); 125 + err = blkcipher_walk_virt(desc, &walk); 126 + 127 + kernel_neon_begin(); 128 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 129 + aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 130 + (u8 *)ctx->key_dec, rounds, blocks, first); 131 + err = blkcipher_walk_done(desc, &walk, 0); 132 + } 133 + kernel_neon_end(); 134 + return err; 135 + } 136 + 137 + static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 138 + struct scatterlist *src, unsigned int nbytes) 139 + { 140 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 141 + int err, first, rounds = 6 + ctx->key_length / 4; 142 + struct blkcipher_walk walk; 143 + unsigned int blocks; 144 + 145 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 146 + blkcipher_walk_init(&walk, dst, src, nbytes); 147 + err = blkcipher_walk_virt(desc, &walk); 148 + 149 + kernel_neon_begin(); 150 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 151 + aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 152 + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, 153 + first); 154 + err = blkcipher_walk_done(desc, &walk, 0); 155 + } 156 + kernel_neon_end(); 157 + return err; 158 + } 159 + 160 + static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 161 + struct scatterlist *src, unsigned int nbytes) 162 + { 163 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 164 + int err, first, rounds = 6 + ctx->key_length / 4; 165 + struct blkcipher_walk walk; 166 + unsigned int blocks; 167 + 168 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 169 + blkcipher_walk_init(&walk, dst, src, nbytes); 170 + err = blkcipher_walk_virt(desc, &walk); 171 + 172 + kernel_neon_begin(); 173 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 174 + aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 175 + (u8 *)ctx->key_dec, rounds, blocks, walk.iv, 176 + first); 177 + err = blkcipher_walk_done(desc, &walk, 0); 178 + } 179 + kernel_neon_end(); 180 + return err; 181 + } 182 + 183 + static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 184 + struct scatterlist *src, unsigned int nbytes) 185 + { 186 + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 187 + int err, first, rounds = 6 + ctx->key_length / 4; 188 + struct blkcipher_walk walk; 189 + int blocks; 190 + 191 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 192 + blkcipher_walk_init(&walk, dst, src, nbytes); 193 + err = 
blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); 194 + 195 + first = 1; 196 + kernel_neon_begin(); 197 + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { 198 + aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 199 + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, 200 + first); 201 + first = 0; 202 + nbytes -= blocks * AES_BLOCK_SIZE; 203 + if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) 204 + break; 205 + err = blkcipher_walk_done(desc, &walk, 206 + walk.nbytes % AES_BLOCK_SIZE); 207 + } 208 + if (nbytes) { 209 + u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; 210 + u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; 211 + u8 __aligned(8) tail[AES_BLOCK_SIZE]; 212 + 213 + /* 214 + * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need 215 + * to tell aes_ctr_encrypt() to only read half a block. 216 + */ 217 + blocks = (nbytes <= 8) ? -1 : 1; 218 + 219 + aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, 220 + blocks, walk.iv, first); 221 + memcpy(tdst, tail, nbytes); 222 + err = blkcipher_walk_done(desc, &walk, 0); 223 + } 224 + kernel_neon_end(); 225 + 226 + return err; 227 + } 228 + 229 + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 230 + struct scatterlist *src, unsigned int nbytes) 231 + { 232 + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 233 + int err, first, rounds = 6 + ctx->key1.key_length / 4; 234 + struct blkcipher_walk walk; 235 + unsigned int blocks; 236 + 237 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 238 + blkcipher_walk_init(&walk, dst, src, nbytes); 239 + err = blkcipher_walk_virt(desc, &walk); 240 + 241 + kernel_neon_begin(); 242 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 243 + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, 244 + (u8 *)ctx->key1.key_enc, rounds, blocks, 245 + (u8 *)ctx->key2.key_enc, walk.iv, first); 246 + err = blkcipher_walk_done(desc, &walk, 0); 247 + } 248 + kernel_neon_end(); 249 + 250 + return err; 251 + } 252 + 253 + static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 254 + struct scatterlist *src, unsigned int nbytes) 255 + { 256 + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 257 + int err, first, rounds = 6 + ctx->key1.key_length / 4; 258 + struct blkcipher_walk walk; 259 + unsigned int blocks; 260 + 261 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 262 + blkcipher_walk_init(&walk, dst, src, nbytes); 263 + err = blkcipher_walk_virt(desc, &walk); 264 + 265 + kernel_neon_begin(); 266 + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { 267 + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, 268 + (u8 *)ctx->key1.key_dec, rounds, blocks, 269 + (u8 *)ctx->key2.key_enc, walk.iv, first); 270 + err = blkcipher_walk_done(desc, &walk, 0); 271 + } 272 + kernel_neon_end(); 273 + 274 + return err; 275 + } 276 + 277 + static struct crypto_alg aes_algs[] = { { 278 + .cra_name = "__ecb-aes-" MODE, 279 + .cra_driver_name = "__driver-ecb-aes-" MODE, 280 + .cra_priority = 0, 281 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 282 + .cra_blocksize = AES_BLOCK_SIZE, 283 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 284 + .cra_alignmask = 7, 285 + .cra_type = &crypto_blkcipher_type, 286 + .cra_module = THIS_MODULE, 287 + .cra_blkcipher = { 288 + .min_keysize = AES_MIN_KEY_SIZE, 289 + .max_keysize = AES_MAX_KEY_SIZE, 290 + .ivsize = AES_BLOCK_SIZE, 291 + .setkey = crypto_aes_set_key, 292 + .encrypt = ecb_encrypt, 293 + .decrypt = ecb_decrypt, 294 + }, 295 + }, { 296 
+ .cra_name = "__cbc-aes-" MODE, 297 + .cra_driver_name = "__driver-cbc-aes-" MODE, 298 + .cra_priority = 0, 299 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 300 + .cra_blocksize = AES_BLOCK_SIZE, 301 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 302 + .cra_alignmask = 7, 303 + .cra_type = &crypto_blkcipher_type, 304 + .cra_module = THIS_MODULE, 305 + .cra_blkcipher = { 306 + .min_keysize = AES_MIN_KEY_SIZE, 307 + .max_keysize = AES_MAX_KEY_SIZE, 308 + .ivsize = AES_BLOCK_SIZE, 309 + .setkey = crypto_aes_set_key, 310 + .encrypt = cbc_encrypt, 311 + .decrypt = cbc_decrypt, 312 + }, 313 + }, { 314 + .cra_name = "__ctr-aes-" MODE, 315 + .cra_driver_name = "__driver-ctr-aes-" MODE, 316 + .cra_priority = 0, 317 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 318 + .cra_blocksize = 1, 319 + .cra_ctxsize = sizeof(struct crypto_aes_ctx), 320 + .cra_alignmask = 7, 321 + .cra_type = &crypto_blkcipher_type, 322 + .cra_module = THIS_MODULE, 323 + .cra_blkcipher = { 324 + .min_keysize = AES_MIN_KEY_SIZE, 325 + .max_keysize = AES_MAX_KEY_SIZE, 326 + .ivsize = AES_BLOCK_SIZE, 327 + .setkey = crypto_aes_set_key, 328 + .encrypt = ctr_encrypt, 329 + .decrypt = ctr_encrypt, 330 + }, 331 + }, { 332 + .cra_name = "__xts-aes-" MODE, 333 + .cra_driver_name = "__driver-xts-aes-" MODE, 334 + .cra_priority = 0, 335 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 336 + .cra_blocksize = AES_BLOCK_SIZE, 337 + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), 338 + .cra_alignmask = 7, 339 + .cra_type = &crypto_blkcipher_type, 340 + .cra_module = THIS_MODULE, 341 + .cra_blkcipher = { 342 + .min_keysize = 2 * AES_MIN_KEY_SIZE, 343 + .max_keysize = 2 * AES_MAX_KEY_SIZE, 344 + .ivsize = AES_BLOCK_SIZE, 345 + .setkey = xts_set_key, 346 + .encrypt = xts_encrypt, 347 + .decrypt = xts_decrypt, 348 + }, 349 + }, { 350 + .cra_name = "ecb(aes)", 351 + .cra_driver_name = "ecb-aes-" MODE, 352 + .cra_priority = PRIO, 353 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 354 + .cra_blocksize = AES_BLOCK_SIZE, 355 + .cra_ctxsize = sizeof(struct async_helper_ctx), 356 + .cra_alignmask = 7, 357 + .cra_type = &crypto_ablkcipher_type, 358 + .cra_module = THIS_MODULE, 359 + .cra_init = ablk_init, 360 + .cra_exit = ablk_exit, 361 + .cra_ablkcipher = { 362 + .min_keysize = AES_MIN_KEY_SIZE, 363 + .max_keysize = AES_MAX_KEY_SIZE, 364 + .ivsize = AES_BLOCK_SIZE, 365 + .setkey = ablk_set_key, 366 + .encrypt = ablk_encrypt, 367 + .decrypt = ablk_decrypt, 368 + } 369 + }, { 370 + .cra_name = "cbc(aes)", 371 + .cra_driver_name = "cbc-aes-" MODE, 372 + .cra_priority = PRIO, 373 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 374 + .cra_blocksize = AES_BLOCK_SIZE, 375 + .cra_ctxsize = sizeof(struct async_helper_ctx), 376 + .cra_alignmask = 7, 377 + .cra_type = &crypto_ablkcipher_type, 378 + .cra_module = THIS_MODULE, 379 + .cra_init = ablk_init, 380 + .cra_exit = ablk_exit, 381 + .cra_ablkcipher = { 382 + .min_keysize = AES_MIN_KEY_SIZE, 383 + .max_keysize = AES_MAX_KEY_SIZE, 384 + .ivsize = AES_BLOCK_SIZE, 385 + .setkey = ablk_set_key, 386 + .encrypt = ablk_encrypt, 387 + .decrypt = ablk_decrypt, 388 + } 389 + }, { 390 + .cra_name = "ctr(aes)", 391 + .cra_driver_name = "ctr-aes-" MODE, 392 + .cra_priority = PRIO, 393 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 394 + .cra_blocksize = 1, 395 + .cra_ctxsize = sizeof(struct async_helper_ctx), 396 + .cra_alignmask = 7, 397 + .cra_type = &crypto_ablkcipher_type, 398 + .cra_module = THIS_MODULE, 399 + .cra_init = ablk_init, 400 + .cra_exit = ablk_exit, 401 + .cra_ablkcipher = { 
402 + .min_keysize = AES_MIN_KEY_SIZE, 403 + .max_keysize = AES_MAX_KEY_SIZE, 404 + .ivsize = AES_BLOCK_SIZE, 405 + .setkey = ablk_set_key, 406 + .encrypt = ablk_encrypt, 407 + .decrypt = ablk_decrypt, 408 + } 409 + }, { 410 + .cra_name = "xts(aes)", 411 + .cra_driver_name = "xts-aes-" MODE, 412 + .cra_priority = PRIO, 413 + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, 414 + .cra_blocksize = AES_BLOCK_SIZE, 415 + .cra_ctxsize = sizeof(struct async_helper_ctx), 416 + .cra_alignmask = 7, 417 + .cra_type = &crypto_ablkcipher_type, 418 + .cra_module = THIS_MODULE, 419 + .cra_init = ablk_init, 420 + .cra_exit = ablk_exit, 421 + .cra_ablkcipher = { 422 + .min_keysize = 2 * AES_MIN_KEY_SIZE, 423 + .max_keysize = 2 * AES_MAX_KEY_SIZE, 424 + .ivsize = AES_BLOCK_SIZE, 425 + .setkey = ablk_set_key, 426 + .encrypt = ablk_encrypt, 427 + .decrypt = ablk_decrypt, 428 + } 429 + } }; 430 + 431 + static int __init aes_init(void) 432 + { 433 + return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); 434 + } 435 + 436 + static void __exit aes_exit(void) 437 + { 438 + crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); 439 + } 440 + 441 + #ifdef USE_V8_CRYPTO_EXTENSIONS 442 + module_cpu_feature_match(AES, aes_init); 443 + #else 444 + module_init(aes_init); 445 + #endif 446 + module_exit(aes_exit);
+532
arch/arm64/crypto/aes-modes.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + /* included by aes-ce.S and aes-neon.S */ 12 + 13 + .text 14 + .align 4 15 + 16 + /* 17 + * There are several ways to instantiate this code: 18 + * - no interleave, all inline 19 + * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) 20 + * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) 21 + * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) 22 + * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) 23 + * 24 + * Macros imported by this code: 25 + * - enc_prepare - setup NEON registers for encryption 26 + * - dec_prepare - setup NEON registers for decryption 27 + * - enc_switch_key - change to new key after having prepared for encryption 28 + * - encrypt_block - encrypt a single block 29 + * - decrypt block - decrypt a single block 30 + * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) 31 + * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) 32 + * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) 33 + * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) 34 + */ 35 + 36 + #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) 37 + #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp 38 + #define FRAME_POP ldp x29, x30, [sp],#16 39 + 40 + #if INTERLEAVE == 2 41 + 42 + aes_encrypt_block2x: 43 + encrypt_block2x v0, v1, w3, x2, x6, w7 44 + ret 45 + ENDPROC(aes_encrypt_block2x) 46 + 47 + aes_decrypt_block2x: 48 + decrypt_block2x v0, v1, w3, x2, x6, w7 49 + ret 50 + ENDPROC(aes_decrypt_block2x) 51 + 52 + #elif INTERLEAVE == 4 53 + 54 + aes_encrypt_block4x: 55 + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 56 + ret 57 + ENDPROC(aes_encrypt_block4x) 58 + 59 + aes_decrypt_block4x: 60 + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 61 + ret 62 + ENDPROC(aes_decrypt_block4x) 63 + 64 + #else 65 + #error INTERLEAVE should equal 2 or 4 66 + #endif 67 + 68 + .macro do_encrypt_block2x 69 + bl aes_encrypt_block2x 70 + .endm 71 + 72 + .macro do_decrypt_block2x 73 + bl aes_decrypt_block2x 74 + .endm 75 + 76 + .macro do_encrypt_block4x 77 + bl aes_encrypt_block4x 78 + .endm 79 + 80 + .macro do_decrypt_block4x 81 + bl aes_decrypt_block4x 82 + .endm 83 + 84 + #else 85 + #define FRAME_PUSH 86 + #define FRAME_POP 87 + 88 + .macro do_encrypt_block2x 89 + encrypt_block2x v0, v1, w3, x2, x6, w7 90 + .endm 91 + 92 + .macro do_decrypt_block2x 93 + decrypt_block2x v0, v1, w3, x2, x6, w7 94 + .endm 95 + 96 + .macro do_encrypt_block4x 97 + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 98 + .endm 99 + 100 + .macro do_decrypt_block4x 101 + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 102 + .endm 103 + 104 + #endif 105 + 106 + /* 107 + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 108 + * int blocks, int first) 109 + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 110 + * int blocks, int first) 111 + */ 112 + 113 + AES_ENTRY(aes_ecb_encrypt) 114 + FRAME_PUSH 115 + cbz w5, .LecbencloopNx 116 + 117 + enc_prepare w3, x2, x5 118 + 119 + .LecbencloopNx: 120 + #if INTERLEAVE >= 2 121 + subs w4, w4, #INTERLEAVE 122 + bmi .Lecbenc1x 123 + #if INTERLEAVE == 2 124 + ld1 
{v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ 125 + do_encrypt_block2x 126 + st1 {v0.16b-v1.16b}, [x0], #32 127 + #else 128 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ 129 + do_encrypt_block4x 130 + st1 {v0.16b-v3.16b}, [x0], #64 131 + #endif 132 + b .LecbencloopNx 133 + .Lecbenc1x: 134 + adds w4, w4, #INTERLEAVE 135 + beq .Lecbencout 136 + #endif 137 + .Lecbencloop: 138 + ld1 {v0.16b}, [x1], #16 /* get next pt block */ 139 + encrypt_block v0, w3, x2, x5, w6 140 + st1 {v0.16b}, [x0], #16 141 + subs w4, w4, #1 142 + bne .Lecbencloop 143 + .Lecbencout: 144 + FRAME_POP 145 + ret 146 + AES_ENDPROC(aes_ecb_encrypt) 147 + 148 + 149 + AES_ENTRY(aes_ecb_decrypt) 150 + FRAME_PUSH 151 + cbz w5, .LecbdecloopNx 152 + 153 + dec_prepare w3, x2, x5 154 + 155 + .LecbdecloopNx: 156 + #if INTERLEAVE >= 2 157 + subs w4, w4, #INTERLEAVE 158 + bmi .Lecbdec1x 159 + #if INTERLEAVE == 2 160 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 161 + do_decrypt_block2x 162 + st1 {v0.16b-v1.16b}, [x0], #32 163 + #else 164 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 165 + do_decrypt_block4x 166 + st1 {v0.16b-v3.16b}, [x0], #64 167 + #endif 168 + b .LecbdecloopNx 169 + .Lecbdec1x: 170 + adds w4, w4, #INTERLEAVE 171 + beq .Lecbdecout 172 + #endif 173 + .Lecbdecloop: 174 + ld1 {v0.16b}, [x1], #16 /* get next ct block */ 175 + decrypt_block v0, w3, x2, x5, w6 176 + st1 {v0.16b}, [x0], #16 177 + subs w4, w4, #1 178 + bne .Lecbdecloop 179 + .Lecbdecout: 180 + FRAME_POP 181 + ret 182 + AES_ENDPROC(aes_ecb_decrypt) 183 + 184 + 185 + /* 186 + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 187 + * int blocks, u8 iv[], int first) 188 + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 189 + * int blocks, u8 iv[], int first) 190 + */ 191 + 192 + AES_ENTRY(aes_cbc_encrypt) 193 + cbz w6, .Lcbcencloop 194 + 195 + ld1 {v0.16b}, [x5] /* get iv */ 196 + enc_prepare w3, x2, x5 197 + 198 + .Lcbcencloop: 199 + ld1 {v1.16b}, [x1], #16 /* get next pt block */ 200 + eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ 201 + encrypt_block v0, w3, x2, x5, w6 202 + st1 {v0.16b}, [x0], #16 203 + subs w4, w4, #1 204 + bne .Lcbcencloop 205 + ret 206 + AES_ENDPROC(aes_cbc_encrypt) 207 + 208 + 209 + AES_ENTRY(aes_cbc_decrypt) 210 + FRAME_PUSH 211 + cbz w6, .LcbcdecloopNx 212 + 213 + ld1 {v7.16b}, [x5] /* get iv */ 214 + dec_prepare w3, x2, x5 215 + 216 + .LcbcdecloopNx: 217 + #if INTERLEAVE >= 2 218 + subs w4, w4, #INTERLEAVE 219 + bmi .Lcbcdec1x 220 + #if INTERLEAVE == 2 221 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 222 + mov v2.16b, v0.16b 223 + mov v3.16b, v1.16b 224 + do_decrypt_block2x 225 + eor v0.16b, v0.16b, v7.16b 226 + eor v1.16b, v1.16b, v2.16b 227 + mov v7.16b, v3.16b 228 + st1 {v0.16b-v1.16b}, [x0], #32 229 + #else 230 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 231 + mov v4.16b, v0.16b 232 + mov v5.16b, v1.16b 233 + mov v6.16b, v2.16b 234 + do_decrypt_block4x 235 + sub x1, x1, #16 236 + eor v0.16b, v0.16b, v7.16b 237 + eor v1.16b, v1.16b, v4.16b 238 + ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ 239 + eor v2.16b, v2.16b, v5.16b 240 + eor v3.16b, v3.16b, v6.16b 241 + st1 {v0.16b-v3.16b}, [x0], #64 242 + #endif 243 + b .LcbcdecloopNx 244 + .Lcbcdec1x: 245 + adds w4, w4, #INTERLEAVE 246 + beq .Lcbcdecout 247 + #endif 248 + .Lcbcdecloop: 249 + ld1 {v1.16b}, [x1], #16 /* get next ct block */ 250 + mov v0.16b, v1.16b /* ...and copy to v0 */ 251 + decrypt_block v0, w3, x2, x5, w6 252 + eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ 253 + mov 
v7.16b, v1.16b /* ct is next iv */ 254 + st1 {v0.16b}, [x0], #16 255 + subs w4, w4, #1 256 + bne .Lcbcdecloop 257 + .Lcbcdecout: 258 + FRAME_POP 259 + ret 260 + AES_ENDPROC(aes_cbc_decrypt) 261 + 262 + 263 + /* 264 + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, 265 + * int blocks, u8 ctr[], int first) 266 + */ 267 + 268 + AES_ENTRY(aes_ctr_encrypt) 269 + FRAME_PUSH 270 + cbnz w6, .Lctrfirst /* 1st time around? */ 271 + umov x5, v4.d[1] /* keep swabbed ctr in reg */ 272 + rev x5, x5 273 + #if INTERLEAVE >= 2 274 + cmn w5, w4 /* 32 bit overflow? */ 275 + bcs .Lctrinc 276 + add x5, x5, #1 /* increment BE ctr */ 277 + b .LctrincNx 278 + #else 279 + b .Lctrinc 280 + #endif 281 + .Lctrfirst: 282 + enc_prepare w3, x2, x6 283 + ld1 {v4.16b}, [x5] 284 + umov x5, v4.d[1] /* keep swabbed ctr in reg */ 285 + rev x5, x5 286 + #if INTERLEAVE >= 2 287 + cmn w5, w4 /* 32 bit overflow? */ 288 + bcs .Lctrloop 289 + .LctrloopNx: 290 + subs w4, w4, #INTERLEAVE 291 + bmi .Lctr1x 292 + #if INTERLEAVE == 2 293 + mov v0.8b, v4.8b 294 + mov v1.8b, v4.8b 295 + rev x7, x5 296 + add x5, x5, #1 297 + ins v0.d[1], x7 298 + rev x7, x5 299 + add x5, x5, #1 300 + ins v1.d[1], x7 301 + ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ 302 + do_encrypt_block2x 303 + eor v0.16b, v0.16b, v2.16b 304 + eor v1.16b, v1.16b, v3.16b 305 + st1 {v0.16b-v1.16b}, [x0], #32 306 + #else 307 + ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ 308 + dup v7.4s, w5 309 + mov v0.16b, v4.16b 310 + add v7.4s, v7.4s, v8.4s 311 + mov v1.16b, v4.16b 312 + rev32 v8.16b, v7.16b 313 + mov v2.16b, v4.16b 314 + mov v3.16b, v4.16b 315 + mov v1.s[3], v8.s[0] 316 + mov v2.s[3], v8.s[1] 317 + mov v3.s[3], v8.s[2] 318 + ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ 319 + do_encrypt_block4x 320 + eor v0.16b, v5.16b, v0.16b 321 + ld1 {v5.16b}, [x1], #16 /* get 1 input block */ 322 + eor v1.16b, v6.16b, v1.16b 323 + eor v2.16b, v7.16b, v2.16b 324 + eor v3.16b, v5.16b, v3.16b 325 + st1 {v0.16b-v3.16b}, [x0], #64 326 + add x5, x5, #INTERLEAVE 327 + #endif 328 + cbz w4, .LctroutNx 329 + .LctrincNx: 330 + rev x7, x5 331 + ins v4.d[1], x7 332 + b .LctrloopNx 333 + .LctroutNx: 334 + sub x5, x5, #1 335 + rev x7, x5 336 + ins v4.d[1], x7 337 + b .Lctrout 338 + .Lctr1x: 339 + adds w4, w4, #INTERLEAVE 340 + beq .Lctrout 341 + #endif 342 + .Lctrloop: 343 + mov v0.16b, v4.16b 344 + encrypt_block v0, w3, x2, x6, w7 345 + subs w4, w4, #1 346 + bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ 347 + ld1 {v3.16b}, [x1], #16 348 + eor v3.16b, v0.16b, v3.16b 349 + st1 {v3.16b}, [x0], #16 350 + beq .Lctrout 351 + .Lctrinc: 352 + adds x5, x5, #1 /* increment BE ctr */ 353 + rev x7, x5 354 + ins v4.d[1], x7 355 + bcc .Lctrloop /* no overflow? */ 356 + umov x7, v4.d[0] /* load upper word of ctr */ 357 + rev x7, x7 /* ... 
to handle the carry */ 358 + add x7, x7, #1 359 + rev x7, x7 360 + ins v4.d[0], x7 361 + b .Lctrloop 362 + .Lctrhalfblock: 363 + ld1 {v3.8b}, [x1] 364 + eor v3.8b, v0.8b, v3.8b 365 + st1 {v3.8b}, [x0] 366 + .Lctrout: 367 + FRAME_POP 368 + ret 369 + AES_ENDPROC(aes_ctr_encrypt) 370 + .ltorg 371 + 372 + 373 + /* 374 + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, 375 + * int blocks, u8 const rk2[], u8 iv[], int first) 376 + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, 377 + * int blocks, u8 const rk2[], u8 iv[], int first) 378 + */ 379 + 380 + .macro next_tweak, out, in, const, tmp 381 + sshr \tmp\().2d, \in\().2d, #63 382 + and \tmp\().16b, \tmp\().16b, \const\().16b 383 + add \out\().2d, \in\().2d, \in\().2d 384 + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 385 + eor \out\().16b, \out\().16b, \tmp\().16b 386 + .endm 387 + 388 + .Lxts_mul_x: 389 + .word 1, 0, 0x87, 0 390 + 391 + AES_ENTRY(aes_xts_encrypt) 392 + FRAME_PUSH 393 + cbz w7, .LxtsencloopNx 394 + 395 + ld1 {v4.16b}, [x6] 396 + enc_prepare w3, x5, x6 397 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ 398 + enc_switch_key w3, x2, x6 399 + ldr q7, .Lxts_mul_x 400 + b .LxtsencNx 401 + 402 + .LxtsencloopNx: 403 + ldr q7, .Lxts_mul_x 404 + next_tweak v4, v4, v7, v8 405 + .LxtsencNx: 406 + #if INTERLEAVE >= 2 407 + subs w4, w4, #INTERLEAVE 408 + bmi .Lxtsenc1x 409 + #if INTERLEAVE == 2 410 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ 411 + next_tweak v5, v4, v7, v8 412 + eor v0.16b, v0.16b, v4.16b 413 + eor v1.16b, v1.16b, v5.16b 414 + do_encrypt_block2x 415 + eor v0.16b, v0.16b, v4.16b 416 + eor v1.16b, v1.16b, v5.16b 417 + st1 {v0.16b-v1.16b}, [x0], #32 418 + cbz w4, .LxtsencoutNx 419 + next_tweak v4, v5, v7, v8 420 + b .LxtsencNx 421 + .LxtsencoutNx: 422 + mov v4.16b, v5.16b 423 + b .Lxtsencout 424 + #else 425 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ 426 + next_tweak v5, v4, v7, v8 427 + eor v0.16b, v0.16b, v4.16b 428 + next_tweak v6, v5, v7, v8 429 + eor v1.16b, v1.16b, v5.16b 430 + eor v2.16b, v2.16b, v6.16b 431 + next_tweak v7, v6, v7, v8 432 + eor v3.16b, v3.16b, v7.16b 433 + do_encrypt_block4x 434 + eor v3.16b, v3.16b, v7.16b 435 + eor v0.16b, v0.16b, v4.16b 436 + eor v1.16b, v1.16b, v5.16b 437 + eor v2.16b, v2.16b, v6.16b 438 + st1 {v0.16b-v3.16b}, [x0], #64 439 + mov v4.16b, v7.16b 440 + cbz w4, .Lxtsencout 441 + b .LxtsencloopNx 442 + #endif 443 + .Lxtsenc1x: 444 + adds w4, w4, #INTERLEAVE 445 + beq .Lxtsencout 446 + #endif 447 + .Lxtsencloop: 448 + ld1 {v1.16b}, [x1], #16 449 + eor v0.16b, v1.16b, v4.16b 450 + encrypt_block v0, w3, x2, x6, w7 451 + eor v0.16b, v0.16b, v4.16b 452 + st1 {v0.16b}, [x0], #16 453 + subs w4, w4, #1 454 + beq .Lxtsencout 455 + next_tweak v4, v4, v7, v8 456 + b .Lxtsencloop 457 + .Lxtsencout: 458 + FRAME_POP 459 + ret 460 + AES_ENDPROC(aes_xts_encrypt) 461 + 462 + 463 + AES_ENTRY(aes_xts_decrypt) 464 + FRAME_PUSH 465 + cbz w7, .LxtsdecloopNx 466 + 467 + ld1 {v4.16b}, [x6] 468 + enc_prepare w3, x5, x6 469 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ 470 + dec_prepare w3, x2, x6 471 + ldr q7, .Lxts_mul_x 472 + b .LxtsdecNx 473 + 474 + .LxtsdecloopNx: 475 + ldr q7, .Lxts_mul_x 476 + next_tweak v4, v4, v7, v8 477 + .LxtsdecNx: 478 + #if INTERLEAVE >= 2 479 + subs w4, w4, #INTERLEAVE 480 + bmi .Lxtsdec1x 481 + #if INTERLEAVE == 2 482 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ 483 + next_tweak v5, v4, v7, v8 484 + eor v0.16b, v0.16b, v4.16b 485 + eor v1.16b, v1.16b, v5.16b 486 + do_decrypt_block2x 487 + 
eor v0.16b, v0.16b, v4.16b 488 + eor v1.16b, v1.16b, v5.16b 489 + st1 {v0.16b-v1.16b}, [x0], #32 490 + cbz w4, .LxtsdecoutNx 491 + next_tweak v4, v5, v7, v8 492 + b .LxtsdecNx 493 + .LxtsdecoutNx: 494 + mov v4.16b, v5.16b 495 + b .Lxtsdecout 496 + #else 497 + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ 498 + next_tweak v5, v4, v7, v8 499 + eor v0.16b, v0.16b, v4.16b 500 + next_tweak v6, v5, v7, v8 501 + eor v1.16b, v1.16b, v5.16b 502 + eor v2.16b, v2.16b, v6.16b 503 + next_tweak v7, v6, v7, v8 504 + eor v3.16b, v3.16b, v7.16b 505 + do_decrypt_block4x 506 + eor v3.16b, v3.16b, v7.16b 507 + eor v0.16b, v0.16b, v4.16b 508 + eor v1.16b, v1.16b, v5.16b 509 + eor v2.16b, v2.16b, v6.16b 510 + st1 {v0.16b-v3.16b}, [x0], #64 511 + mov v4.16b, v7.16b 512 + cbz w4, .Lxtsdecout 513 + b .LxtsdecloopNx 514 + #endif 515 + .Lxtsdec1x: 516 + adds w4, w4, #INTERLEAVE 517 + beq .Lxtsdecout 518 + #endif 519 + .Lxtsdecloop: 520 + ld1 {v1.16b}, [x1], #16 521 + eor v0.16b, v1.16b, v4.16b 522 + decrypt_block v0, w3, x2, x6, w7 523 + eor v0.16b, v0.16b, v4.16b 524 + st1 {v0.16b}, [x0], #16 525 + subs w4, w4, #1 526 + beq .Lxtsdecout 527 + next_tweak v4, v4, v7, v8 528 + b .Lxtsdecloop 529 + .Lxtsdecout: 530 + FRAME_POP 531 + ret 532 + AES_ENDPROC(aes_xts_decrypt)
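The next_tweak macro above implements the usual XTS tweak update, i.e. multiplication of the 128-bit tweak by x in GF(2^128) using the 0x87 feedback constant loaded from .Lxts_mul_x. A rough plain-C equivalent (illustrative only, not part of the patch; the function name and the two-word little-endian layout are assumptions):

	#include <stdint.h>

	/* Multiply a 128-bit XTS tweak by x in GF(2^128); t[0] holds the low
	 * 64 bits, t[1] the high 64 bits. */
	static void xts_next_tweak(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;			/* bit shifted out of the top */

		t[1] = (t[1] << 1) | (t[0] >> 63);		/* 128-bit shift left by one */
		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);	/* fold back via x^128 = x^7 + x^2 + x + 1 */
	}

The assembly does the same thing lane-wise: sshr/and produce the conditional 0x87 (and the 1 for the cross-lane carry), ext swaps the two 64-bit lanes, and add/eor perform the doubling and the fold.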
+382
arch/arm64/crypto/aes-neon.S
··· 1 + /* 2 + * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON 3 + * 4 + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + 13 + #define AES_ENTRY(func) ENTRY(neon_ ## func) 14 + #define AES_ENDPROC(func) ENDPROC(neon_ ## func) 15 + 16 + /* multiply by polynomial 'x' in GF(2^8) */ 17 + .macro mul_by_x, out, in, temp, const 18 + sshr \temp, \in, #7 19 + add \out, \in, \in 20 + and \temp, \temp, \const 21 + eor \out, \out, \temp 22 + .endm 23 + 24 + /* preload the entire Sbox */ 25 + .macro prepare, sbox, shiftrows, temp 26 + adr \temp, \sbox 27 + movi v12.16b, #0x40 28 + ldr q13, \shiftrows 29 + movi v14.16b, #0x1b 30 + ld1 {v16.16b-v19.16b}, [\temp], #64 31 + ld1 {v20.16b-v23.16b}, [\temp], #64 32 + ld1 {v24.16b-v27.16b}, [\temp], #64 33 + ld1 {v28.16b-v31.16b}, [\temp] 34 + .endm 35 + 36 + /* do preload for encryption */ 37 + .macro enc_prepare, ignore0, ignore1, temp 38 + prepare .LForward_Sbox, .LForward_ShiftRows, \temp 39 + .endm 40 + 41 + .macro enc_switch_key, ignore0, ignore1, temp 42 + /* do nothing */ 43 + .endm 44 + 45 + /* do preload for decryption */ 46 + .macro dec_prepare, ignore0, ignore1, temp 47 + prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp 48 + .endm 49 + 50 + /* apply SubBytes transformation using the the preloaded Sbox */ 51 + .macro sub_bytes, in 52 + sub v9.16b, \in\().16b, v12.16b 53 + tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b 54 + sub v10.16b, v9.16b, v12.16b 55 + tbx \in\().16b, {v20.16b-v23.16b}, v9.16b 56 + sub v11.16b, v10.16b, v12.16b 57 + tbx \in\().16b, {v24.16b-v27.16b}, v10.16b 58 + tbx \in\().16b, {v28.16b-v31.16b}, v11.16b 59 + .endm 60 + 61 + /* apply MixColumns transformation */ 62 + .macro mix_columns, in 63 + mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b 64 + rev32 v8.8h, \in\().8h 65 + eor \in\().16b, v10.16b, \in\().16b 66 + shl v9.4s, v8.4s, #24 67 + shl v11.4s, \in\().4s, #24 68 + sri v9.4s, v8.4s, #8 69 + sri v11.4s, \in\().4s, #8 70 + eor v9.16b, v9.16b, v8.16b 71 + eor v10.16b, v10.16b, v9.16b 72 + eor \in\().16b, v10.16b, v11.16b 73 + .endm 74 + 75 + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ 76 + .macro inv_mix_columns, in 77 + mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b 78 + mul_by_x v11.16b, v11.16b, v10.16b, v14.16b 79 + eor \in\().16b, \in\().16b, v11.16b 80 + rev32 v11.8h, v11.8h 81 + eor \in\().16b, \in\().16b, v11.16b 82 + mix_columns \in 83 + .endm 84 + 85 + .macro do_block, enc, in, rounds, rk, rkp, i 86 + ld1 {v15.16b}, [\rk] 87 + add \rkp, \rk, #16 88 + mov \i, \rounds 89 + 1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 90 + tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ 91 + sub_bytes \in 92 + ld1 {v15.16b}, [\rkp], #16 93 + subs \i, \i, #1 94 + beq 2222f 95 + .if \enc == 1 96 + mix_columns \in 97 + .else 98 + inv_mix_columns \in 99 + .endif 100 + b 1111b 101 + 2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ 102 + .endm 103 + 104 + .macro encrypt_block, in, rounds, rk, rkp, i 105 + do_block 1, \in, \rounds, \rk, \rkp, \i 106 + .endm 107 + 108 + .macro decrypt_block, in, rounds, rk, rkp, i 109 + do_block 0, \in, \rounds, \rk, \rkp, \i 110 + .endm 111 + 112 + /* 113 + * Interleaved versions: functionally equivalent to the 114 + * ones above, but applied to 2 or 4 AES states in parallel. 
115 + */ 116 + 117 + .macro sub_bytes_2x, in0, in1 118 + sub v8.16b, \in0\().16b, v12.16b 119 + sub v9.16b, \in1\().16b, v12.16b 120 + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b 121 + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b 122 + sub v10.16b, v8.16b, v12.16b 123 + sub v11.16b, v9.16b, v12.16b 124 + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b 125 + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b 126 + sub v8.16b, v10.16b, v12.16b 127 + sub v9.16b, v11.16b, v12.16b 128 + tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b 129 + tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b 130 + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b 131 + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b 132 + .endm 133 + 134 + .macro sub_bytes_4x, in0, in1, in2, in3 135 + sub v8.16b, \in0\().16b, v12.16b 136 + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b 137 + sub v9.16b, \in1\().16b, v12.16b 138 + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b 139 + sub v10.16b, \in2\().16b, v12.16b 140 + tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b 141 + sub v11.16b, \in3\().16b, v12.16b 142 + tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b 143 + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b 144 + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b 145 + sub v8.16b, v8.16b, v12.16b 146 + tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b 147 + sub v9.16b, v9.16b, v12.16b 148 + tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b 149 + sub v10.16b, v10.16b, v12.16b 150 + tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b 151 + sub v11.16b, v11.16b, v12.16b 152 + tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b 153 + sub v8.16b, v8.16b, v12.16b 154 + tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b 155 + sub v9.16b, v9.16b, v12.16b 156 + tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b 157 + sub v10.16b, v10.16b, v12.16b 158 + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b 159 + sub v11.16b, v11.16b, v12.16b 160 + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b 161 + tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b 162 + tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b 163 + .endm 164 + 165 + .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const 166 + sshr \tmp0\().16b, \in0\().16b, #7 167 + add \out0\().16b, \in0\().16b, \in0\().16b 168 + sshr \tmp1\().16b, \in1\().16b, #7 169 + and \tmp0\().16b, \tmp0\().16b, \const\().16b 170 + add \out1\().16b, \in1\().16b, \in1\().16b 171 + and \tmp1\().16b, \tmp1\().16b, \const\().16b 172 + eor \out0\().16b, \out0\().16b, \tmp0\().16b 173 + eor \out1\().16b, \out1\().16b, \tmp1\().16b 174 + .endm 175 + 176 + .macro mix_columns_2x, in0, in1 177 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 178 + rev32 v10.8h, \in0\().8h 179 + rev32 v11.8h, \in1\().8h 180 + eor \in0\().16b, v8.16b, \in0\().16b 181 + eor \in1\().16b, v9.16b, \in1\().16b 182 + shl v12.4s, v10.4s, #24 183 + shl v13.4s, v11.4s, #24 184 + eor v8.16b, v8.16b, v10.16b 185 + sri v12.4s, v10.4s, #8 186 + shl v10.4s, \in0\().4s, #24 187 + eor v9.16b, v9.16b, v11.16b 188 + sri v13.4s, v11.4s, #8 189 + shl v11.4s, \in1\().4s, #24 190 + sri v10.4s, \in0\().4s, #8 191 + eor \in0\().16b, v8.16b, v12.16b 192 + sri v11.4s, \in1\().4s, #8 193 + eor \in1\().16b, v9.16b, v13.16b 194 + eor \in0\().16b, v10.16b, \in0\().16b 195 + eor \in1\().16b, v11.16b, \in1\().16b 196 + .endm 197 + 198 + .macro inv_mix_cols_2x, in0, in1 199 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 200 + mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 201 + eor \in0\().16b, \in0\().16b, v8.16b 202 + eor \in1\().16b, \in1\().16b, v9.16b 203 + rev32 v8.8h, v8.8h 204 + rev32 v9.8h, v9.8h 205 + eor \in0\().16b, \in0\().16b, v8.16b 206 + eor 
\in1\().16b, \in1\().16b, v9.16b 207 + mix_columns_2x \in0, \in1 208 + .endm 209 + 210 + .macro inv_mix_cols_4x, in0, in1, in2, in3 211 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 212 + mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 213 + mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 214 + mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 215 + eor \in0\().16b, \in0\().16b, v8.16b 216 + eor \in1\().16b, \in1\().16b, v9.16b 217 + eor \in2\().16b, \in2\().16b, v10.16b 218 + eor \in3\().16b, \in3\().16b, v11.16b 219 + rev32 v8.8h, v8.8h 220 + rev32 v9.8h, v9.8h 221 + rev32 v10.8h, v10.8h 222 + rev32 v11.8h, v11.8h 223 + eor \in0\().16b, \in0\().16b, v8.16b 224 + eor \in1\().16b, \in1\().16b, v9.16b 225 + eor \in2\().16b, \in2\().16b, v10.16b 226 + eor \in3\().16b, \in3\().16b, v11.16b 227 + mix_columns_2x \in0, \in1 228 + mix_columns_2x \in2, \in3 229 + .endm 230 + 231 + .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i 232 + ld1 {v15.16b}, [\rk] 233 + add \rkp, \rk, #16 234 + mov \i, \rounds 235 + 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 236 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 237 + sub_bytes_2x \in0, \in1 238 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ 239 + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ 240 + ld1 {v15.16b}, [\rkp], #16 241 + subs \i, \i, #1 242 + beq 2222f 243 + .if \enc == 1 244 + mix_columns_2x \in0, \in1 245 + ldr q13, .LForward_ShiftRows 246 + .else 247 + inv_mix_cols_2x \in0, \in1 248 + ldr q13, .LReverse_ShiftRows 249 + .endif 250 + movi v12.16b, #0x40 251 + b 1111b 252 + 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 253 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 254 + .endm 255 + 256 + .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i 257 + ld1 {v15.16b}, [\rk] 258 + add \rkp, \rk, #16 259 + mov \i, \rounds 260 + 1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 261 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 262 + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 263 + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ 264 + sub_bytes_4x \in0, \in1, \in2, \in3 265 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ 266 + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ 267 + tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ 268 + tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ 269 + ld1 {v15.16b}, [\rkp], #16 270 + subs \i, \i, #1 271 + beq 2222f 272 + .if \enc == 1 273 + mix_columns_2x \in0, \in1 274 + mix_columns_2x \in2, \in3 275 + ldr q13, .LForward_ShiftRows 276 + .else 277 + inv_mix_cols_4x \in0, \in1, \in2, \in3 278 + ldr q13, .LReverse_ShiftRows 279 + .endif 280 + movi v12.16b, #0x40 281 + b 1111b 282 + 2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ 283 + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ 284 + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ 285 + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ 286 + .endm 287 + 288 + .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i 289 + do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i 290 + .endm 291 + 292 + .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i 293 + do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i 294 + .endm 295 + 296 + .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i 297 + do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i 298 + .endm 299 + 300 + .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i 301 + do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, 
\rk, \rkp, \i 302 + .endm 303 + 304 + #include "aes-modes.S" 305 + 306 + .text 307 + .align 4 308 + .LForward_ShiftRows: 309 + .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 310 + .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb 311 + 312 + .LReverse_ShiftRows: 313 + .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb 314 + .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 315 + 316 + .LForward_Sbox: 317 + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 318 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 319 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 320 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 321 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc 322 + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 323 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a 324 + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 325 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 326 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 327 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b 328 + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf 329 + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 330 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 331 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 332 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 333 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 334 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 335 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 336 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb 337 + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c 338 + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 339 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 340 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 341 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 342 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a 343 + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e 344 + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e 345 + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 346 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf 347 + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 348 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 349 + 350 + .LReverse_Sbox: 351 + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 352 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb 353 + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 354 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb 355 + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d 356 + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e 357 + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 358 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 359 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 360 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 361 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda 362 + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 363 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a 364 + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 365 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 366 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b 367 + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea 368 + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 369 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 370 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e 371 + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 372 + .byte 
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b 373 + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 374 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 375 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 376 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f 377 + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d 378 + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef 379 + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 380 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 381 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 382 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
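The mul_by_x macro at the top of this file relies on a branchless trick: an arithmetic shift right by 7 turns the top bit of each byte into an all-ones or all-zero mask, which then selects the 0x1b reduction constant. A byte-at-a-time C sketch of the same operation (illustrative; the function name is made up):

	#include <stdint.h>

	/* Multiply one GF(2^8) element by x, reducing by x^8 + x^4 + x^3 + x + 1. */
	static uint8_t gf256_mul_by_x(uint8_t b)
	{
		uint8_t mask = (uint8_t)((int8_t)b >> 7);	/* 0xff if bit 7 is set, else 0x00 */

		return (uint8_t)((b << 1) ^ (mask & 0x1b));
	}

The NEON macro applies this to all 16 bytes of a vector at once, which is what mix_columns and inv_mix_columns build on.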
+95
arch/arm64/crypto/ghash-ce-core.S
··· 1 + /* 2 + * Accelerated GHASH implementation with ARMv8 PMULL instructions. 3 + * 4 + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S 7 + * 8 + * Copyright (c) 2009 Intel Corp. 9 + * Author: Huang Ying <ying.huang@intel.com> 10 + * Vinodh Gopal 11 + * Erdinc Ozturk 12 + * Deniz Karakoyunlu 13 + * 14 + * This program is free software; you can redistribute it and/or modify it 15 + * under the terms of the GNU General Public License version 2 as published 16 + * by the Free Software Foundation. 17 + */ 18 + 19 + #include <linux/linkage.h> 20 + #include <asm/assembler.h> 21 + 22 + DATA .req v0 23 + SHASH .req v1 24 + IN1 .req v2 25 + T1 .req v2 26 + T2 .req v3 27 + T3 .req v4 28 + VZR .req v5 29 + 30 + .text 31 + .arch armv8-a+crypto 32 + 33 + /* 34 + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, 35 + * struct ghash_key const *k, const char *head) 36 + */ 37 + ENTRY(pmull_ghash_update) 38 + ld1 {DATA.16b}, [x1] 39 + ld1 {SHASH.16b}, [x3] 40 + eor VZR.16b, VZR.16b, VZR.16b 41 + 42 + /* do the head block first, if supplied */ 43 + cbz x4, 0f 44 + ld1 {IN1.2d}, [x4] 45 + b 1f 46 + 47 + 0: ld1 {IN1.2d}, [x2], #16 48 + sub w0, w0, #1 49 + 1: ext IN1.16b, IN1.16b, IN1.16b, #8 50 + CPU_LE( rev64 IN1.16b, IN1.16b ) 51 + eor DATA.16b, DATA.16b, IN1.16b 52 + 53 + /* multiply DATA by SHASH in GF(2^128) */ 54 + ext T2.16b, DATA.16b, DATA.16b, #8 55 + ext T3.16b, SHASH.16b, SHASH.16b, #8 56 + eor T2.16b, T2.16b, DATA.16b 57 + eor T3.16b, T3.16b, SHASH.16b 58 + 59 + pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 60 + pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 61 + pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) 62 + eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) 63 + eor T2.16b, T2.16b, DATA.16b 64 + 65 + ext T3.16b, VZR.16b, T2.16b, #8 66 + ext T2.16b, T2.16b, VZR.16b, #8 67 + eor DATA.16b, DATA.16b, T3.16b 68 + eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of 69 + // carry-less multiplication 70 + 71 + /* first phase of the reduction */ 72 + shl T3.2d, DATA.2d, #1 73 + eor T3.16b, T3.16b, DATA.16b 74 + shl T3.2d, T3.2d, #5 75 + eor T3.16b, T3.16b, DATA.16b 76 + shl T3.2d, T3.2d, #57 77 + ext T2.16b, VZR.16b, T3.16b, #8 78 + ext T3.16b, T3.16b, VZR.16b, #8 79 + eor DATA.16b, DATA.16b, T2.16b 80 + eor T1.16b, T1.16b, T3.16b 81 + 82 + /* second phase of the reduction */ 83 + ushr T2.2d, DATA.2d, #5 84 + eor T2.16b, T2.16b, DATA.16b 85 + ushr T2.2d, T2.2d, #1 86 + eor T2.16b, T2.16b, DATA.16b 87 + ushr T2.2d, T2.2d, #1 88 + eor T1.16b, T1.16b, T2.16b 89 + eor DATA.16b, DATA.16b, T1.16b 90 + 91 + cbnz w0, 0b 92 + 93 + st1 {DATA.16b}, [x1] 94 + ret 95 + ENDPROC(pmull_ghash_update)
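The multiply sequence above uses three PMULL operations instead of four by the usual Karatsuba rearrangement: with the operands split as a = a1:a0 and b = b1:b0, it computes a1*b1, a0*b0 and (a1 ^ a0)*(b1 ^ b0), and recovers the middle term from

	(a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0 = a1*b0 ^ a0*b1

with all products carry-less. The two shl/eor phases that follow fold the 256-bit product back down modulo the GHASH field polynomial x^128 + x^7 + x^2 + x + 1.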
+155
arch/arm64/crypto/ghash-ce-glue.c
··· 1 + /* 2 + * Accelerated GHASH implementation with ARMv8 PMULL instructions. 3 + * 4 + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License version 2 as published 8 + * by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <linux/cpufeature.h> 15 + #include <linux/crypto.h> 16 + #include <linux/module.h> 17 + 18 + MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); 19 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 20 + MODULE_LICENSE("GPL v2"); 21 + 22 + #define GHASH_BLOCK_SIZE 16 23 + #define GHASH_DIGEST_SIZE 16 24 + 25 + struct ghash_key { 26 + u64 a; 27 + u64 b; 28 + }; 29 + 30 + struct ghash_desc_ctx { 31 + u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; 32 + u8 buf[GHASH_BLOCK_SIZE]; 33 + u32 count; 34 + }; 35 + 36 + asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, 37 + struct ghash_key const *k, const char *head); 38 + 39 + static int ghash_init(struct shash_desc *desc) 40 + { 41 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 42 + 43 + *ctx = (struct ghash_desc_ctx){}; 44 + return 0; 45 + } 46 + 47 + static int ghash_update(struct shash_desc *desc, const u8 *src, 48 + unsigned int len) 49 + { 50 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 51 + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; 52 + 53 + ctx->count += len; 54 + 55 + if ((partial + len) >= GHASH_BLOCK_SIZE) { 56 + struct ghash_key *key = crypto_shash_ctx(desc->tfm); 57 + int blocks; 58 + 59 + if (partial) { 60 + int p = GHASH_BLOCK_SIZE - partial; 61 + 62 + memcpy(ctx->buf + partial, src, p); 63 + src += p; 64 + len -= p; 65 + } 66 + 67 + blocks = len / GHASH_BLOCK_SIZE; 68 + len %= GHASH_BLOCK_SIZE; 69 + 70 + kernel_neon_begin_partial(6); 71 + pmull_ghash_update(blocks, ctx->digest, src, key, 72 + partial ? 
ctx->buf : NULL); 73 + kernel_neon_end(); 74 + src += blocks * GHASH_BLOCK_SIZE; 75 + } 76 + if (len) 77 + memcpy(ctx->buf + partial, src, len); 78 + return 0; 79 + } 80 + 81 + static int ghash_final(struct shash_desc *desc, u8 *dst) 82 + { 83 + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); 84 + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; 85 + 86 + if (partial) { 87 + struct ghash_key *key = crypto_shash_ctx(desc->tfm); 88 + 89 + memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); 90 + 91 + kernel_neon_begin_partial(6); 92 + pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); 93 + kernel_neon_end(); 94 + } 95 + put_unaligned_be64(ctx->digest[1], dst); 96 + put_unaligned_be64(ctx->digest[0], dst + 8); 97 + 98 + *ctx = (struct ghash_desc_ctx){}; 99 + return 0; 100 + } 101 + 102 + static int ghash_setkey(struct crypto_shash *tfm, 103 + const u8 *inkey, unsigned int keylen) 104 + { 105 + struct ghash_key *key = crypto_shash_ctx(tfm); 106 + u64 a, b; 107 + 108 + if (keylen != GHASH_BLOCK_SIZE) { 109 + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); 110 + return -EINVAL; 111 + } 112 + 113 + /* perform multiplication by 'x' in GF(2^128) */ 114 + b = get_unaligned_be64(inkey); 115 + a = get_unaligned_be64(inkey + 8); 116 + 117 + key->a = (a << 1) | (b >> 63); 118 + key->b = (b << 1) | (a >> 63); 119 + 120 + if (b >> 63) 121 + key->b ^= 0xc200000000000000UL; 122 + 123 + return 0; 124 + } 125 + 126 + static struct shash_alg ghash_alg = { 127 + .digestsize = GHASH_DIGEST_SIZE, 128 + .init = ghash_init, 129 + .update = ghash_update, 130 + .final = ghash_final, 131 + .setkey = ghash_setkey, 132 + .descsize = sizeof(struct ghash_desc_ctx), 133 + .base = { 134 + .cra_name = "ghash", 135 + .cra_driver_name = "ghash-ce", 136 + .cra_priority = 200, 137 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 138 + .cra_blocksize = GHASH_BLOCK_SIZE, 139 + .cra_ctxsize = sizeof(struct ghash_key), 140 + .cra_module = THIS_MODULE, 141 + }, 142 + }; 143 + 144 + static int __init ghash_ce_mod_init(void) 145 + { 146 + return crypto_register_shash(&ghash_alg); 147 + } 148 + 149 + static void __exit ghash_ce_mod_exit(void) 150 + { 151 + crypto_unregister_shash(&ghash_alg); 152 + } 153 + 154 + module_cpu_feature_match(PMULL, ghash_ce_mod_init); 155 + module_exit(ghash_ce_mod_exit);
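kernel_neon_begin_partial(6) asks the FPSIMD code to preserve only the bottom six NEON registers, which appears to be exactly what the assembly routine touches (DATA = v0, SHASH = v1, IN1/T1 = v2, T2 = v3, T3 = v4, VZR = v5 per the .req aliases), so the save/restore cost stays proportional to what pmull_ghash_update actually clobbers rather than the full register file.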
+153
arch/arm64/crypto/sha1-ce-core.S
··· 1 + /* 2 + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + #include <asm/assembler.h> 13 + 14 + .text 15 + .arch armv8-a+crypto 16 + 17 + k0 .req v0 18 + k1 .req v1 19 + k2 .req v2 20 + k3 .req v3 21 + 22 + t0 .req v4 23 + t1 .req v5 24 + 25 + dga .req q6 26 + dgav .req v6 27 + dgb .req s7 28 + dgbv .req v7 29 + 30 + dg0q .req q12 31 + dg0s .req s12 32 + dg0v .req v12 33 + dg1s .req s13 34 + dg1v .req v13 35 + dg2s .req s14 36 + 37 + .macro add_only, op, ev, rc, s0, dg1 38 + .ifc \ev, ev 39 + add t1.4s, v\s0\().4s, \rc\().4s 40 + sha1h dg2s, dg0s 41 + .ifnb \dg1 42 + sha1\op dg0q, \dg1, t0.4s 43 + .else 44 + sha1\op dg0q, dg1s, t0.4s 45 + .endif 46 + .else 47 + .ifnb \s0 48 + add t0.4s, v\s0\().4s, \rc\().4s 49 + .endif 50 + sha1h dg1s, dg0s 51 + sha1\op dg0q, dg2s, t1.4s 52 + .endif 53 + .endm 54 + 55 + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 56 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s 57 + add_only \op, \ev, \rc, \s1, \dg1 58 + sha1su1 v\s0\().4s, v\s3\().4s 59 + .endm 60 + 61 + /* 62 + * The SHA1 round constants 63 + */ 64 + .align 4 65 + .Lsha1_rcon: 66 + .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 67 + 68 + /* 69 + * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, 70 + * u8 *head, long bytes) 71 + */ 72 + ENTRY(sha1_ce_transform) 73 + /* load round constants */ 74 + adr x6, .Lsha1_rcon 75 + ld1r {k0.4s}, [x6], #4 76 + ld1r {k1.4s}, [x6], #4 77 + ld1r {k2.4s}, [x6], #4 78 + ld1r {k3.4s}, [x6] 79 + 80 + /* load state */ 81 + ldr dga, [x2] 82 + ldr dgb, [x2, #16] 83 + 84 + /* load partial state (if supplied) */ 85 + cbz x3, 0f 86 + ld1 {v8.4s-v11.4s}, [x3] 87 + b 1f 88 + 89 + /* load input */ 90 + 0: ld1 {v8.4s-v11.4s}, [x1], #64 91 + sub w0, w0, #1 92 + 93 + 1: 94 + CPU_LE( rev32 v8.16b, v8.16b ) 95 + CPU_LE( rev32 v9.16b, v9.16b ) 96 + CPU_LE( rev32 v10.16b, v10.16b ) 97 + CPU_LE( rev32 v11.16b, v11.16b ) 98 + 99 + 2: add t0.4s, v8.4s, k0.4s 100 + mov dg0v.16b, dgav.16b 101 + 102 + add_update c, ev, k0, 8, 9, 10, 11, dgb 103 + add_update c, od, k0, 9, 10, 11, 8 104 + add_update c, ev, k0, 10, 11, 8, 9 105 + add_update c, od, k0, 11, 8, 9, 10 106 + add_update c, ev, k1, 8, 9, 10, 11 107 + 108 + add_update p, od, k1, 9, 10, 11, 8 109 + add_update p, ev, k1, 10, 11, 8, 9 110 + add_update p, od, k1, 11, 8, 9, 10 111 + add_update p, ev, k1, 8, 9, 10, 11 112 + add_update p, od, k2, 9, 10, 11, 8 113 + 114 + add_update m, ev, k2, 10, 11, 8, 9 115 + add_update m, od, k2, 11, 8, 9, 10 116 + add_update m, ev, k2, 8, 9, 10, 11 117 + add_update m, od, k2, 9, 10, 11, 8 118 + add_update m, ev, k3, 10, 11, 8, 9 119 + 120 + add_update p, od, k3, 11, 8, 9, 10 121 + add_only p, ev, k3, 9 122 + add_only p, od, k3, 10 123 + add_only p, ev, k3, 11 124 + add_only p, od 125 + 126 + /* update state */ 127 + add dgbv.2s, dgbv.2s, dg1v.2s 128 + add dgav.4s, dgav.4s, dg0v.4s 129 + 130 + cbnz w0, 0b 131 + 132 + /* 133 + * Final block: add padding and total bit count. 134 + * Skip if we have no total byte count in x4. In that case, the input 135 + * size was not a round multiple of the block size, and the padding is 136 + * handled by the C code. 
137 + */ 138 + cbz x4, 3f 139 + movi v9.2d, #0 140 + mov x8, #0x80000000 141 + movi v10.2d, #0 142 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) 143 + fmov d8, x8 144 + mov x4, #0 145 + mov v11.d[0], xzr 146 + mov v11.d[1], x7 147 + b 2b 148 + 149 + /* store new state */ 150 + 3: str dga, [x2] 151 + str dgb, [x2, #16] 152 + ret 153 + ENDPROC(sha1_ce_transform)
+174
arch/arm64/crypto/sha1-ce-glue.c
··· 1 + /* 2 + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <crypto/sha.h> 15 + #include <linux/cpufeature.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); 20 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 21 + MODULE_LICENSE("GPL v2"); 22 + 23 + asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, 24 + u8 *head, long bytes); 25 + 26 + static int sha1_init(struct shash_desc *desc) 27 + { 28 + struct sha1_state *sctx = shash_desc_ctx(desc); 29 + 30 + *sctx = (struct sha1_state){ 31 + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, 32 + }; 33 + return 0; 34 + } 35 + 36 + static int sha1_update(struct shash_desc *desc, const u8 *data, 37 + unsigned int len) 38 + { 39 + struct sha1_state *sctx = shash_desc_ctx(desc); 40 + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; 41 + 42 + sctx->count += len; 43 + 44 + if ((partial + len) >= SHA1_BLOCK_SIZE) { 45 + int blocks; 46 + 47 + if (partial) { 48 + int p = SHA1_BLOCK_SIZE - partial; 49 + 50 + memcpy(sctx->buffer + partial, data, p); 51 + data += p; 52 + len -= p; 53 + } 54 + 55 + blocks = len / SHA1_BLOCK_SIZE; 56 + len %= SHA1_BLOCK_SIZE; 57 + 58 + kernel_neon_begin_partial(16); 59 + sha1_ce_transform(blocks, data, sctx->state, 60 + partial ? sctx->buffer : NULL, 0); 61 + kernel_neon_end(); 62 + 63 + data += blocks * SHA1_BLOCK_SIZE; 64 + partial = 0; 65 + } 66 + if (len) 67 + memcpy(sctx->buffer + partial, data, len); 68 + return 0; 69 + } 70 + 71 + static int sha1_final(struct shash_desc *desc, u8 *out) 72 + { 73 + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; 74 + 75 + struct sha1_state *sctx = shash_desc_ctx(desc); 76 + __be64 bits = cpu_to_be64(sctx->count << 3); 77 + __be32 *dst = (__be32 *)out; 78 + int i; 79 + 80 + u32 padlen = SHA1_BLOCK_SIZE 81 + - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); 82 + 83 + sha1_update(desc, padding, padlen); 84 + sha1_update(desc, (const u8 *)&bits, sizeof(bits)); 85 + 86 + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) 87 + put_unaligned_be32(sctx->state[i], dst++); 88 + 89 + *sctx = (struct sha1_state){}; 90 + return 0; 91 + } 92 + 93 + static int sha1_finup(struct shash_desc *desc, const u8 *data, 94 + unsigned int len, u8 *out) 95 + { 96 + struct sha1_state *sctx = shash_desc_ctx(desc); 97 + __be32 *dst = (__be32 *)out; 98 + int blocks; 99 + int i; 100 + 101 + if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { 102 + sha1_update(desc, data, len); 103 + return sha1_final(desc, out); 104 + } 105 + 106 + /* 107 + * Use a fast path if the input is a multiple of 64 bytes. 
In 108 + * this case, there is no need to copy data around, and we can 109 + * perform the entire digest calculation in a single invocation 110 + * of sha1_ce_transform() 111 + */ 112 + blocks = len / SHA1_BLOCK_SIZE; 113 + 114 + kernel_neon_begin_partial(16); 115 + sha1_ce_transform(blocks, data, sctx->state, NULL, len); 116 + kernel_neon_end(); 117 + 118 + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) 119 + put_unaligned_be32(sctx->state[i], dst++); 120 + 121 + *sctx = (struct sha1_state){}; 122 + return 0; 123 + } 124 + 125 + static int sha1_export(struct shash_desc *desc, void *out) 126 + { 127 + struct sha1_state *sctx = shash_desc_ctx(desc); 128 + struct sha1_state *dst = out; 129 + 130 + *dst = *sctx; 131 + return 0; 132 + } 133 + 134 + static int sha1_import(struct shash_desc *desc, const void *in) 135 + { 136 + struct sha1_state *sctx = shash_desc_ctx(desc); 137 + struct sha1_state const *src = in; 138 + 139 + *sctx = *src; 140 + return 0; 141 + } 142 + 143 + static struct shash_alg alg = { 144 + .init = sha1_init, 145 + .update = sha1_update, 146 + .final = sha1_final, 147 + .finup = sha1_finup, 148 + .export = sha1_export, 149 + .import = sha1_import, 150 + .descsize = sizeof(struct sha1_state), 151 + .digestsize = SHA1_DIGEST_SIZE, 152 + .statesize = sizeof(struct sha1_state), 153 + .base = { 154 + .cra_name = "sha1", 155 + .cra_driver_name = "sha1-ce", 156 + .cra_priority = 200, 157 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 158 + .cra_blocksize = SHA1_BLOCK_SIZE, 159 + .cra_module = THIS_MODULE, 160 + } 161 + }; 162 + 163 + static int __init sha1_ce_mod_init(void) 164 + { 165 + return crypto_register_shash(&alg); 166 + } 167 + 168 + static void __exit sha1_ce_mod_fini(void) 169 + { 170 + crypto_unregister_shash(&alg); 171 + } 172 + 173 + module_cpu_feature_match(SHA1, sha1_ce_mod_init); 174 + module_exit(sha1_ce_mod_fini);
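The padlen computation in sha1_final() is the usual MD-style padding: a 0x80 byte followed by zeroes so that, once the 8-byte big-endian bit count is appended, the total length is a multiple of SHA1_BLOCK_SIZE. As a worked example, an empty message (count = 0) gives padlen = 64 - ((0 + 8) % 64) = 56, i.e. exactly one 64-byte final block containing the padding and the length.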
+156
arch/arm64/crypto/sha2-ce-core.S
··· 1 + /* 2 + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <linux/linkage.h> 12 + #include <asm/assembler.h> 13 + 14 + .text 15 + .arch armv8-a+crypto 16 + 17 + dga .req q20 18 + dgav .req v20 19 + dgb .req q21 20 + dgbv .req v21 21 + 22 + t0 .req v22 23 + t1 .req v23 24 + 25 + dg0q .req q24 26 + dg0v .req v24 27 + dg1q .req q25 28 + dg1v .req v25 29 + dg2q .req q26 30 + dg2v .req v26 31 + 32 + .macro add_only, ev, rc, s0 33 + mov dg2v.16b, dg0v.16b 34 + .ifeq \ev 35 + add t1.4s, v\s0\().4s, \rc\().4s 36 + sha256h dg0q, dg1q, t0.4s 37 + sha256h2 dg1q, dg2q, t0.4s 38 + .else 39 + .ifnb \s0 40 + add t0.4s, v\s0\().4s, \rc\().4s 41 + .endif 42 + sha256h dg0q, dg1q, t1.4s 43 + sha256h2 dg1q, dg2q, t1.4s 44 + .endif 45 + .endm 46 + 47 + .macro add_update, ev, rc, s0, s1, s2, s3 48 + sha256su0 v\s0\().4s, v\s1\().4s 49 + add_only \ev, \rc, \s1 50 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s 51 + .endm 52 + 53 + /* 54 + * The SHA-256 round constants 55 + */ 56 + .align 4 57 + .Lsha2_rcon: 58 + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 59 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 60 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 61 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 62 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc 63 + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da 64 + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 65 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 66 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 67 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 68 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 69 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 70 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 71 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 72 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 73 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 74 + 75 + /* 76 + * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, 77 + * u8 *head, long bytes) 78 + */ 79 + ENTRY(sha2_ce_transform) 80 + /* load round constants */ 81 + adr x8, .Lsha2_rcon 82 + ld1 { v0.4s- v3.4s}, [x8], #64 83 + ld1 { v4.4s- v7.4s}, [x8], #64 84 + ld1 { v8.4s-v11.4s}, [x8], #64 85 + ld1 {v12.4s-v15.4s}, [x8] 86 + 87 + /* load state */ 88 + ldp dga, dgb, [x2] 89 + 90 + /* load partial input (if supplied) */ 91 + cbz x3, 0f 92 + ld1 {v16.4s-v19.4s}, [x3] 93 + b 1f 94 + 95 + /* load input */ 96 + 0: ld1 {v16.4s-v19.4s}, [x1], #64 97 + sub w0, w0, #1 98 + 99 + 1: 100 + CPU_LE( rev32 v16.16b, v16.16b ) 101 + CPU_LE( rev32 v17.16b, v17.16b ) 102 + CPU_LE( rev32 v18.16b, v18.16b ) 103 + CPU_LE( rev32 v19.16b, v19.16b ) 104 + 105 + 2: add t0.4s, v16.4s, v0.4s 106 + mov dg0v.16b, dgav.16b 107 + mov dg1v.16b, dgbv.16b 108 + 109 + add_update 0, v1, 16, 17, 18, 19 110 + add_update 1, v2, 17, 18, 19, 16 111 + add_update 0, v3, 18, 19, 16, 17 112 + add_update 1, v4, 19, 16, 17, 18 113 + 114 + add_update 0, v5, 16, 17, 18, 19 115 + add_update 1, v6, 17, 18, 19, 16 116 + add_update 0, v7, 18, 19, 16, 17 117 + add_update 1, v8, 19, 16, 17, 18 118 + 119 + add_update 0, v9, 16, 17, 18, 19 120 + add_update 1, v10, 17, 18, 19, 16 121 + add_update 0, v11, 18, 19, 
16, 17 122 + add_update 1, v12, 19, 16, 17, 18 123 + 124 + add_only 0, v13, 17 125 + add_only 1, v14, 18 126 + add_only 0, v15, 19 127 + add_only 1 128 + 129 + /* update state */ 130 + add dgav.4s, dgav.4s, dg0v.4s 131 + add dgbv.4s, dgbv.4s, dg1v.4s 132 + 133 + /* handled all input blocks? */ 134 + cbnz w0, 0b 135 + 136 + /* 137 + * Final block: add padding and total bit count. 138 + * Skip if we have no total byte count in x4. In that case, the input 139 + * size was not a round multiple of the block size, and the padding is 140 + * handled by the C code. 141 + */ 142 + cbz x4, 3f 143 + movi v17.2d, #0 144 + mov x8, #0x80000000 145 + movi v18.2d, #0 146 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) 147 + fmov d16, x8 148 + mov x4, #0 149 + mov v19.d[0], xzr 150 + mov v19.d[1], x7 151 + b 2b 152 + 153 + /* store new state */ 154 + 3: stp dga, dgb, [x2] 155 + ret 156 + ENDPROC(sha2_ce_transform)
+255
arch/arm64/crypto/sha2-ce-glue.c
··· 1 + /* 2 + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions 3 + * 4 + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + */ 10 + 11 + #include <asm/neon.h> 12 + #include <asm/unaligned.h> 13 + #include <crypto/internal/hash.h> 14 + #include <crypto/sha.h> 15 + #include <linux/cpufeature.h> 16 + #include <linux/crypto.h> 17 + #include <linux/module.h> 18 + 19 + MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); 20 + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); 21 + MODULE_LICENSE("GPL v2"); 22 + 23 + asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, 24 + u8 *head, long bytes); 25 + 26 + static int sha224_init(struct shash_desc *desc) 27 + { 28 + struct sha256_state *sctx = shash_desc_ctx(desc); 29 + 30 + *sctx = (struct sha256_state){ 31 + .state = { 32 + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, 33 + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, 34 + } 35 + }; 36 + return 0; 37 + } 38 + 39 + static int sha256_init(struct shash_desc *desc) 40 + { 41 + struct sha256_state *sctx = shash_desc_ctx(desc); 42 + 43 + *sctx = (struct sha256_state){ 44 + .state = { 45 + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, 46 + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, 47 + } 48 + }; 49 + return 0; 50 + } 51 + 52 + static int sha2_update(struct shash_desc *desc, const u8 *data, 53 + unsigned int len) 54 + { 55 + struct sha256_state *sctx = shash_desc_ctx(desc); 56 + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; 57 + 58 + sctx->count += len; 59 + 60 + if ((partial + len) >= SHA256_BLOCK_SIZE) { 61 + int blocks; 62 + 63 + if (partial) { 64 + int p = SHA256_BLOCK_SIZE - partial; 65 + 66 + memcpy(sctx->buf + partial, data, p); 67 + data += p; 68 + len -= p; 69 + } 70 + 71 + blocks = len / SHA256_BLOCK_SIZE; 72 + len %= SHA256_BLOCK_SIZE; 73 + 74 + kernel_neon_begin_partial(28); 75 + sha2_ce_transform(blocks, data, sctx->state, 76 + partial ? 
sctx->buf : NULL, 0); 77 + kernel_neon_end(); 78 + 79 + data += blocks * SHA256_BLOCK_SIZE; 80 + partial = 0; 81 + } 82 + if (len) 83 + memcpy(sctx->buf + partial, data, len); 84 + return 0; 85 + } 86 + 87 + static void sha2_final(struct shash_desc *desc) 88 + { 89 + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; 90 + 91 + struct sha256_state *sctx = shash_desc_ctx(desc); 92 + __be64 bits = cpu_to_be64(sctx->count << 3); 93 + u32 padlen = SHA256_BLOCK_SIZE 94 + - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); 95 + 96 + sha2_update(desc, padding, padlen); 97 + sha2_update(desc, (const u8 *)&bits, sizeof(bits)); 98 + } 99 + 100 + static int sha224_final(struct shash_desc *desc, u8 *out) 101 + { 102 + struct sha256_state *sctx = shash_desc_ctx(desc); 103 + __be32 *dst = (__be32 *)out; 104 + int i; 105 + 106 + sha2_final(desc); 107 + 108 + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) 109 + put_unaligned_be32(sctx->state[i], dst++); 110 + 111 + *sctx = (struct sha256_state){}; 112 + return 0; 113 + } 114 + 115 + static int sha256_final(struct shash_desc *desc, u8 *out) 116 + { 117 + struct sha256_state *sctx = shash_desc_ctx(desc); 118 + __be32 *dst = (__be32 *)out; 119 + int i; 120 + 121 + sha2_final(desc); 122 + 123 + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) 124 + put_unaligned_be32(sctx->state[i], dst++); 125 + 126 + *sctx = (struct sha256_state){}; 127 + return 0; 128 + } 129 + 130 + static void sha2_finup(struct shash_desc *desc, const u8 *data, 131 + unsigned int len) 132 + { 133 + struct sha256_state *sctx = shash_desc_ctx(desc); 134 + int blocks; 135 + 136 + if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { 137 + sha2_update(desc, data, len); 138 + sha2_final(desc); 139 + return; 140 + } 141 + 142 + /* 143 + * Use a fast path if the input is a multiple of 64 bytes. 
In 144 + * this case, there is no need to copy data around, and we can 145 + * perform the entire digest calculation in a single invocation 146 + * of sha2_ce_transform() 147 + */ 148 + blocks = len / SHA256_BLOCK_SIZE; 149 + 150 + kernel_neon_begin_partial(28); 151 + sha2_ce_transform(blocks, data, sctx->state, NULL, len); 152 + kernel_neon_end(); 153 + data += blocks * SHA256_BLOCK_SIZE; 154 + } 155 + 156 + static int sha224_finup(struct shash_desc *desc, const u8 *data, 157 + unsigned int len, u8 *out) 158 + { 159 + struct sha256_state *sctx = shash_desc_ctx(desc); 160 + __be32 *dst = (__be32 *)out; 161 + int i; 162 + 163 + sha2_finup(desc, data, len); 164 + 165 + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) 166 + put_unaligned_be32(sctx->state[i], dst++); 167 + 168 + *sctx = (struct sha256_state){}; 169 + return 0; 170 + } 171 + 172 + static int sha256_finup(struct shash_desc *desc, const u8 *data, 173 + unsigned int len, u8 *out) 174 + { 175 + struct sha256_state *sctx = shash_desc_ctx(desc); 176 + __be32 *dst = (__be32 *)out; 177 + int i; 178 + 179 + sha2_finup(desc, data, len); 180 + 181 + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) 182 + put_unaligned_be32(sctx->state[i], dst++); 183 + 184 + *sctx = (struct sha256_state){}; 185 + return 0; 186 + } 187 + 188 + static int sha2_export(struct shash_desc *desc, void *out) 189 + { 190 + struct sha256_state *sctx = shash_desc_ctx(desc); 191 + struct sha256_state *dst = out; 192 + 193 + *dst = *sctx; 194 + return 0; 195 + } 196 + 197 + static int sha2_import(struct shash_desc *desc, const void *in) 198 + { 199 + struct sha256_state *sctx = shash_desc_ctx(desc); 200 + struct sha256_state const *src = in; 201 + 202 + *sctx = *src; 203 + return 0; 204 + } 205 + 206 + static struct shash_alg algs[] = { { 207 + .init = sha224_init, 208 + .update = sha2_update, 209 + .final = sha224_final, 210 + .finup = sha224_finup, 211 + .export = sha2_export, 212 + .import = sha2_import, 213 + .descsize = sizeof(struct sha256_state), 214 + .digestsize = SHA224_DIGEST_SIZE, 215 + .statesize = sizeof(struct sha256_state), 216 + .base = { 217 + .cra_name = "sha224", 218 + .cra_driver_name = "sha224-ce", 219 + .cra_priority = 200, 220 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 221 + .cra_blocksize = SHA256_BLOCK_SIZE, 222 + .cra_module = THIS_MODULE, 223 + } 224 + }, { 225 + .init = sha256_init, 226 + .update = sha2_update, 227 + .final = sha256_final, 228 + .finup = sha256_finup, 229 + .export = sha2_export, 230 + .import = sha2_import, 231 + .descsize = sizeof(struct sha256_state), 232 + .digestsize = SHA256_DIGEST_SIZE, 233 + .statesize = sizeof(struct sha256_state), 234 + .base = { 235 + .cra_name = "sha256", 236 + .cra_driver_name = "sha256-ce", 237 + .cra_priority = 200, 238 + .cra_flags = CRYPTO_ALG_TYPE_SHASH, 239 + .cra_blocksize = SHA256_BLOCK_SIZE, 240 + .cra_module = THIS_MODULE, 241 + } 242 + } }; 243 + 244 + static int __init sha2_ce_mod_init(void) 245 + { 246 + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); 247 + } 248 + 249 + static void __exit sha2_ce_mod_fini(void) 250 + { 251 + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); 252 + } 253 + 254 + module_cpu_feature_match(SHA2, sha2_ce_mod_init); 255 + module_exit(sha2_ce_mod_fini);
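Once registered, these transforms are reached through the normal synchronous hash API; nothing arm64-specific is exposed to callers. A minimal sketch of kernel code computing a SHA-256 digest through it (illustrative only; the function name is made up and the descriptor allocation shown is just one valid pattern):

	#include <crypto/hash.h>
	#include <linux/slab.h>

	static int sha256_digest_example(const u8 *data, unsigned int len, u8 *out)
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int err;

		tfm = crypto_alloc_shash("sha256", 0, 0);	/* resolves to sha256-ce when usable */
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;

		err = crypto_shash_digest(desc, data, len, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return err;
	}

The cra_priority of 200 is what lets the Crypto Extensions implementation be preferred over the generic C "sha256" when the CPU advertises the SHA2 feature.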
+1
arch/arm64/include/asm/Kbuild
··· 40 40 generic-y += sembuf.h 41 41 generic-y += serial.h 42 42 generic-y += shmbuf.h 43 + generic-y += simd.h 43 44 generic-y += sizes.h 44 45 generic-y += socket.h 45 46 generic-y += sockios.h
+16 -7
arch/arm64/include/asm/assembler.h
··· 21 21 #endif 22 22 23 23 #include <asm/ptrace.h> 24 + #include <asm/thread_info.h> 24 25 25 26 /* 26 27 * Stack pushing/popping (register pairs only). Equivalent to store decrement ··· 69 68 msr daifclr, #8 70 69 .endm 71 70 72 - .macro disable_step, tmp 71 + .macro disable_step_tsk, flgs, tmp 72 + tbz \flgs, #TIF_SINGLESTEP, 9990f 73 73 mrs \tmp, mdscr_el1 74 74 bic \tmp, \tmp, #1 75 75 msr mdscr_el1, \tmp 76 + isb // Synchronise with enable_dbg 77 + 9990: 76 78 .endm 77 79 78 - .macro enable_step, tmp 80 + .macro enable_step_tsk, flgs, tmp 81 + tbz \flgs, #TIF_SINGLESTEP, 9990f 82 + disable_dbg 79 83 mrs \tmp, mdscr_el1 80 84 orr \tmp, \tmp, #1 81 85 msr mdscr_el1, \tmp 86 + 9990: 82 87 .endm 83 88 84 - .macro enable_dbg_if_not_stepping, tmp 85 - mrs \tmp, mdscr_el1 86 - tbnz \tmp, #0, 9990f 87 - enable_dbg 88 - 9990: 89 + /* 90 + * Enable both debug exceptions and interrupts. This is likely to be 91 + * faster than two daifclr operations, since writes to this register 92 + * are self-synchronising. 93 + */ 94 + .macro enable_dbg_and_irq 95 + msr daifclr, #(8 | 2) 89 96 .endm 90 97 91 98 /*
+1 -1
arch/arm64/include/asm/atomic.h
··· 157 157 */ 158 158 #define ATOMIC64_INIT(i) { (i) } 159 159 160 - #define atomic64_read(v) (*(volatile long long *)&(v)->counter) 160 + #define atomic64_read(v) (*(volatile long *)&(v)->counter) 161 161 #define atomic64_set(v,i) (((v)->counter) = (i)) 162 162 163 163 static inline void atomic64_add(u64 i, atomic64_t *v)
+10 -10
arch/arm64/include/asm/barrier.h
··· 25 25 #define wfi() asm volatile("wfi" : : : "memory") 26 26 27 27 #define isb() asm volatile("isb" : : : "memory") 28 - #define dmb(opt) asm volatile("dmb sy" : : : "memory") 29 - #define dsb(opt) asm volatile("dsb sy" : : : "memory") 28 + #define dmb(opt) asm volatile("dmb " #opt : : : "memory") 29 + #define dsb(opt) asm volatile("dsb " #opt : : : "memory") 30 30 31 - #define mb() dsb() 32 - #define rmb() asm volatile("dsb ld" : : : "memory") 33 - #define wmb() asm volatile("dsb st" : : : "memory") 31 + #define mb() dsb(sy) 32 + #define rmb() dsb(ld) 33 + #define wmb() dsb(st) 34 34 35 35 #ifndef CONFIG_SMP 36 36 #define smp_mb() barrier() ··· 40 40 #define smp_store_release(p, v) \ 41 41 do { \ 42 42 compiletime_assert_atomic_type(*p); \ 43 - smp_mb(); \ 43 + barrier(); \ 44 44 ACCESS_ONCE(*p) = (v); \ 45 45 } while (0) 46 46 ··· 48 48 ({ \ 49 49 typeof(*p) ___p1 = ACCESS_ONCE(*p); \ 50 50 compiletime_assert_atomic_type(*p); \ 51 - smp_mb(); \ 51 + barrier(); \ 52 52 ___p1; \ 53 53 }) 54 54 55 55 #else 56 56 57 - #define smp_mb() asm volatile("dmb ish" : : : "memory") 58 - #define smp_rmb() asm volatile("dmb ishld" : : : "memory") 59 - #define smp_wmb() asm volatile("dmb ishst" : : : "memory") 57 + #define smp_mb() dmb(ish) 58 + #define smp_rmb() dmb(ishld) 59 + #define smp_wmb() dmb(ishst) 60 60 61 61 #define smp_store_release(p, v) \ 62 62 do { \
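With the stringified option parameter, dmb()/dsb() now emit the exact barrier variant requested instead of always falling back to the full-system "sy" form; for example smp_mb(), now defined as dmb(ish), expands to

	asm volatile("dmb ish" : : : "memory");

so the SMP barriers stay inner-shareable while mb()/rmb()/wmb() keep their stronger dsb-based forms.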
+12 -1
arch/arm64/include/asm/cache.h
··· 16 16 #ifndef __ASM_CACHE_H 17 17 #define __ASM_CACHE_H 18 18 19 + #include <asm/cachetype.h> 20 + 19 21 #define L1_CACHE_SHIFT 6 20 22 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) 21 23 ··· 29 27 * the CPU. 30 28 */ 31 29 #define ARCH_DMA_MINALIGN L1_CACHE_BYTES 32 - #define ARCH_SLAB_MINALIGN 8 30 + 31 + #ifndef __ASSEMBLY__ 32 + 33 + static inline int cache_line_size(void) 34 + { 35 + u32 cwg = cache_type_cwg(); 36 + return cwg ? 4 << cwg : L1_CACHE_BYTES; 37 + } 38 + 39 + #endif /* __ASSEMBLY__ */ 33 40 34 41 #endif
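cache_line_size() derives the runtime cache line size from the Cache Writeback Granule (CWG) field of CTR_EL0: CWG encodes log2 of the granule in 4-byte words, so the size in bytes is 4 << cwg. For example, a CWG value of 4 gives 4 << 4 = 64 bytes, matching the compile-time L1_CACHE_BYTES; a value of 0 means the granule is not reported and the code falls back to L1_CACHE_BYTES.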
+2 -2
arch/arm64/include/asm/cacheflush.h
··· 123 123 static inline void __flush_icache_all(void) 124 124 { 125 125 asm("ic ialluis"); 126 - dsb(); 126 + dsb(ish); 127 127 } 128 128 129 129 #define flush_dcache_mmap_lock(mapping) \ ··· 150 150 * set_pte_at() called from vmap_pte_range() does not 151 151 * have a DSB after cleaning the cache line. 152 152 */ 153 - dsb(); 153 + dsb(ish); 154 154 } 155 155 156 156 static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
+11
arch/arm64/include/asm/cachetype.h
··· 20 20 21 21 #define CTR_L1IP_SHIFT 14 22 22 #define CTR_L1IP_MASK 3 23 + #define CTR_CWG_SHIFT 24 24 + #define CTR_CWG_MASK 15 23 25 24 26 #define ICACHE_POLICY_RESERVED 0 25 27 #define ICACHE_POLICY_AIVIVT 1 26 28 #define ICACHE_POLICY_VIPT 2 27 29 #define ICACHE_POLICY_PIPT 3 30 + 31 + #ifndef __ASSEMBLY__ 28 32 29 33 static inline u32 icache_policy(void) 30 34 { ··· 48 44 { 49 45 return icache_policy() == ICACHE_POLICY_AIVIVT; 50 46 } 47 + 48 + static inline u32 cache_type_cwg(void) 49 + { 50 + return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; 51 + } 52 + 53 + #endif /* __ASSEMBLY__ */ 51 54 52 55 #endif /* __ASM_CACHETYPE_H */
+6 -1
arch/arm64/include/asm/cmpxchg.h
··· 72 72 } 73 73 74 74 #define xchg(ptr,x) \ 75 - ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) 75 + ({ \ 76 + __typeof__(*(ptr)) __ret; \ 77 + __ret = (__typeof__(*(ptr))) \ 78 + __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \ 79 + __ret; \ 80 + }) 76 81 77 82 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 78 83 unsigned long new, int size)
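The statement-expression form of xchg() evaluates to the same value as before; the apparent motivation is to let callers discard the result without triggering a "value computed is not used" warning, which the old single cast-expression form could produce, e.g.:

	old = xchg(&p->state, NEW_STATE);	/* still returns the previous value */
	xchg(&p->flags, 0);			/* result can now be ignored cleanly */

(p, state and flags above are placeholders, not kernel fields.)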
-5
arch/arm64/include/asm/compat.h
··· 305 305 306 306 #else /* !CONFIG_COMPAT */ 307 307 308 - static inline int is_compat_task(void) 309 - { 310 - return 0; 311 - } 312 - 313 308 static inline int is_compat_thread(struct thread_info *thread) 314 309 { 315 310 return 0;
+4 -2
arch/arm64/include/asm/esr.h
··· 18 18 #ifndef __ASM_ESR_H 19 19 #define __ASM_ESR_H 20 20 21 - #define ESR_EL1_EC_SHIFT (26) 22 - #define ESR_EL1_IL (1U << 25) 21 + #define ESR_EL1_WRITE (1 << 6) 22 + #define ESR_EL1_CM (1 << 8) 23 + #define ESR_EL1_IL (1 << 25) 23 24 25 + #define ESR_EL1_EC_SHIFT (26) 24 26 #define ESR_EL1_EC_UNKNOWN (0x00) 25 27 #define ESR_EL1_EC_WFI (0x01) 26 28 #define ESR_EL1_EC_CP15_32 (0x03)
+23
arch/arm64/include/asm/fpsimd.h
··· 37 37 u32 fpcr; 38 38 }; 39 39 }; 40 + /* the id of the last cpu to have restored this state */ 41 + unsigned int cpu; 40 42 }; 43 + 44 + /* 45 + * Struct for stacking the bottom 'n' FP/SIMD registers. 46 + */ 47 + struct fpsimd_partial_state { 48 + u32 fpsr; 49 + u32 fpcr; 50 + u32 num_regs; 51 + __uint128_t vregs[32]; 52 + }; 53 + 41 54 42 55 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 43 56 /* Masks for extracting the FPSR and FPCR from the FPSCR */ ··· 70 57 71 58 extern void fpsimd_thread_switch(struct task_struct *next); 72 59 extern void fpsimd_flush_thread(void); 60 + 61 + extern void fpsimd_preserve_current_state(void); 62 + extern void fpsimd_restore_current_state(void); 63 + extern void fpsimd_update_current_state(struct fpsimd_state *state); 64 + 65 + extern void fpsimd_flush_task_state(struct task_struct *target); 66 + 67 + extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, 68 + u32 num_regs); 69 + extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); 73 70 74 71 #endif 75 72
+35
arch/arm64/include/asm/fpsimdmacros.h
··· 62 62 ldr w\tmpnr, [\state, #16 * 2 + 4] 63 63 msr fpcr, x\tmpnr 64 64 .endm 65 + 66 + .altmacro 67 + .macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 68 + mrs x\tmpnr1, fpsr 69 + str w\numnr, [\state, #8] 70 + mrs x\tmpnr2, fpcr 71 + stp w\tmpnr1, w\tmpnr2, [\state] 72 + adr x\tmpnr1, 0f 73 + add \state, \state, x\numnr, lsl #4 74 + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 75 + br x\tmpnr1 76 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 77 + .irp qb, %(qa + 1) 78 + stp q\qa, q\qb, [\state, # -16 * \qa - 16] 79 + .endr 80 + .endr 81 + 0: 82 + .endm 83 + 84 + .macro fpsimd_restore_partial state, tmpnr1, tmpnr2 85 + ldp w\tmpnr1, w\tmpnr2, [\state] 86 + msr fpsr, x\tmpnr1 87 + msr fpcr, x\tmpnr2 88 + adr x\tmpnr1, 0f 89 + ldr w\tmpnr2, [\state, #8] 90 + add \state, \state, x\tmpnr2, lsl #4 91 + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 92 + br x\tmpnr1 93 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 94 + .irp qb, %(qa + 1) 95 + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] 96 + .endr 97 + .endr 98 + 0: 99 + .endm
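fpsimd_save_partial works by taking the address of the label after the unrolled stp sequence and branching back into it by num_regs * 2 bytes (each stp is one 4-byte instruction covering two registers), so only the bottom n registers are actually stored. The same "jump into an unrolled loop" idea can be sketched in C with a fall-through switch, Duff's-device style; this illustrates the control flow only and is not kernel code:

    #include <stdio.h>

    /* Store only the bottom n (even) entries of an 8-entry register file. */
    static void save_partial(const int regs[8], int out[8], int n)
    {
        switch (n) {            /* deliberate fall-through */
        case 8: out[7] = regs[7]; out[6] = regs[6];     /* fall through */
        case 6: out[5] = regs[5]; out[4] = regs[4];     /* fall through */
        case 4: out[3] = regs[3]; out[2] = regs[2];     /* fall through */
        case 2: out[1] = regs[1]; out[0] = regs[0];
        }
    }

    int main(void)
    {
        int regs[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        int out[8] = { 0 };
        int i;

        save_partial(regs, out, 4);     /* only entries 0..3 are copied */
        for (i = 0; i < 8; i++)
            printf("%d ", out[i]);      /* 10 11 12 13 0 0 0 0 */
        printf("\n");
        return 0;
    }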
+59
arch/arm64/include/asm/ftrace.h
··· 1 + /* 2 + * arch/arm64/include/asm/ftrace.h 3 + * 4 + * Copyright (C) 2013 Linaro Limited 5 + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + #ifndef __ASM_FTRACE_H 12 + #define __ASM_FTRACE_H 13 + 14 + #include <asm/insn.h> 15 + 16 + #define MCOUNT_ADDR ((unsigned long)_mcount) 17 + #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE 18 + 19 + #ifndef __ASSEMBLY__ 20 + #include <linux/compat.h> 21 + 22 + extern void _mcount(unsigned long); 23 + extern void *return_address(unsigned int); 24 + 25 + struct dyn_arch_ftrace { 26 + /* No extra data needed for arm64 */ 27 + }; 28 + 29 + extern unsigned long ftrace_graph_call; 30 + 31 + static inline unsigned long ftrace_call_adjust(unsigned long addr) 32 + { 33 + /* 34 + * addr is the address of the mcount call instruction. 35 + * recordmcount does the necessary offset calculation. 36 + */ 37 + return addr; 38 + } 39 + 40 + #define ftrace_return_address(n) return_address(n) 41 + 42 + /* 43 + * Because AArch32 mode does not share the same syscall table with AArch64, 44 + * tracing compat syscalls may result in reporting bogus syscalls or even 45 + * hang-up, so just do not trace them. 46 + * See kernel/trace/trace_syscalls.c 47 + * 48 + * x86 code says: 49 + * If the user realy wants these, then they should use the 50 + * raw syscall tracepoints with filtering. 51 + */ 52 + #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 53 + static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) 54 + { 55 + return is_compat_task(); 56 + } 57 + #endif /* ifndef __ASSEMBLY__ */ 58 + 59 + #endif /* __ASM_FTRACE_H */
+1 -1
arch/arm64/include/asm/hardirq.h
··· 20 20 #include <linux/threads.h> 21 21 #include <asm/irq.h> 22 22 23 - #define NR_IPI 5 23 + #define NR_IPI 6 24 24 25 25 typedef struct { 26 26 unsigned int __softirq_pending;
+2
arch/arm64/include/asm/insn.h
··· 21 21 /* A64 instructions are always 32 bits. */ 22 22 #define AARCH64_INSN_SIZE 4 23 23 24 + #ifndef __ASSEMBLY__ 24 25 /* 25 26 * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a 26 27 * Section C3.1 "A64 instruction index by encoding": ··· 105 104 int aarch64_insn_patch_text_nosync(void *addr, u32 insn); 106 105 int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); 107 106 int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); 107 + #endif /* __ASSEMBLY__ */ 108 108 109 109 #endif /* __ASM_INSN_H */
-8
arch/arm64/include/asm/io.h
··· 230 230 extern void __iounmap(volatile void __iomem *addr); 231 231 extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); 232 232 233 - #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_DIRTY) 234 - #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) 235 - #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL_NC)) 236 - #define PROT_NORMAL (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) 237 - 238 233 #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) 239 234 #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) 240 235 #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) 241 236 #define iounmap __iounmap 242 - 243 - #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) 244 - #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PTE_PXN | PTE_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) 245 237 246 238 #define ARCH_HAS_IOREMAP_WC 247 239 #include <asm-generic/iomap.h>
+5 -1
arch/arm64/include/asm/neon.h
··· 8 8 * published by the Free Software Foundation. 9 9 */ 10 10 11 + #include <linux/types.h> 12 + 11 13 #define cpu_has_neon() (1) 12 14 13 - void kernel_neon_begin(void); 15 + #define kernel_neon_begin() kernel_neon_begin_partial(32) 16 + 17 + void kernel_neon_begin_partial(u32 num_regs); 14 18 void kernel_neon_end(void);
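kernel_neon_begin() now expands to kernel_neon_begin_partial(32), and a caller that clobbers only a few vector registers can pass a smaller count so that, in interrupt context, only those registers are stacked. A hedged sketch of the call pattern a hypothetical accelerated helper might use; the NEON body is elided and this is not a buildable driver:

    #include <asm/neon.h>

    /* Hypothetical helper; assume the NEON body only uses q0-q3. */
    void xor_block_neon(void *dst, const void *src, unsigned int len)
    {
        kernel_neon_begin_partial(4);   /* preserve at most 4 vector registers */
        /* ... NEON loads, eor and stores on q0-q3 would go here ... */
        kernel_neon_end();
    }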
+2
arch/arm64/include/asm/pgtable-hwdef.h
··· 29 29 */ 30 30 31 31 #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) 32 + #define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0) 33 + #define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0) 32 34 33 35 /* 34 36 * Level 2 descriptor (PMD).
+53 -50
arch/arm64/include/asm/pgtable.h
··· 52 52 #endif 53 53 #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) 54 54 55 - /* 56 - * The pgprot_* and protection_map entries will be fixed up at runtime to 57 - * include the cachable and bufferable bits based on memory policy, as well as 58 - * any architecture dependent bits like global/ASID and SMP shared mapping 59 - * bits. 60 - */ 61 - #define _PAGE_DEFAULT PTE_TYPE_PAGE | PTE_AF 55 + #ifdef CONFIG_SMP 56 + #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) 57 + #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) 58 + #else 59 + #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF) 60 + #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) 61 + #endif 62 62 63 - extern pgprot_t pgprot_default; 63 + #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) 64 + #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) 65 + #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) 64 66 65 - #define __pgprot_modify(prot,mask,bits) \ 66 - __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) 67 + #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) 68 + #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) 69 + #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) 67 70 68 - #define _MOD_PROT(p, b) __pgprot_modify(p, 0, b) 71 + #define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) 69 72 70 - #define PAGE_NONE __pgprot_modify(pgprot_default, PTE_TYPE_MASK, PTE_PROT_NONE | PTE_PXN | PTE_UXN) 71 - #define PAGE_SHARED _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) 72 - #define PAGE_SHARED_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) 73 - #define PAGE_COPY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 74 - #define PAGE_COPY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) 75 - #define PAGE_READONLY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 76 - #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) 77 - #define PAGE_KERNEL _MOD_PROT(pgprot_default, PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) 78 - #define PAGE_KERNEL_EXEC _MOD_PROT(pgprot_default, PTE_UXN | PTE_DIRTY | PTE_WRITE) 73 + #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) 74 + #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) 79 75 80 - #define PAGE_HYP _MOD_PROT(pgprot_default, PTE_HYP) 76 + #define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) 81 77 #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) 82 78 83 - #define PAGE_S2 __pgprot_modify(pgprot_default, PTE_S2_MEMATTR_MASK, PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) 79 + #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) 84 80 #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) 85 81 86 - #define __PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) 87 - #define __PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) 88 - #define __PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) 89 - #define __PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 90 - #define 
__PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 91 - #define __PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 92 - #define __PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 82 + #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) 83 + #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) 84 + #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) 85 + #define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 86 + #define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 87 + #define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) 88 + #define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) 93 89 94 - #endif /* __ASSEMBLY__ */ 90 + #define __P000 PAGE_NONE 91 + #define __P001 PAGE_READONLY 92 + #define __P010 PAGE_COPY 93 + #define __P011 PAGE_COPY 94 + #define __P100 PAGE_READONLY_EXEC 95 + #define __P101 PAGE_READONLY_EXEC 96 + #define __P110 PAGE_COPY_EXEC 97 + #define __P111 PAGE_COPY_EXEC 95 98 96 - #define __P000 __PAGE_NONE 97 - #define __P001 __PAGE_READONLY 98 - #define __P010 __PAGE_COPY 99 - #define __P011 __PAGE_COPY 100 - #define __P100 __PAGE_READONLY_EXEC 101 - #define __P101 __PAGE_READONLY_EXEC 102 - #define __P110 __PAGE_COPY_EXEC 103 - #define __P111 __PAGE_COPY_EXEC 99 + #define __S000 PAGE_NONE 100 + #define __S001 PAGE_READONLY 101 + #define __S010 PAGE_SHARED 102 + #define __S011 PAGE_SHARED 103 + #define __S100 PAGE_READONLY_EXEC 104 + #define __S101 PAGE_READONLY_EXEC 105 + #define __S110 PAGE_SHARED_EXEC 106 + #define __S111 PAGE_SHARED_EXEC 104 107 105 - #define __S000 __PAGE_NONE 106 - #define __S001 __PAGE_READONLY 107 - #define __S010 __PAGE_SHARED 108 - #define __S011 __PAGE_SHARED 109 - #define __S100 __PAGE_READONLY_EXEC 110 - #define __S101 __PAGE_READONLY_EXEC 111 - #define __S110 __PAGE_SHARED_EXEC 112 - #define __S111 __PAGE_SHARED_EXEC 113 - 114 - #ifndef __ASSEMBLY__ 115 108 /* 116 109 * ZERO_PAGE is a global shared page that is always zero: used 117 110 * for zero-mapped memory areas etc.. ··· 258 265 #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 259 266 260 267 #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) 268 + #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) 261 269 262 270 #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) 263 271 ··· 266 272 { 267 273 return 1; 268 274 } 275 + 276 + #define __pgprot_modify(prot,mask,bits) \ 277 + __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) 269 278 270 279 /* 271 280 * Mark the prot value as uncacheable and unbufferable. ··· 292 295 #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ 293 296 PMD_TYPE_SECT) 294 297 298 + #ifdef ARM64_64K_PAGES 299 + #define pud_sect(pud) (0) 300 + #else 301 + #define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ 302 + PUD_TYPE_SECT) 303 + #endif 295 304 296 305 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 297 306 { 298 307 *pmdp = pmd; 299 - dsb(); 308 + dsb(ishst); 300 309 } 301 310 302 311 static inline void pmd_clear(pmd_t *pmdp) ··· 332 329 static inline void set_pud(pud_t *pudp, pud_t pud) 333 330 { 334 331 *pudp = pud; 335 - dsb(); 332 + dsb(ishst); 336 333 } 337 334 338 335 static inline void pud_clear(pud_t *pudp)
+1
arch/arm64/include/asm/processor.h
··· 79 79 unsigned long tp_value; 80 80 struct fpsimd_state fpsimd_state; 81 81 unsigned long fault_address; /* fault info */ 82 + unsigned long fault_code; /* ESR_EL1 value */ 82 83 struct debug_info debug; /* debugging */ 83 84 }; 84 85
+5
arch/arm64/include/asm/ptrace.h
··· 135 135 #define user_stack_pointer(regs) \ 136 136 (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) 137 137 138 + static inline unsigned long regs_return_value(struct pt_regs *regs) 139 + { 140 + return regs->regs[0]; 141 + } 142 + 138 143 /* 139 144 * Are the current registers suitable for user mode? (used to maintain 140 145 * security in signal handlers)
-31
arch/arm64/include/asm/sigcontext.h
··· 1 - /* 2 - * Copyright (C) 2012 ARM Ltd. 3 - * 4 - * This program is free software; you can redistribute it and/or modify 5 - * it under the terms of the GNU General Public License version 2 as 6 - * published by the Free Software Foundation. 7 - * 8 - * This program is distributed in the hope that it will be useful, 9 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 - * GNU General Public License for more details. 12 - * 13 - * You should have received a copy of the GNU General Public License 14 - * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 - */ 16 - #ifndef __ASM_SIGCONTEXT_H 17 - #define __ASM_SIGCONTEXT_H 18 - 19 - #include <uapi/asm/sigcontext.h> 20 - 21 - /* 22 - * Auxiliary context saved in the sigcontext.__reserved array. Not exported to 23 - * user space as it will change with the addition of new context. User space 24 - * should check the magic/size information. 25 - */ 26 - struct aux_context { 27 - struct fpsimd_context fpsimd; 28 - /* additional context to be added before "end" */ 29 - struct _aarch64_ctx end; 30 - }; 31 - #endif
+15
arch/arm64/include/asm/string.h
··· 22 22 #define __HAVE_ARCH_STRCHR 23 23 extern char *strchr(const char *, int c); 24 24 25 + #define __HAVE_ARCH_STRCMP 26 + extern int strcmp(const char *, const char *); 27 + 28 + #define __HAVE_ARCH_STRNCMP 29 + extern int strncmp(const char *, const char *, __kernel_size_t); 30 + 31 + #define __HAVE_ARCH_STRLEN 32 + extern __kernel_size_t strlen(const char *); 33 + 34 + #define __HAVE_ARCH_STRNLEN 35 + extern __kernel_size_t strnlen(const char *, __kernel_size_t); 36 + 25 37 #define __HAVE_ARCH_MEMCPY 26 38 extern void *memcpy(void *, const void *, __kernel_size_t); 27 39 ··· 45 33 46 34 #define __HAVE_ARCH_MEMSET 47 35 extern void *memset(void *, int, __kernel_size_t); 36 + 37 + #define __HAVE_ARCH_MEMCMP 38 + extern int memcmp(const void *, const void *, size_t); 48 39 49 40 #endif
+1
arch/arm64/include/asm/syscall.h
··· 18 18 19 19 #include <linux/err.h> 20 20 21 + extern const void *sys_call_table[]; 21 22 22 23 static inline int syscall_get_nr(struct task_struct *task, 23 24 struct pt_regs *regs)
+16 -1
arch/arm64/include/asm/thread_info.h
··· 91 91 /* 92 92 * thread information flags: 93 93 * TIF_SYSCALL_TRACE - syscall trace active 94 + * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace 95 + * TIF_SYSCALL_AUDIT - syscall auditing 96 + * TIF_SECOMP - syscall secure computing 94 97 * TIF_SIGPENDING - signal pending 95 98 * TIF_NEED_RESCHED - rescheduling necessary 96 99 * TIF_NOTIFY_RESUME - callback before returning to user ··· 102 99 #define TIF_SIGPENDING 0 103 100 #define TIF_NEED_RESCHED 1 104 101 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 102 + #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ 105 103 #define TIF_SYSCALL_TRACE 8 104 + #define TIF_SYSCALL_AUDIT 9 105 + #define TIF_SYSCALL_TRACEPOINT 10 106 + #define TIF_SECCOMP 11 106 107 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ 107 108 #define TIF_FREEZE 19 108 109 #define TIF_RESTORE_SIGMASK 20 ··· 117 110 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 118 111 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 119 112 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 113 + #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) 114 + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 115 + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 116 + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 117 + #define _TIF_SECCOMP (1 << TIF_SECCOMP) 120 118 #define _TIF_32BIT (1 << TIF_32BIT) 121 119 122 120 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 123 - _TIF_NOTIFY_RESUME) 121 + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) 122 + 123 + #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ 124 + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) 124 125 125 126 #endif /* __KERNEL__ */ 126 127 #endif /* __ASM_THREAD_INFO_H */
+32 -12
arch/arm64/include/asm/tlbflush.h
··· 72 72 */ 73 73 static inline void flush_tlb_all(void) 74 74 { 75 - dsb(); 75 + dsb(ishst); 76 76 asm("tlbi vmalle1is"); 77 - dsb(); 77 + dsb(ish); 78 78 isb(); 79 79 } 80 80 ··· 82 82 { 83 83 unsigned long asid = (unsigned long)ASID(mm) << 48; 84 84 85 - dsb(); 85 + dsb(ishst); 86 86 asm("tlbi aside1is, %0" : : "r" (asid)); 87 - dsb(); 87 + dsb(ish); 88 88 } 89 89 90 90 static inline void flush_tlb_page(struct vm_area_struct *vma, ··· 93 93 unsigned long addr = uaddr >> 12 | 94 94 ((unsigned long)ASID(vma->vm_mm) << 48); 95 95 96 - dsb(); 96 + dsb(ishst); 97 97 asm("tlbi vae1is, %0" : : "r" (addr)); 98 - dsb(); 98 + dsb(ish); 99 99 } 100 100 101 - /* 102 - * Convert calls to our calling convention. 103 - */ 104 - #define flush_tlb_range(vma,start,end) __cpu_flush_user_tlb_range(start,end,vma) 105 - #define flush_tlb_kernel_range(s,e) __cpu_flush_kern_tlb_range(s,e) 101 + static inline void flush_tlb_range(struct vm_area_struct *vma, 102 + unsigned long start, unsigned long end) 103 + { 104 + unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48; 105 + unsigned long addr; 106 + start = asid | (start >> 12); 107 + end = asid | (end >> 12); 108 + 109 + dsb(ishst); 110 + for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) 111 + asm("tlbi vae1is, %0" : : "r"(addr)); 112 + dsb(ish); 113 + } 114 + 115 + static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) 116 + { 117 + unsigned long addr; 118 + start >>= 12; 119 + end >>= 12; 120 + 121 + dsb(ishst); 122 + for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) 123 + asm("tlbi vaae1is, %0" : : "r"(addr)); 124 + dsb(ish); 125 + } 106 126 107 127 /* 108 128 * On AArch64, the cache coherency is handled via the set_pte_at() function. ··· 134 114 * set_pte() does not have a DSB, so make sure that the page table 135 115 * write is visible. 136 116 */ 137 - dsb(); 117 + dsb(ishst); 138 118 } 139 119 140 120 #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
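The open-coded flush_tlb_range() builds each "tlbi vae1is" operand by packing the ASID into bits 63:48 and the page number (VA >> 12) into the low bits, then advances by 1 << (PAGE_SHIFT - 12) per page, since the operand is expressed in 4KB units. A user-space sketch of that operand encoding with made-up ASID and address values:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12   /* assume 4KB pages for this sketch */

    /* tlbi vae1is operand: ASID in bits 63:48, page number in the low bits */
    static uint64_t tlbi_operand(uint16_t asid, uint64_t va)
    {
        return ((uint64_t)asid << 48) | (va >> 12);
    }

    int main(void)
    {
        uint16_t asid = 0x2a;                   /* hypothetical ASID */
        uint64_t start = 0xffff9c200000ULL;     /* hypothetical VA range */
        uint64_t end = 0xffff9c203000ULL;       /* three 4KB pages */
        uint64_t addr;

        for (addr = tlbi_operand(asid, start); addr < tlbi_operand(asid, end);
             addr += 1 << (PAGE_SHIFT - 12))
            printf("tlbi vae1is, %#llx\n", (unsigned long long)addr);
        return 0;
    }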
-3
arch/arm64/include/asm/topology.h
··· 20 20 #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) 21 21 #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) 22 22 23 - #define mc_capable() (cpu_topology[0].cluster_id != -1) 24 - #define smt_capable() (cpu_topology[0].thread_id != -1) 25 - 26 23 void init_cpu_topology(void); 27 24 void store_cpu_topology(unsigned int cpuid); 28 25 const struct cpumask *cpu_coregroup_mask(int cpu);
+2
arch/arm64/include/asm/unistd.h
··· 29 29 #endif 30 30 #define __ARCH_WANT_SYS_CLONE 31 31 #include <uapi/asm/unistd.h> 32 + 33 + #define NR_syscalls (__NR_syscalls)
+7
arch/arm64/include/uapi/asm/sigcontext.h
··· 53 53 __uint128_t vregs[32]; 54 54 }; 55 55 56 + /* ESR_EL1 context */ 57 + #define ESR_MAGIC 0x45535201 58 + 59 + struct esr_context { 60 + struct _aarch64_ctx head; 61 + u64 esr; 62 + }; 56 63 57 64 #endif /* _UAPI__ASM_SIGCONTEXT_H */
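The esr_context record is appended to the __reserved area of the signal frame (after the fpsimd_context, and only when the signal was raised by a fault), so a SIGSEGV/SIGBUS handler can walk the _aarch64_ctx headers until it hits ESR_MAGIC or the zero terminator. A user-space sketch of that walk for arm64 Linux with glibc; the record layouts below mirror the uapi structures instead of including the kernel header, and the deliberate bad store exists only to trigger a fault:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <ucontext.h>

    /* Local copies of the uapi record layouts added by this patch. */
    struct aarch64_ctx { unsigned int magic; unsigned int size; };
    struct esr_ctx { struct aarch64_ctx head; unsigned long long esr; };
    #define ESR_MAGIC 0x45535201

    static void handler(int sig, siginfo_t *info, void *ucp)
    {
        ucontext_t *uc = ucp;
        unsigned char *p = (unsigned char *)uc->uc_mcontext.__reserved;

        /* Walk the records until the zero-magic terminator. */
        while (((struct aarch64_ctx *)p)->magic) {
            struct aarch64_ctx *head = (struct aarch64_ctx *)p;

            if (head->magic == ESR_MAGIC) {
                /* printf is not async-signal-safe; fine for a demo */
                printf("fault ESR: 0x%llx\n", ((struct esr_ctx *)head)->esr);
                break;
            }
            p += head->size;
        }
        _exit(0);
    }

    int main(void)
    {
        struct sigaction sa = { 0 };

        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, NULL);

        *(volatile int *)8 = 0;     /* deliberate data abort */
        return 0;
    }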
+6 -1
arch/arm64/kernel/Makefile
··· 7 7 CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ 8 8 -I$(src)/../../../scripts/dtc/libfdt 9 9 10 + CFLAGS_REMOVE_ftrace.o = -pg 11 + CFLAGS_REMOVE_insn.o = -pg 12 + CFLAGS_REMOVE_return_address.o = -pg 13 + 10 14 # Object file lists. 11 15 arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ 12 16 entry-fpsimd.o process.o ptrace.o setup.o signal.o \ 13 17 sys.o stacktrace.o time.o traps.o io.o vdso.o \ 14 - hyp-stub.o psci.o cpu_ops.o insn.o 18 + hyp-stub.o psci.o cpu_ops.o insn.o return_address.o 15 19 16 20 arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ 17 21 sys_compat.o 22 + arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o 18 23 arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o 19 24 arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o 20 25 arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+9
arch/arm64/kernel/arm64ksyms.c
··· 44 44 /* string / mem functions */ 45 45 EXPORT_SYMBOL(strchr); 46 46 EXPORT_SYMBOL(strrchr); 47 + EXPORT_SYMBOL(strcmp); 48 + EXPORT_SYMBOL(strncmp); 49 + EXPORT_SYMBOL(strlen); 50 + EXPORT_SYMBOL(strnlen); 47 51 EXPORT_SYMBOL(memset); 48 52 EXPORT_SYMBOL(memcpy); 49 53 EXPORT_SYMBOL(memmove); 50 54 EXPORT_SYMBOL(memchr); 55 + EXPORT_SYMBOL(memcmp); 51 56 52 57 /* atomic bitops */ 53 58 EXPORT_SYMBOL(set_bit); ··· 61 56 EXPORT_SYMBOL(test_and_clear_bit); 62 57 EXPORT_SYMBOL(change_bit); 63 58 EXPORT_SYMBOL(test_and_change_bit); 59 + 60 + #ifdef CONFIG_FUNCTION_TRACER 61 + EXPORT_SYMBOL(_mcount); 62 + #endif
+24
arch/arm64/kernel/entry-fpsimd.S
··· 41 41 fpsimd_restore x0, 8 42 42 ret 43 43 ENDPROC(fpsimd_load_state) 44 + 45 + #ifdef CONFIG_KERNEL_MODE_NEON 46 + 47 + /* 48 + * Save the bottom n FP registers. 49 + * 50 + * x0 - pointer to struct fpsimd_partial_state 51 + */ 52 + ENTRY(fpsimd_save_partial_state) 53 + fpsimd_save_partial x0, 1, 8, 9 54 + ret 55 + ENDPROC(fpsimd_save_partial_state) 56 + 57 + /* 58 + * Load the bottom n FP registers. 59 + * 60 + * x0 - pointer to struct fpsimd_partial_state 61 + */ 62 + ENTRY(fpsimd_load_partial_state) 63 + fpsimd_restore_partial x0, 8, 9 64 + ret 65 + ENDPROC(fpsimd_load_partial_state) 66 + 67 + #endif
+218
arch/arm64/kernel/entry-ftrace.S
··· 1 + /* 2 + * arch/arm64/kernel/entry-ftrace.S 3 + * 4 + * Copyright (C) 2013 Linaro Limited 5 + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + 12 + #include <linux/linkage.h> 13 + #include <asm/ftrace.h> 14 + #include <asm/insn.h> 15 + 16 + /* 17 + * Gcc with -pg will put the following code in the beginning of each function: 18 + * mov x0, x30 19 + * bl _mcount 20 + * [function's body ...] 21 + * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic 22 + * ftrace is enabled. 23 + * 24 + * Please note that x0 as an argument will not be used here because we can 25 + * get lr(x30) of instrumented function at any time by winding up call stack 26 + * as long as the kernel is compiled without -fomit-frame-pointer. 27 + * (or CONFIG_FRAME_POINTER, this is forced on arm64) 28 + * 29 + * stack layout after mcount_enter in _mcount(): 30 + * 31 + * current sp/fp => 0:+-----+ 32 + * in _mcount() | x29 | -> instrumented function's fp 33 + * +-----+ 34 + * | x30 | -> _mcount()'s lr (= instrumented function's pc) 35 + * old sp => +16:+-----+ 36 + * when instrumented | | 37 + * function calls | ... | 38 + * _mcount() | | 39 + * | | 40 + * instrumented => +xx:+-----+ 41 + * function's fp | x29 | -> parent's fp 42 + * +-----+ 43 + * | x30 | -> instrumented function's lr (= parent's pc) 44 + * +-----+ 45 + * | ... | 46 + */ 47 + 48 + .macro mcount_enter 49 + stp x29, x30, [sp, #-16]! 50 + mov x29, sp 51 + .endm 52 + 53 + .macro mcount_exit 54 + ldp x29, x30, [sp], #16 55 + ret 56 + .endm 57 + 58 + .macro mcount_adjust_addr rd, rn 59 + sub \rd, \rn, #AARCH64_INSN_SIZE 60 + .endm 61 + 62 + /* for instrumented function's parent */ 63 + .macro mcount_get_parent_fp reg 64 + ldr \reg, [x29] 65 + ldr \reg, [\reg] 66 + .endm 67 + 68 + /* for instrumented function */ 69 + .macro mcount_get_pc0 reg 70 + mcount_adjust_addr \reg, x30 71 + .endm 72 + 73 + .macro mcount_get_pc reg 74 + ldr \reg, [x29, #8] 75 + mcount_adjust_addr \reg, \reg 76 + .endm 77 + 78 + .macro mcount_get_lr reg 79 + ldr \reg, [x29] 80 + ldr \reg, [\reg, #8] 81 + mcount_adjust_addr \reg, \reg 82 + .endm 83 + 84 + .macro mcount_get_lr_addr reg 85 + ldr \reg, [x29] 86 + add \reg, \reg, #8 87 + .endm 88 + 89 + #ifndef CONFIG_DYNAMIC_FTRACE 90 + /* 91 + * void _mcount(unsigned long return_address) 92 + * @return_address: return address to instrumented function 93 + * 94 + * This function makes calls, if enabled, to: 95 + * - tracer function to probe instrumented function's entry, 96 + * - ftrace_graph_caller to set up an exit hook 97 + */ 98 + ENTRY(_mcount) 99 + #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 100 + ldr x0, =ftrace_trace_stop 101 + ldr x0, [x0] // if ftrace_trace_stop 102 + ret // return; 103 + #endif 104 + mcount_enter 105 + 106 + ldr x0, =ftrace_trace_function 107 + ldr x2, [x0] 108 + adr x0, ftrace_stub 109 + cmp x0, x2 // if (ftrace_trace_function 110 + b.eq skip_ftrace_call // != ftrace_stub) { 111 + 112 + mcount_get_pc x0 // function's pc 113 + mcount_get_lr x1 // function's lr (= parent's pc) 114 + blr x2 // (*ftrace_trace_function)(pc, lr); 115 + 116 + #ifndef CONFIG_FUNCTION_GRAPH_TRACER 117 + skip_ftrace_call: // return; 118 + mcount_exit // } 119 + #else 120 + mcount_exit // return; 121 + // } 122 + skip_ftrace_call: 123 + ldr x1, =ftrace_graph_return 124 + ldr x2, [x1] // if 
((ftrace_graph_return 125 + cmp x0, x2 // != ftrace_stub) 126 + b.ne ftrace_graph_caller 127 + 128 + ldr x1, =ftrace_graph_entry // || (ftrace_graph_entry 129 + ldr x2, [x1] // != ftrace_graph_entry_stub)) 130 + ldr x0, =ftrace_graph_entry_stub 131 + cmp x0, x2 132 + b.ne ftrace_graph_caller // ftrace_graph_caller(); 133 + 134 + mcount_exit 135 + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 136 + ENDPROC(_mcount) 137 + 138 + #else /* CONFIG_DYNAMIC_FTRACE */ 139 + /* 140 + * _mcount() is used to build the kernel with -pg option, but all the branch 141 + * instructions to _mcount() are replaced to NOP initially at kernel start up, 142 + * and later on, NOP to branch to ftrace_caller() when enabled or branch to 143 + * NOP when disabled per-function base. 144 + */ 145 + ENTRY(_mcount) 146 + ret 147 + ENDPROC(_mcount) 148 + 149 + /* 150 + * void ftrace_caller(unsigned long return_address) 151 + * @return_address: return address to instrumented function 152 + * 153 + * This function is a counterpart of _mcount() in 'static' ftrace, and 154 + * makes calls to: 155 + * - tracer function to probe instrumented function's entry, 156 + * - ftrace_graph_caller to set up an exit hook 157 + */ 158 + ENTRY(ftrace_caller) 159 + mcount_enter 160 + 161 + mcount_get_pc0 x0 // function's pc 162 + mcount_get_lr x1 // function's lr 163 + 164 + .global ftrace_call 165 + ftrace_call: // tracer(pc, lr); 166 + nop // This will be replaced with "bl xxx" 167 + // where xxx can be any kind of tracer. 168 + 169 + #ifdef CONFIG_FUNCTION_GRAPH_TRACER 170 + .global ftrace_graph_call 171 + ftrace_graph_call: // ftrace_graph_caller(); 172 + nop // If enabled, this will be replaced 173 + // "b ftrace_graph_caller" 174 + #endif 175 + 176 + mcount_exit 177 + ENDPROC(ftrace_caller) 178 + #endif /* CONFIG_DYNAMIC_FTRACE */ 179 + 180 + ENTRY(ftrace_stub) 181 + ret 182 + ENDPROC(ftrace_stub) 183 + 184 + #ifdef CONFIG_FUNCTION_GRAPH_TRACER 185 + /* 186 + * void ftrace_graph_caller(void) 187 + * 188 + * Called from _mcount() or ftrace_caller() when function_graph tracer is 189 + * selected. 190 + * This function w/ prepare_ftrace_return() fakes link register's value on 191 + * the call stack in order to intercept instrumented function's return path 192 + * and run return_to_handler() later on its exit. 193 + */ 194 + ENTRY(ftrace_graph_caller) 195 + mcount_get_lr_addr x0 // pointer to function's saved lr 196 + mcount_get_pc x1 // function's pc 197 + mcount_get_parent_fp x2 // parent's fp 198 + bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp) 199 + 200 + mcount_exit 201 + ENDPROC(ftrace_graph_caller) 202 + 203 + /* 204 + * void return_to_handler(void) 205 + * 206 + * Run ftrace_return_to_handler() before going back to parent. 207 + * @fp is checked against the value passed by ftrace_graph_caller() 208 + * only when CONFIG_FUNCTION_GRAPH_FP_TEST is enabled. 209 + */ 210 + ENTRY(return_to_handler) 211 + str x0, [sp, #-16]! 212 + mov x0, x29 // parent's fp 213 + bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); 214 + mov x30, x0 // restore the original return address 215 + ldr x0, [sp], #16 216 + ret 217 + END(return_to_handler) 218 + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+34 -56
arch/arm64/kernel/entry.S
··· 60 60 push x0, x1 61 61 .if \el == 0 62 62 mrs x21, sp_el0 63 + get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, 64 + ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug 65 + disable_step_tsk x19, x20 // exceptions when scheduling. 63 66 .else 64 67 add x21, sp, #S_FRAME_SIZE 65 68 .endif ··· 262 259 * Data abort handling 263 260 */ 264 261 mrs x0, far_el1 265 - enable_dbg_if_not_stepping x2 262 + enable_dbg 266 263 // re-enable interrupts if they were enabled in the aborted context 267 264 tbnz x23, #7, 1f // PSR_I_BIT 268 265 enable_irq ··· 278 275 * Stack or PC alignment exception handling 279 276 */ 280 277 mrs x0, far_el1 278 + enable_dbg 281 279 mov x1, x25 282 280 mov x2, sp 283 281 b do_sp_pc_abort ··· 286 282 /* 287 283 * Undefined instruction 288 284 */ 285 + enable_dbg 289 286 mov x0, sp 290 287 b do_undefinstr 291 288 el1_dbg: ··· 299 294 mrs x0, far_el1 300 295 mov x2, sp // struct pt_regs 301 296 bl do_debug_exception 302 - 297 + enable_dbg 303 298 kernel_exit 1 304 299 el1_inv: 305 300 // TODO: add support for undefined instructions in kernel mode 301 + enable_dbg 306 302 mov x0, sp 307 303 mov x1, #BAD_SYNC 308 304 mrs x2, esr_el1 ··· 313 307 .align 6 314 308 el1_irq: 315 309 kernel_entry 1 316 - enable_dbg_if_not_stepping x0 310 + enable_dbg 317 311 #ifdef CONFIG_TRACE_IRQFLAGS 318 312 bl trace_hardirqs_off 319 313 #endif ··· 338 332 #ifdef CONFIG_PREEMPT 339 333 el1_preempt: 340 334 mov x24, lr 341 - 1: enable_dbg 342 - bl preempt_schedule_irq // irq en/disable is done inside 335 + 1: bl preempt_schedule_irq // irq en/disable is done inside 343 336 ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS 344 337 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? 345 338 ret x24 ··· 354 349 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class 355 350 cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state 356 351 b.eq el0_svc 357 - adr lr, ret_from_exception 352 + adr lr, ret_to_user 358 353 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 359 354 b.eq el0_da 360 355 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 ··· 383 378 lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class 384 379 cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state 385 380 b.eq el0_svc_compat 386 - adr lr, ret_from_exception 381 + adr lr, ret_to_user 387 382 cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 388 383 b.eq el0_da 389 384 cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 ··· 428 423 */ 429 424 mrs x0, far_el1 430 425 bic x0, x0, #(0xff << 56) 431 - disable_step x1 432 - isb 433 - enable_dbg 434 426 // enable interrupts before calling the main handler 435 - enable_irq 427 + enable_dbg_and_irq 436 428 mov x1, x25 437 429 mov x2, sp 438 430 b do_mem_abort ··· 438 436 * Instruction abort handling 439 437 */ 440 438 mrs x0, far_el1 441 - disable_step x1 442 - isb 443 - enable_dbg 444 439 // enable interrupts before calling the main handler 445 - enable_irq 440 + enable_dbg_and_irq 446 441 orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts 447 442 mov x2, sp 448 443 b do_mem_abort ··· 447 448 /* 448 449 * Floating Point or Advanced SIMD access 449 450 */ 451 + enable_dbg 450 452 mov x0, x25 451 453 mov x1, sp 452 454 b do_fpsimd_acc ··· 455 455 /* 456 456 * Floating Point or Advanced SIMD exception 457 457 */ 458 + enable_dbg 458 459 mov x0, x25 459 460 mov x1, sp 460 461 b do_fpsimd_exc ··· 464 463 * Stack or PC alignment exception handling 465 464 */ 466 465 mrs x0, far_el1 467 - disable_step x1 468 - isb 469 - enable_dbg 470 466 // enable interrupts before 
calling the main handler 471 - enable_irq 467 + enable_dbg_and_irq 472 468 mov x1, x25 473 469 mov x2, sp 474 470 b do_sp_pc_abort ··· 473 475 /* 474 476 * Undefined instruction 475 477 */ 476 - mov x0, sp 477 478 // enable interrupts before calling the main handler 478 - enable_irq 479 + enable_dbg_and_irq 480 + mov x0, sp 479 481 b do_undefinstr 480 482 el0_dbg: 481 483 /* ··· 483 485 */ 484 486 tbnz x24, #0, el0_inv // EL0 only 485 487 mrs x0, far_el1 486 - disable_step x1 487 488 mov x1, x25 488 489 mov x2, sp 489 - b do_debug_exception 490 + bl do_debug_exception 491 + enable_dbg 492 + b ret_to_user 490 493 el0_inv: 494 + enable_dbg 491 495 mov x0, sp 492 496 mov x1, #BAD_SYNC 493 497 mrs x2, esr_el1 ··· 500 500 el0_irq: 501 501 kernel_entry 0 502 502 el0_irq_naked: 503 - disable_step x1 504 - isb 505 503 enable_dbg 506 504 #ifdef CONFIG_TRACE_IRQFLAGS 507 505 bl trace_hardirqs_off 508 506 #endif 509 507 510 508 irq_handler 511 - get_thread_info tsk 512 509 513 510 #ifdef CONFIG_TRACE_IRQFLAGS 514 511 bl trace_hardirqs_on 515 512 #endif 516 513 b ret_to_user 517 514 ENDPROC(el0_irq) 518 - 519 - /* 520 - * This is the return code to user mode for abort handlers 521 - */ 522 - ret_from_exception: 523 - get_thread_info tsk 524 - b ret_to_user 525 - ENDPROC(ret_from_exception) 526 515 527 516 /* 528 517 * Register switch for AArch64. The callee-saved registers need to be saved ··· 552 563 ldr x1, [tsk, #TI_FLAGS] 553 564 and x2, x1, #_TIF_WORK_MASK 554 565 cbnz x2, fast_work_pending 555 - tbz x1, #TIF_SINGLESTEP, fast_exit 556 - disable_dbg 557 - enable_step x2 558 - fast_exit: 566 + enable_step_tsk x1, x2 559 567 kernel_exit 0, ret = 1 560 568 561 569 /* ··· 562 576 str x0, [sp, #S_X0] // returned x0 563 577 work_pending: 564 578 tbnz x1, #TIF_NEED_RESCHED, work_resched 565 - /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ 579 + /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ 566 580 ldr x2, [sp, #S_PSTATE] 567 581 mov x0, sp // 'regs' 568 582 tst x2, #PSR_MODE_MASK // user mode regs? ··· 571 585 bl do_notify_resume 572 586 b ret_to_user 573 587 work_resched: 574 - enable_dbg 575 588 bl schedule 576 589 577 590 /* ··· 581 596 ldr x1, [tsk, #TI_FLAGS] 582 597 and x2, x1, #_TIF_WORK_MASK 583 598 cbnz x2, work_pending 584 - tbz x1, #TIF_SINGLESTEP, no_work_pending 585 - disable_dbg 586 - enable_step x2 599 + enable_step_tsk x1, x2 587 600 no_work_pending: 588 601 kernel_exit 0, ret = 0 589 602 ENDPROC(ret_to_user) ··· 608 625 mov sc_nr, #__NR_syscalls 609 626 el0_svc_naked: // compat entry point 610 627 stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number 611 - disable_step x16 612 - isb 613 - enable_dbg 614 - enable_irq 628 + enable_dbg_and_irq 615 629 616 - get_thread_info tsk 617 - ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing 618 - tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? 630 + ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks 631 + tst x16, #_TIF_SYSCALL_WORK 632 + b.ne __sys_trace 619 633 adr lr, ret_fast_syscall // return address 620 634 cmp scno, sc_nr // check upper syscall limit 621 635 b.hs ni_sys ··· 628 648 * switches, and waiting for our parent to respond. 
629 649 */ 630 650 __sys_trace: 631 - mov x1, sp 632 - mov w0, #0 // trace entry 633 - bl syscall_trace 651 + mov x0, sp 652 + bl syscall_trace_enter 634 653 adr lr, __sys_trace_return // return address 635 654 uxtw scno, w0 // syscall number (possibly new) 636 655 mov x1, sp // pointer to regs ··· 644 665 645 666 __sys_trace_return: 646 667 str x0, [sp] // save returned x0 647 - mov x1, sp 648 - mov w0, #1 // trace exit 649 - bl syscall_trace 668 + mov x0, sp 669 + bl syscall_trace_exit 650 670 b ret_to_user 651 671 652 672 /*
+167 -19
arch/arm64/kernel/fpsimd.c
··· 35 35 #define FPEXC_IDF (1 << 7) 36 36 37 37 /* 38 + * In order to reduce the number of times the FPSIMD state is needlessly saved 39 + * and restored, we need to keep track of two things: 40 + * (a) for each task, we need to remember which CPU was the last one to have 41 + * the task's FPSIMD state loaded into its FPSIMD registers; 42 + * (b) for each CPU, we need to remember which task's userland FPSIMD state has 43 + * been loaded into its FPSIMD registers most recently, or whether it has 44 + * been used to perform kernel mode NEON in the meantime. 45 + * 46 + * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to 47 + * the id of the current CPU everytime the state is loaded onto a CPU. For (b), 48 + * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the 49 + * address of the userland FPSIMD state of the task that was loaded onto the CPU 50 + * the most recently, or NULL if kernel mode NEON has been performed after that. 51 + * 52 + * With this in place, we no longer have to restore the next FPSIMD state right 53 + * when switching between tasks. Instead, we can defer this check to userland 54 + * resume, at which time we verify whether the CPU's fpsimd_last_state and the 55 + * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we 56 + * can omit the FPSIMD restore. 57 + * 58 + * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to 59 + * indicate whether or not the userland FPSIMD state of the current task is 60 + * present in the registers. The flag is set unless the FPSIMD registers of this 61 + * CPU currently contain the most recent userland FPSIMD state of the current 62 + * task. 63 + * 64 + * For a certain task, the sequence may look something like this: 65 + * - the task gets scheduled in; if both the task's fpsimd_state.cpu field 66 + * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu 67 + * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is 68 + * cleared, otherwise it is set; 69 + * 70 + * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's 71 + * userland FPSIMD state is copied from memory to the registers, the task's 72 + * fpsimd_state.cpu field is set to the id of the current CPU, the current 73 + * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the 74 + * TIF_FOREIGN_FPSTATE flag is cleared; 75 + * 76 + * - the task executes an ordinary syscall; upon return to userland, the 77 + * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is 78 + * restored; 79 + * 80 + * - the task executes a syscall which executes some NEON instructions; this is 81 + * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD 82 + * register contents to memory, clears the fpsimd_last_state per-cpu variable 83 + * and sets the TIF_FOREIGN_FPSTATE flag; 84 + * 85 + * - the task gets preempted after kernel_neon_end() is called; as we have not 86 + * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so 87 + * whatever is in the FPSIMD registers is not saved to memory, but discarded. 88 + */ 89 + static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); 90 + 91 + /* 38 92 * Trapped FP/ASIMD access. 
39 93 */ 40 94 void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) ··· 126 72 127 73 void fpsimd_thread_switch(struct task_struct *next) 128 74 { 129 - /* check if not kernel threads */ 130 - if (current->mm) 75 + /* 76 + * Save the current FPSIMD state to memory, but only if whatever is in 77 + * the registers is in fact the most recent userland FPSIMD state of 78 + * 'current'. 79 + */ 80 + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) 131 81 fpsimd_save_state(&current->thread.fpsimd_state); 132 - if (next->mm) 133 - fpsimd_load_state(&next->thread.fpsimd_state); 82 + 83 + if (next->mm) { 84 + /* 85 + * If we are switching to a task whose most recent userland 86 + * FPSIMD state is already in the registers of *this* cpu, 87 + * we can skip loading the state from memory. Otherwise, set 88 + * the TIF_FOREIGN_FPSTATE flag so the state will be loaded 89 + * upon the next return to userland. 90 + */ 91 + struct fpsimd_state *st = &next->thread.fpsimd_state; 92 + 93 + if (__this_cpu_read(fpsimd_last_state) == st 94 + && st->cpu == smp_processor_id()) 95 + clear_ti_thread_flag(task_thread_info(next), 96 + TIF_FOREIGN_FPSTATE); 97 + else 98 + set_ti_thread_flag(task_thread_info(next), 99 + TIF_FOREIGN_FPSTATE); 100 + } 134 101 } 135 102 136 103 void fpsimd_flush_thread(void) 137 104 { 138 - preempt_disable(); 139 105 memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); 140 - fpsimd_load_state(&current->thread.fpsimd_state); 106 + set_thread_flag(TIF_FOREIGN_FPSTATE); 107 + } 108 + 109 + /* 110 + * Save the userland FPSIMD state of 'current' to memory, but only if the state 111 + * currently held in the registers does in fact belong to 'current' 112 + */ 113 + void fpsimd_preserve_current_state(void) 114 + { 115 + preempt_disable(); 116 + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) 117 + fpsimd_save_state(&current->thread.fpsimd_state); 141 118 preempt_enable(); 119 + } 120 + 121 + /* 122 + * Load the userland FPSIMD state of 'current' from memory, but only if the 123 + * FPSIMD state already held in the registers is /not/ the most recent FPSIMD 124 + * state of 'current' 125 + */ 126 + void fpsimd_restore_current_state(void) 127 + { 128 + preempt_disable(); 129 + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { 130 + struct fpsimd_state *st = &current->thread.fpsimd_state; 131 + 132 + fpsimd_load_state(st); 133 + this_cpu_write(fpsimd_last_state, st); 134 + st->cpu = smp_processor_id(); 135 + } 136 + preempt_enable(); 137 + } 138 + 139 + /* 140 + * Load an updated userland FPSIMD state for 'current' from memory and set the 141 + * flag that indicates that the FPSIMD register contents are the most recent 142 + * FPSIMD state of 'current' 143 + */ 144 + void fpsimd_update_current_state(struct fpsimd_state *state) 145 + { 146 + preempt_disable(); 147 + fpsimd_load_state(state); 148 + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { 149 + struct fpsimd_state *st = &current->thread.fpsimd_state; 150 + 151 + this_cpu_write(fpsimd_last_state, st); 152 + st->cpu = smp_processor_id(); 153 + } 154 + preempt_enable(); 155 + } 156 + 157 + /* 158 + * Invalidate live CPU copies of task t's FPSIMD state 159 + */ 160 + void fpsimd_flush_task_state(struct task_struct *t) 161 + { 162 + t->thread.fpsimd_state.cpu = NR_CPUS; 142 163 } 143 164 144 165 #ifdef CONFIG_KERNEL_MODE_NEON 145 166 167 + static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); 168 + static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); 169 + 146 170 /* 147 
171 * Kernel-side NEON support functions 148 172 */ 149 - void kernel_neon_begin(void) 173 + void kernel_neon_begin_partial(u32 num_regs) 150 174 { 151 - /* Avoid using the NEON in interrupt context */ 152 - BUG_ON(in_interrupt()); 153 - preempt_disable(); 175 + if (in_interrupt()) { 176 + struct fpsimd_partial_state *s = this_cpu_ptr( 177 + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); 154 178 155 - if (current->mm) 156 - fpsimd_save_state(&current->thread.fpsimd_state); 179 + BUG_ON(num_regs > 32); 180 + fpsimd_save_partial_state(s, roundup(num_regs, 2)); 181 + } else { 182 + /* 183 + * Save the userland FPSIMD state if we have one and if we 184 + * haven't done so already. Clear fpsimd_last_state to indicate 185 + * that there is no longer userland FPSIMD state in the 186 + * registers. 187 + */ 188 + preempt_disable(); 189 + if (current->mm && 190 + !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) 191 + fpsimd_save_state(&current->thread.fpsimd_state); 192 + this_cpu_write(fpsimd_last_state, NULL); 193 + } 157 194 } 158 - EXPORT_SYMBOL(kernel_neon_begin); 195 + EXPORT_SYMBOL(kernel_neon_begin_partial); 159 196 160 197 void kernel_neon_end(void) 161 198 { 162 - if (current->mm) 163 - fpsimd_load_state(&current->thread.fpsimd_state); 164 - 165 - preempt_enable(); 199 + if (in_interrupt()) { 200 + struct fpsimd_partial_state *s = this_cpu_ptr( 201 + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); 202 + fpsimd_load_partial_state(s); 203 + } else { 204 + preempt_enable(); 205 + } 166 206 } 167 207 EXPORT_SYMBOL(kernel_neon_end); 168 208 ··· 268 120 { 269 121 switch (cmd) { 270 122 case CPU_PM_ENTER: 271 - if (current->mm) 123 + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) 272 124 fpsimd_save_state(&current->thread.fpsimd_state); 273 125 break; 274 126 case CPU_PM_EXIT: 275 127 if (current->mm) 276 - fpsimd_load_state(&current->thread.fpsimd_state); 128 + set_thread_flag(TIF_FOREIGN_FPSTATE); 277 129 break; 278 130 case CPU_PM_ENTER_FAILED: 279 131 default:
+176
arch/arm64/kernel/ftrace.c
··· 1 + /* 2 + * arch/arm64/kernel/ftrace.c 3 + * 4 + * Copyright (C) 2013 Linaro Limited 5 + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + 12 + #include <linux/ftrace.h> 13 + #include <linux/swab.h> 14 + #include <linux/uaccess.h> 15 + 16 + #include <asm/cacheflush.h> 17 + #include <asm/ftrace.h> 18 + #include <asm/insn.h> 19 + 20 + #ifdef CONFIG_DYNAMIC_FTRACE 21 + /* 22 + * Replace a single instruction, which may be a branch or NOP. 23 + * If @validate == true, a replaced instruction is checked against 'old'. 24 + */ 25 + static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, 26 + bool validate) 27 + { 28 + u32 replaced; 29 + 30 + /* 31 + * Note: 32 + * Due to modules and __init, code can disappear and change, 33 + * we need to protect against faulting as well as code changing. 34 + * We do this by aarch64_insn_*() which use the probe_kernel_*(). 35 + * 36 + * No lock is held here because all the modifications are run 37 + * through stop_machine(). 38 + */ 39 + if (validate) { 40 + if (aarch64_insn_read((void *)pc, &replaced)) 41 + return -EFAULT; 42 + 43 + if (replaced != old) 44 + return -EINVAL; 45 + } 46 + if (aarch64_insn_patch_text_nosync((void *)pc, new)) 47 + return -EPERM; 48 + 49 + return 0; 50 + } 51 + 52 + /* 53 + * Replace tracer function in ftrace_caller() 54 + */ 55 + int ftrace_update_ftrace_func(ftrace_func_t func) 56 + { 57 + unsigned long pc; 58 + u32 new; 59 + 60 + pc = (unsigned long)&ftrace_call; 61 + new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true); 62 + 63 + return ftrace_modify_code(pc, 0, new, false); 64 + } 65 + 66 + /* 67 + * Turn on the call to ftrace_caller() in instrumented function 68 + */ 69 + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 70 + { 71 + unsigned long pc = rec->ip; 72 + u32 old, new; 73 + 74 + old = aarch64_insn_gen_nop(); 75 + new = aarch64_insn_gen_branch_imm(pc, addr, true); 76 + 77 + return ftrace_modify_code(pc, old, new, true); 78 + } 79 + 80 + /* 81 + * Turn off the call to ftrace_caller() in instrumented function 82 + */ 83 + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, 84 + unsigned long addr) 85 + { 86 + unsigned long pc = rec->ip; 87 + u32 old, new; 88 + 89 + old = aarch64_insn_gen_branch_imm(pc, addr, true); 90 + new = aarch64_insn_gen_nop(); 91 + 92 + return ftrace_modify_code(pc, old, new, true); 93 + } 94 + 95 + int __init ftrace_dyn_arch_init(void) 96 + { 97 + return 0; 98 + } 99 + #endif /* CONFIG_DYNAMIC_FTRACE */ 100 + 101 + #ifdef CONFIG_FUNCTION_GRAPH_TRACER 102 + /* 103 + * function_graph tracer expects ftrace_return_to_handler() to be called 104 + * on the way back to parent. For this purpose, this function is called 105 + * in _mcount() or ftrace_caller() to replace return address (*parent) on 106 + * the call stack to return_to_handler. 107 + * 108 + * Note that @frame_pointer is used only for sanity check later. 
109 + */ 110 + void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, 111 + unsigned long frame_pointer) 112 + { 113 + unsigned long return_hooker = (unsigned long)&return_to_handler; 114 + unsigned long old; 115 + struct ftrace_graph_ent trace; 116 + int err; 117 + 118 + if (unlikely(atomic_read(&current->tracing_graph_pause))) 119 + return; 120 + 121 + /* 122 + * Note: 123 + * No protection against faulting at *parent, which may be seen 124 + * on other archs. It's unlikely on AArch64. 125 + */ 126 + old = *parent; 127 + *parent = return_hooker; 128 + 129 + trace.func = self_addr; 130 + trace.depth = current->curr_ret_stack + 1; 131 + 132 + /* Only trace if the calling function expects to */ 133 + if (!ftrace_graph_entry(&trace)) { 134 + *parent = old; 135 + return; 136 + } 137 + 138 + err = ftrace_push_return_trace(old, self_addr, &trace.depth, 139 + frame_pointer); 140 + if (err == -EBUSY) { 141 + *parent = old; 142 + return; 143 + } 144 + } 145 + 146 + #ifdef CONFIG_DYNAMIC_FTRACE 147 + /* 148 + * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() 149 + * depending on @enable. 150 + */ 151 + static int ftrace_modify_graph_caller(bool enable) 152 + { 153 + unsigned long pc = (unsigned long)&ftrace_graph_call; 154 + u32 branch, nop; 155 + 156 + branch = aarch64_insn_gen_branch_imm(pc, 157 + (unsigned long)ftrace_graph_caller, false); 158 + nop = aarch64_insn_gen_nop(); 159 + 160 + if (enable) 161 + return ftrace_modify_code(pc, nop, branch, true); 162 + else 163 + return ftrace_modify_code(pc, branch, nop, true); 164 + } 165 + 166 + int ftrace_enable_ftrace_graph_caller(void) 167 + { 168 + return ftrace_modify_graph_caller(true); 169 + } 170 + 171 + int ftrace_disable_ftrace_graph_caller(void) 172 + { 173 + return ftrace_modify_graph_caller(false); 174 + } 175 + #endif /* CONFIG_DYNAMIC_FTRACE */ 176 + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+3 -5
arch/arm64/kernel/head.S
··· 342 342 cmp w20, #BOOT_CPU_MODE_EL2 343 343 b.ne 1f 344 344 add x1, x1, #4 345 - 1: dc cvac, x1 // Clean potentially dirty cache line 346 - dsb sy 347 - str w20, [x1] // This CPU has booted in EL1 348 - dc civac, x1 // Clean&invalidate potentially stale cache line 349 - dsb sy 345 + 1: str w20, [x1] // This CPU has booted in EL1 346 + dmb sy 347 + dc ivac, x1 // Invalidate potentially stale cache line 350 348 ret 351 349 ENDPROC(set_cpu_boot_mode_flag) 352 350
+1 -1
arch/arm64/kernel/hw_breakpoint.c
··· 20 20 21 21 #define pr_fmt(fmt) "hw-breakpoint: " fmt 22 22 23 + #include <linux/compat.h> 23 24 #include <linux/cpu_pm.h> 24 25 #include <linux/errno.h> 25 26 #include <linux/hw_breakpoint.h> ··· 28 27 #include <linux/ptrace.h> 29 28 #include <linux/smp.h> 30 29 31 - #include <asm/compat.h> 32 30 #include <asm/current.h> 33 31 #include <asm/debug-monitors.h> 34 32 #include <asm/hw_breakpoint.h>
+40 -9
arch/arm64/kernel/process.c
··· 20 20 21 21 #include <stdarg.h> 22 22 23 + #include <linux/compat.h> 23 24 #include <linux/export.h> 24 25 #include <linux/sched.h> 25 26 #include <linux/kernel.h> ··· 114 113 } 115 114 #endif 116 115 116 + /* 117 + * Called by kexec, immediately prior to machine_kexec(). 118 + * 119 + * This must completely disable all secondary CPUs; simply causing those CPUs 120 + * to execute e.g. a RAM-based pin loop is not sufficient. This allows the 121 + * kexec'd kernel to use any and all RAM as it sees fit, without having to 122 + * avoid any code or data used by any SW CPU pin loop. The CPU hotplug 123 + * functionality embodied in disable_nonboot_cpus() to achieve this. 124 + */ 117 125 void machine_shutdown(void) 118 126 { 119 - #ifdef CONFIG_SMP 120 - smp_send_stop(); 121 - #endif 127 + disable_nonboot_cpus(); 122 128 } 123 129 130 + /* 131 + * Halting simply requires that the secondary CPUs stop performing any 132 + * activity (executing tasks, handling interrupts). smp_send_stop() 133 + * achieves this. 134 + */ 124 135 void machine_halt(void) 125 136 { 126 - machine_shutdown(); 137 + local_irq_disable(); 138 + smp_send_stop(); 127 139 while (1); 128 140 } 129 141 142 + /* 143 + * Power-off simply requires that the secondary CPUs stop performing any 144 + * activity (executing tasks, handling interrupts). smp_send_stop() 145 + * achieves this. When the system power is turned off, it will take all CPUs 146 + * with it. 147 + */ 130 148 void machine_power_off(void) 131 149 { 132 - machine_shutdown(); 150 + local_irq_disable(); 151 + smp_send_stop(); 133 152 if (pm_power_off) 134 153 pm_power_off(); 135 154 } 136 155 156 + /* 157 + * Restart requires that the secondary CPUs stop performing any activity 158 + * while the primary CPU resets the system. Systems with a single CPU can 159 + * use soft_restart() as their machine descriptor's .restart hook, since that 160 + * will cause the only available CPU to reset. Systems with multiple CPUs must 161 + * provide a HW restart implementation, to ensure that all CPUs reset at once. 162 + * This is required so that any code running after reset on the primary CPU 163 + * doesn't have to co-ordinate with other CPUs to ensure they aren't still 164 + * executing pre-reset code, and using RAM that the primary CPU's code wishes 165 + * to use. Implementing such co-ordination would be essentially impossible. 166 + */ 137 167 void machine_restart(char *cmd) 138 168 { 139 - machine_shutdown(); 140 - 141 169 /* Disable interrupts first */ 142 170 local_irq_disable(); 171 + smp_send_stop(); 143 172 144 173 /* Now call the architecture specific reboot code. */ 145 174 if (arm_pm_restart) ··· 236 205 237 206 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 238 207 { 239 - fpsimd_save_state(&current->thread.fpsimd_state); 208 + fpsimd_preserve_current_state(); 240 209 *dst = *src; 241 210 return 0; 242 211 } ··· 331 300 * Complete any pending TLB or cache maintenance on this CPU in case 332 301 * the thread migrates to a different CPU. 333 302 */ 334 - dsb(); 303 + dsb(ish); 335 304 336 305 /* the actual thread switch */ 337 306 last = cpu_switch_to(prev, next);
+41 -21
arch/arm64/kernel/ptrace.c
··· 19 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 20 */ 21 21 22 + #include <linux/compat.h> 22 23 #include <linux/kernel.h> 23 24 #include <linux/sched.h> 24 25 #include <linux/mm.h> ··· 41 40 #include <asm/pgtable.h> 42 41 #include <asm/traps.h> 43 42 #include <asm/system_misc.h> 43 + 44 + #define CREATE_TRACE_POINTS 45 + #include <trace/events/syscalls.h> 44 46 45 47 /* 46 48 * TODO: does not yet catch signals sent when the child dies. ··· 521 517 return ret; 522 518 523 519 target->thread.fpsimd_state.user_fpsimd = newstate; 520 + fpsimd_flush_task_state(target); 524 521 return ret; 525 522 } 526 523 ··· 769 764 uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; 770 765 } 771 766 767 + fpsimd_flush_task_state(target); 772 768 return ret; 773 769 } 774 770 ··· 1064 1058 return ptrace_request(child, request, addr, data); 1065 1059 } 1066 1060 1067 - asmlinkage int syscall_trace(int dir, struct pt_regs *regs) 1061 + enum ptrace_syscall_dir { 1062 + PTRACE_SYSCALL_ENTER = 0, 1063 + PTRACE_SYSCALL_EXIT, 1064 + }; 1065 + 1066 + static void tracehook_report_syscall(struct pt_regs *regs, 1067 + enum ptrace_syscall_dir dir) 1068 1068 { 1069 + int regno; 1069 1070 unsigned long saved_reg; 1070 1071 1071 - if (!test_thread_flag(TIF_SYSCALL_TRACE)) 1072 - return regs->syscallno; 1072 + /* 1073 + * A scratch register (ip(r12) on AArch32, x7 on AArch64) is 1074 + * used to denote syscall entry/exit: 1075 + */ 1076 + regno = (is_compat_task() ? 12 : 7); 1077 + saved_reg = regs->regs[regno]; 1078 + regs->regs[regno] = dir; 1073 1079 1074 - if (is_compat_task()) { 1075 - /* AArch32 uses ip (r12) for scratch */ 1076 - saved_reg = regs->regs[12]; 1077 - regs->regs[12] = dir; 1078 - } else { 1079 - /* 1080 - * Save X7. X7 is used to denote syscall entry/exit: 1081 - * X7 = 0 -> entry, = 1 -> exit 1082 - */ 1083 - saved_reg = regs->regs[7]; 1084 - regs->regs[7] = dir; 1085 - } 1086 - 1087 - if (dir) 1080 + if (dir == PTRACE_SYSCALL_EXIT) 1088 1081 tracehook_report_syscall_exit(regs, 0); 1089 1082 else if (tracehook_report_syscall_entry(regs)) 1090 1083 regs->syscallno = ~0UL; 1091 1084 1092 - if (is_compat_task()) 1093 - regs->regs[12] = saved_reg; 1094 - else 1095 - regs->regs[7] = saved_reg; 1085 + regs->regs[regno] = saved_reg; 1086 + } 1087 + 1088 + asmlinkage int syscall_trace_enter(struct pt_regs *regs) 1089 + { 1090 + if (test_thread_flag(TIF_SYSCALL_TRACE)) 1091 + tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); 1092 + 1093 + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) 1094 + trace_sys_enter(regs, regs->syscallno); 1096 1095 1097 1096 return regs->syscallno; 1097 + } 1098 + 1099 + asmlinkage void syscall_trace_exit(struct pt_regs *regs) 1100 + { 1101 + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) 1102 + trace_sys_exit(regs, regs_return_value(regs)); 1103 + 1104 + if (test_thread_flag(TIF_SYSCALL_TRACE)) 1105 + tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); 1098 1106 }
+55
arch/arm64/kernel/return_address.c
··· 1 + /* 2 + * arch/arm64/kernel/return_address.c 3 + * 4 + * Copyright (C) 2013 Linaro Limited 5 + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + */ 11 + 12 + #include <linux/export.h> 13 + #include <linux/ftrace.h> 14 + 15 + #include <asm/stacktrace.h> 16 + 17 + struct return_address_data { 18 + unsigned int level; 19 + void *addr; 20 + }; 21 + 22 + static int save_return_addr(struct stackframe *frame, void *d) 23 + { 24 + struct return_address_data *data = d; 25 + 26 + if (!data->level) { 27 + data->addr = (void *)frame->pc; 28 + return 1; 29 + } else { 30 + --data->level; 31 + return 0; 32 + } 33 + } 34 + 35 + void *return_address(unsigned int level) 36 + { 37 + struct return_address_data data; 38 + struct stackframe frame; 39 + register unsigned long current_sp asm ("sp"); 40 + 41 + data.level = level + 2; 42 + data.addr = NULL; 43 + 44 + frame.fp = (unsigned long)__builtin_frame_address(0); 45 + frame.sp = current_sp; 46 + frame.pc = (unsigned long)return_address; /* dummy */ 47 + 48 + walk_stackframe(&frame, save_return_addr, &data); 49 + 50 + if (!data.level) 51 + return data.addr; 52 + else 53 + return NULL; 54 + } 55 + EXPORT_SYMBOL_GPL(return_address);
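
return_address() above is a small walk_stackframe() client: it seeds a struct stackframe with its own fp/sp/pc and lets a callback count levels down until the wanted frame is reached. Another hypothetical client in the same pattern, sketched here only to show the callback convention (the function names are invented; returning non-zero from the callback stops the walk):

#include <asm/stacktrace.h>

static int count_frame(struct stackframe *frame, void *d)
{
        unsigned int *depth = d;

        (*depth)++;
        return 0;               /* keep walking until unwind_frame() fails */
}

static unsigned int stack_depth_sketch(void)
{
        struct stackframe frame;
        unsigned int depth = 0;
        register unsigned long current_sp asm ("sp");

        frame.fp = (unsigned long)__builtin_frame_address(0);
        frame.sp = current_sp;
        frame.pc = (unsigned long)stack_depth_sketch;

        walk_stackframe(&frame, count_frame, &depth);
        return depth;
}
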
+15 -1
arch/arm64/kernel/setup.c
··· 25 25 #include <linux/utsname.h> 26 26 #include <linux/initrd.h> 27 27 #include <linux/console.h> 28 + #include <linux/cache.h> 28 29 #include <linux/bootmem.h> 29 30 #include <linux/seq_file.h> 30 31 #include <linux/screen_info.h> ··· 201 200 { 202 201 struct cpu_info *cpu_info; 203 202 u64 features, block; 203 + u32 cwg; 204 + int cls; 204 205 205 206 cpu_info = lookup_processor_type(read_cpuid_id()); 206 207 if (!cpu_info) { ··· 218 215 219 216 sprintf(init_utsname()->machine, ELF_PLATFORM); 220 217 elf_hwcap = 0; 218 + 219 + /* 220 + * Check for sane CTR_EL0.CWG value. 221 + */ 222 + cwg = cache_type_cwg(); 223 + cls = cache_line_size(); 224 + if (!cwg) 225 + pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", 226 + cls); 227 + if (L1_CACHE_BYTES < cls) 228 + pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", 229 + L1_CACHE_BYTES, cls); 221 230 222 231 /* 223 232 * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. ··· 378 363 379 364 *cmdline_p = boot_command_line; 380 365 381 - init_mem_pgprot(); 382 366 early_ioremap_init(); 383 367 384 368 parse_early_param();
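
The CTR_EL0.CWG sanity check above relies on cache_type_cwg() and cache_line_size(), whose definitions are not part of this hunk. A rough reconstruction of what they compute, under the assumption that CWG occupies CTR_EL0[27:24] and encodes log2 of the writeback granule in 4-byte words; the sketch_ prefixed helpers are placeholders, not the real API:

#include <linux/cache.h>

/* Sketch only; the real helpers live under arch/arm64/include/asm/. */
static inline unsigned int sketch_cache_type_cwg(void)
{
        unsigned long ctr;

        asm volatile("mrs %0, ctr_el0" : "=r" (ctr));
        return (ctr >> 24) & 0xf;               /* CWG field (assumed layout) */
}

static inline int sketch_cache_line_size(void)
{
        unsigned int cwg = sketch_cache_type_cwg();

        /* 4 << CWG bytes; fall back to the build-time constant when the CPU
         * reports no Cache Writeback Granule (the first pr_warn above). */
        return cwg ? 4 << cwg : L1_CACHE_BYTES;
}
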
+35 -17
arch/arm64/kernel/signal.c
··· 17 17 * along with this program. If not, see <http://www.gnu.org/licenses/>. 18 18 */ 19 19 20 + #include <linux/compat.h> 20 21 #include <linux/errno.h> 21 22 #include <linux/signal.h> 22 23 #include <linux/personality.h> ··· 26 25 #include <linux/tracehook.h> 27 26 #include <linux/ratelimit.h> 28 27 29 - #include <asm/compat.h> 30 28 #include <asm/debug-monitors.h> 31 29 #include <asm/elf.h> 32 30 #include <asm/cacheflush.h> ··· 51 51 int err; 52 52 53 53 /* dump the hardware registers to the fpsimd_state structure */ 54 - fpsimd_save_state(fpsimd); 54 + fpsimd_preserve_current_state(); 55 55 56 56 /* copy the FP and status/control registers */ 57 57 err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); ··· 86 86 __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); 87 87 88 88 /* load the hardware registers from the fpsimd_state structure */ 89 - if (!err) { 90 - preempt_disable(); 91 - fpsimd_load_state(&fpsimd); 92 - preempt_enable(); 93 - } 89 + if (!err) 90 + fpsimd_update_current_state(&fpsimd); 94 91 95 92 return err ? -EFAULT : 0; 96 93 } ··· 97 100 { 98 101 sigset_t set; 99 102 int i, err; 100 - struct aux_context __user *aux = 101 - (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; 103 + void *aux = sf->uc.uc_mcontext.__reserved; 102 104 103 105 err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); 104 106 if (err == 0) ··· 117 121 118 122 err |= !valid_user_regs(&regs->user_regs); 119 123 120 - if (err == 0) 121 - err |= restore_fpsimd_context(&aux->fpsimd); 124 + if (err == 0) { 125 + struct fpsimd_context *fpsimd_ctx = 126 + container_of(aux, struct fpsimd_context, head); 127 + err |= restore_fpsimd_context(fpsimd_ctx); 128 + } 122 129 123 130 return err; 124 131 } ··· 166 167 struct pt_regs *regs, sigset_t *set) 167 168 { 168 169 int i, err = 0; 169 - struct aux_context __user *aux = 170 - (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; 170 + void *aux = sf->uc.uc_mcontext.__reserved; 171 + struct _aarch64_ctx *end; 171 172 172 173 /* set up the stack frame for unwinding */ 173 174 __put_user_error(regs->regs[29], &sf->fp, err); ··· 184 185 185 186 err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); 186 187 187 - if (err == 0) 188 - err |= preserve_fpsimd_context(&aux->fpsimd); 188 + if (err == 0) { 189 + struct fpsimd_context *fpsimd_ctx = 190 + container_of(aux, struct fpsimd_context, head); 191 + err |= preserve_fpsimd_context(fpsimd_ctx); 192 + aux += sizeof(*fpsimd_ctx); 193 + } 194 + 195 + /* fault information, if valid */ 196 + if (current->thread.fault_code) { 197 + struct esr_context *esr_ctx = 198 + container_of(aux, struct esr_context, head); 199 + __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); 200 + __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); 201 + __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); 202 + aux += sizeof(*esr_ctx); 203 + } 189 204 190 205 /* set the "end" magic */ 191 - __put_user_error(0, &aux->end.magic, err); 192 - __put_user_error(0, &aux->end.size, err); 206 + end = aux; 207 + __put_user_error(0, &end->magic, err); 208 + __put_user_error(0, &end->size, err); 193 209 194 210 return err; 195 211 } ··· 430 416 clear_thread_flag(TIF_NOTIFY_RESUME); 431 417 tracehook_notify_resume(regs); 432 418 } 419 + 420 + if (thread_flags & _TIF_FOREIGN_FPSTATE) 421 + fpsimd_restore_current_state(); 422 + 433 423 }
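
setup_sigframe() above now lays out the __reserved area as a chain of records: the FPSIMD context first, an esr_context whenever fault_code is set, then a terminator whose magic and size are both zero. A userspace sketch of how a SIGSEGV/SIGBUS handler (an emulator such as Qemu, for instance) could walk that chain to recover the ESR; the structures below merely mirror the uapi sigcontext records and the ESR_MAGIC value is an assumption taken from that header:

#include <signal.h>
#include <stdio.h>
#include <ucontext.h>
#include <unistd.h>

/* Local mirrors of the uapi records (see asm/sigcontext.h). */
struct ctx_head   { unsigned int magic, size; };
struct esr_record { struct ctx_head head; unsigned long long esr; };
#define ESR_MAGIC 0x45535201u   /* assumed value from the uapi header */

static void segv_handler(int sig, siginfo_t *info, void *ucontext)
{
        ucontext_t *uc = ucontext;
        unsigned char *aux = (unsigned char *)uc->uc_mcontext.__reserved;
        struct ctx_head *head = (struct ctx_head *)aux;

        /* Walk the records laid out by setup_sigframe() until the
         * zero magic/size terminator. */
        while (head->magic && head->size) {
                if (head->magic == ESR_MAGIC) {
                        struct esr_record *esr = (struct esr_record *)head;
                        fprintf(stderr, "fault ESR: 0x%llx\n", esr->esr);
                        break;
                }
                aux += head->size;
                head = (struct ctx_head *)aux;
        }
        _exit(128 + sig);
}

A real handler would be installed with sigaction() and SA_SIGINFO, which is what makes the third argument a ucontext_t.
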
+9 -7
arch/arm64/kernel/signal32.c
··· 23 23 #include <linux/syscalls.h> 24 24 #include <linux/ratelimit.h> 25 25 26 + #include <asm/esr.h> 26 27 #include <asm/fpsimd.h> 27 28 #include <asm/signal32.h> 28 29 #include <asm/uaccess.h> ··· 81 80 82 81 #define VFP_MAGIC 0x56465001 83 82 #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) 83 + 84 + #define FSR_WRITE_SHIFT (11) 84 85 85 86 struct compat_aux_sigframe { 86 87 struct compat_vfp_sigframe vfp; ··· 222 219 * Note that this also saves V16-31, which aren't visible 223 220 * in AArch32. 224 221 */ 225 - fpsimd_save_state(fpsimd); 222 + fpsimd_preserve_current_state(); 226 223 227 224 /* Place structure header on the stack */ 228 225 __put_user_error(magic, &frame->magic, err); ··· 285 282 * We don't need to touch the exception register, so 286 283 * reload the hardware state. 287 284 */ 288 - if (!err) { 289 - preempt_disable(); 290 - fpsimd_load_state(&fpsimd); 291 - preempt_enable(); 292 - } 285 + if (!err) 286 + fpsimd_update_current_state(&fpsimd); 293 287 294 288 return err ? -EFAULT : 0; 295 289 } ··· 500 500 __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); 501 501 502 502 __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); 503 - __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); 503 + /* set the compat FSR WnR */ 504 + __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) << 505 + FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err); 504 506 __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); 505 507 __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); 506 508
+19
arch/arm64/kernel/smp.c
··· 35 35 #include <linux/clockchips.h> 36 36 #include <linux/completion.h> 37 37 #include <linux/of.h> 38 + #include <linux/irq_work.h> 38 39 39 40 #include <asm/atomic.h> 40 41 #include <asm/cacheflush.h> ··· 63 62 IPI_CALL_FUNC_SINGLE, 64 63 IPI_CPU_STOP, 65 64 IPI_TIMER, 65 + IPI_IRQ_WORK, 66 66 }; 67 67 68 68 /* ··· 479 477 smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); 480 478 } 481 479 480 + #ifdef CONFIG_IRQ_WORK 481 + void arch_irq_work_raise(void) 482 + { 483 + if (smp_cross_call) 484 + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); 485 + } 486 + #endif 487 + 482 488 static const char *ipi_types[NR_IPI] = { 483 489 #define S(x,s) [x - IPI_RESCHEDULE] = s 484 490 S(IPI_RESCHEDULE, "Rescheduling interrupts"), ··· 494 484 S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), 495 485 S(IPI_CPU_STOP, "CPU stop interrupts"), 496 486 S(IPI_TIMER, "Timer broadcast interrupts"), 487 + S(IPI_IRQ_WORK, "IRQ work interrupts"), 497 488 }; 498 489 499 490 void show_ipi_list(struct seq_file *p, int prec) ··· 583 572 case IPI_TIMER: 584 573 irq_enter(); 585 574 tick_receive_broadcast(); 575 + irq_exit(); 576 + break; 577 + #endif 578 + 579 + #ifdef CONFIG_IRQ_WORK 580 + case IPI_IRQ_WORK: 581 + irq_enter(); 582 + irq_work_run(); 586 583 irq_exit(); 587 584 break; 588 585 #endif
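
With arch_irq_work_raise() wired to a self-IPI, generic irq_work users get their callbacks run promptly from the new IPI_IRQ_WORK handler instead of waiting for the next tick. A minimal, hypothetical user of that path (the names here are invented for illustration):

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

/* Runs in hard interrupt context, from the IPI_IRQ_WORK case above. */
static void example_irq_work_fn(struct irq_work *work)
{
        pr_info("irq_work ran on CPU%d\n", smp_processor_id());
}

static struct irq_work example_work = { .func = example_irq_work_fn };

static void kick_example_work(void)
{
        /* Queues the work and, if it was not already pending, raises
         * IPI_IRQ_WORK on this CPU via arch_irq_work_raise(). */
        irq_work_queue(&example_work);
}
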
+1 -38
arch/arm64/kernel/smp_spin_table.c
··· 30 30 volatile unsigned long secondary_holding_pen_release = INVALID_HWID; 31 31 32 32 static phys_addr_t cpu_release_addr[NR_CPUS]; 33 - static DEFINE_RAW_SPINLOCK(boot_lock); 34 33 35 34 /* 36 35 * Write secondary_holding_pen_release in a way that is guaranteed to be ··· 93 94 94 95 static int smp_spin_table_cpu_boot(unsigned int cpu) 95 96 { 96 - unsigned long timeout; 97 - 98 - /* 99 - * Set synchronisation state between this boot processor 100 - * and the secondary one 101 - */ 102 - raw_spin_lock(&boot_lock); 103 - 104 97 /* 105 98 * Update the pen release flag. 106 99 */ ··· 103 112 */ 104 113 sev(); 105 114 106 - timeout = jiffies + (1 * HZ); 107 - while (time_before(jiffies, timeout)) { 108 - if (secondary_holding_pen_release == INVALID_HWID) 109 - break; 110 - udelay(10); 111 - } 112 - 113 - /* 114 - * Now the secondary core is starting up let it run its 115 - * calibrations, then wait for it to finish 116 - */ 117 - raw_spin_unlock(&boot_lock); 118 - 119 - return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0; 120 - } 121 - 122 - static void smp_spin_table_cpu_postboot(void) 123 - { 124 - /* 125 - * Let the primary processor know we're out of the pen. 126 - */ 127 - write_pen_release(INVALID_HWID); 128 - 129 - /* 130 - * Synchronise with the boot thread. 131 - */ 132 - raw_spin_lock(&boot_lock); 133 - raw_spin_unlock(&boot_lock); 115 + return 0; 134 116 } 135 117 136 118 const struct cpu_operations smp_spin_table_ops = { ··· 111 147 .cpu_init = smp_spin_table_cpu_init, 112 148 .cpu_prepare = smp_spin_table_cpu_prepare, 113 149 .cpu_boot = smp_spin_table_cpu_boot, 114 - .cpu_postboot = smp_spin_table_cpu_postboot, 115 150 };
+1 -1
arch/arm64/kernel/stacktrace.c
··· 35 35 * ldp x29, x30, [sp] 36 36 * add sp, sp, #0x10 37 37 */ 38 - int unwind_frame(struct stackframe *frame) 38 + int notrace unwind_frame(struct stackframe *frame) 39 39 { 40 40 unsigned long high, low; 41 41 unsigned long fp = frame->fp;
+3
arch/arm64/kernel/time.c
··· 18 18 * along with this program. If not, see <http://www.gnu.org/licenses/>. 19 19 */ 20 20 21 + #include <linux/clockchips.h> 21 22 #include <linux/export.h> 22 23 #include <linux/kernel.h> 23 24 #include <linux/interrupt.h> ··· 69 68 70 69 of_clk_init(NULL); 71 70 clocksource_of_init(); 71 + 72 + tick_setup_hrtimer_broadcast(); 72 73 73 74 arch_timer_rate = arch_timer_get_rate(); 74 75 if (!arch_timer_rate)
+200 -12
arch/arm64/kernel/topology.c
··· 17 17 #include <linux/percpu.h> 18 18 #include <linux/node.h> 19 19 #include <linux/nodemask.h> 20 + #include <linux/of.h> 20 21 #include <linux/sched.h> 21 22 22 23 #include <asm/topology.h> 24 + 25 + static int __init get_cpu_for_node(struct device_node *node) 26 + { 27 + struct device_node *cpu_node; 28 + int cpu; 29 + 30 + cpu_node = of_parse_phandle(node, "cpu", 0); 31 + if (!cpu_node) 32 + return -1; 33 + 34 + for_each_possible_cpu(cpu) { 35 + if (of_get_cpu_node(cpu, NULL) == cpu_node) { 36 + of_node_put(cpu_node); 37 + return cpu; 38 + } 39 + } 40 + 41 + pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); 42 + 43 + of_node_put(cpu_node); 44 + return -1; 45 + } 46 + 47 + static int __init parse_core(struct device_node *core, int cluster_id, 48 + int core_id) 49 + { 50 + char name[10]; 51 + bool leaf = true; 52 + int i = 0; 53 + int cpu; 54 + struct device_node *t; 55 + 56 + do { 57 + snprintf(name, sizeof(name), "thread%d", i); 58 + t = of_get_child_by_name(core, name); 59 + if (t) { 60 + leaf = false; 61 + cpu = get_cpu_for_node(t); 62 + if (cpu >= 0) { 63 + cpu_topology[cpu].cluster_id = cluster_id; 64 + cpu_topology[cpu].core_id = core_id; 65 + cpu_topology[cpu].thread_id = i; 66 + } else { 67 + pr_err("%s: Can't get CPU for thread\n", 68 + t->full_name); 69 + of_node_put(t); 70 + return -EINVAL; 71 + } 72 + of_node_put(t); 73 + } 74 + i++; 75 + } while (t); 76 + 77 + cpu = get_cpu_for_node(core); 78 + if (cpu >= 0) { 79 + if (!leaf) { 80 + pr_err("%s: Core has both threads and CPU\n", 81 + core->full_name); 82 + return -EINVAL; 83 + } 84 + 85 + cpu_topology[cpu].cluster_id = cluster_id; 86 + cpu_topology[cpu].core_id = core_id; 87 + } else if (leaf) { 88 + pr_err("%s: Can't get CPU for leaf core\n", core->full_name); 89 + return -EINVAL; 90 + } 91 + 92 + return 0; 93 + } 94 + 95 + static int __init parse_cluster(struct device_node *cluster, int depth) 96 + { 97 + char name[10]; 98 + bool leaf = true; 99 + bool has_cores = false; 100 + struct device_node *c; 101 + static int cluster_id __initdata; 102 + int core_id = 0; 103 + int i, ret; 104 + 105 + /* 106 + * First check for child clusters; we currently ignore any 107 + * information about the nesting of clusters and present the 108 + * scheduler with a flat list of them. 
109 + */ 110 + i = 0; 111 + do { 112 + snprintf(name, sizeof(name), "cluster%d", i); 113 + c = of_get_child_by_name(cluster, name); 114 + if (c) { 115 + leaf = false; 116 + ret = parse_cluster(c, depth + 1); 117 + of_node_put(c); 118 + if (ret != 0) 119 + return ret; 120 + } 121 + i++; 122 + } while (c); 123 + 124 + /* Now check for cores */ 125 + i = 0; 126 + do { 127 + snprintf(name, sizeof(name), "core%d", i); 128 + c = of_get_child_by_name(cluster, name); 129 + if (c) { 130 + has_cores = true; 131 + 132 + if (depth == 0) { 133 + pr_err("%s: cpu-map children should be clusters\n", 134 + c->full_name); 135 + of_node_put(c); 136 + return -EINVAL; 137 + } 138 + 139 + if (leaf) { 140 + ret = parse_core(c, cluster_id, core_id++); 141 + } else { 142 + pr_err("%s: Non-leaf cluster with core %s\n", 143 + cluster->full_name, name); 144 + ret = -EINVAL; 145 + } 146 + 147 + of_node_put(c); 148 + if (ret != 0) 149 + return ret; 150 + } 151 + i++; 152 + } while (c); 153 + 154 + if (leaf && !has_cores) 155 + pr_warn("%s: empty cluster\n", cluster->full_name); 156 + 157 + if (leaf) 158 + cluster_id++; 159 + 160 + return 0; 161 + } 162 + 163 + static int __init parse_dt_topology(void) 164 + { 165 + struct device_node *cn, *map; 166 + int ret = 0; 167 + int cpu; 168 + 169 + cn = of_find_node_by_path("/cpus"); 170 + if (!cn) { 171 + pr_err("No CPU information found in DT\n"); 172 + return 0; 173 + } 174 + 175 + /* 176 + * When topology is provided cpu-map is essentially a root 177 + * cluster with restricted subnodes. 178 + */ 179 + map = of_get_child_by_name(cn, "cpu-map"); 180 + if (!map) 181 + goto out; 182 + 183 + ret = parse_cluster(map, 0); 184 + if (ret != 0) 185 + goto out_map; 186 + 187 + /* 188 + * Check that all cores are in the topology; the SMP code will 189 + * only mark cores described in the DT as possible. 190 + */ 191 + for_each_possible_cpu(cpu) { 192 + if (cpu_topology[cpu].cluster_id == -1) { 193 + pr_err("CPU%d: No topology information specified\n", 194 + cpu); 195 + ret = -EINVAL; 196 + } 197 + } 198 + 199 + out_map: 200 + of_node_put(map); 201 + out: 202 + of_node_put(cn); 203 + return ret; 204 + } 23 205 24 206 /* 25 207 * cpu topology table ··· 221 39 222 40 if (cpuid_topo->cluster_id == -1) { 223 41 /* 224 - * DT does not contain topology information for this cpu 225 - * reset it to default behaviour 42 + * DT does not contain topology information for this cpu. 
226 43 */ 227 44 pr_debug("CPU%u: No topology information configured\n", cpuid); 228 - cpuid_topo->core_id = 0; 229 - cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling); 230 - cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling); 231 45 return; 232 46 } 233 47 ··· 252 74 update_siblings_masks(cpuid); 253 75 } 254 76 255 - /* 256 - * init_cpu_topology is called at boot when only one cpu is running 257 - * which prevent simultaneous write access to cpu_topology array 258 - */ 259 - void __init init_cpu_topology(void) 77 + static void __init reset_cpu_topology(void) 260 78 { 261 79 unsigned int cpu; 262 80 263 - /* init core mask and power*/ 264 81 for_each_possible_cpu(cpu) { 265 82 struct cpu_topology *cpu_topo = &cpu_topology[cpu]; 266 83 267 84 cpu_topo->thread_id = -1; 268 - cpu_topo->core_id = -1; 85 + cpu_topo->core_id = 0; 269 86 cpu_topo->cluster_id = -1; 87 + 270 88 cpumask_clear(&cpu_topo->core_sibling); 89 + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); 271 90 cpumask_clear(&cpu_topo->thread_sibling); 91 + cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); 272 92 } 93 + } 94 + 95 + void __init init_cpu_topology(void) 96 + { 97 + reset_cpu_topology(); 98 + 99 + /* 100 + * Discard anything that was parsed if we hit an error so we 101 + * don't use partial information. 102 + */ 103 + if (parse_dt_topology()) 104 + reset_cpu_topology(); 273 105 }
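
Once parse_dt_topology() and store_cpu_topology() have run, the parsed identifiers live in cpu_topology[]. A throwaway debug sketch (the function name is made up) that dumps what was parsed, using only the fields visible in this file:

#include <linux/printk.h>
#include <linux/smp.h>
#include <asm/topology.h>

static void __init dump_cpu_topology(void)
{
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                struct cpu_topology *t = &cpu_topology[cpu];

                /* cluster_id stays -1 for CPUs the DT cpu-map omitted. */
                pr_info("CPU%u: cluster %d core %d thread %d\n",
                        cpu, t->cluster_id, t->core_id, t->thread_id);
        }
}
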
+5 -2
arch/arm64/kernel/traps.c
··· 251 251 void arm64_notify_die(const char *str, struct pt_regs *regs, 252 252 struct siginfo *info, int err) 253 253 { 254 - if (user_mode(regs)) 254 + if (user_mode(regs)) { 255 + current->thread.fault_address = 0; 256 + current->thread.fault_code = err; 255 257 force_sig_info(info->si_signo, info, current); 256 - else 258 + } else { 257 259 die(str, regs, err); 260 + } 258 261 } 259 262 260 263 asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
+1 -1
arch/arm64/kernel/vmlinux.lds.S
··· 13 13 #define ARM_EXIT_DISCARD(x) x 14 14 15 15 OUTPUT_ARCH(aarch64) 16 - ENTRY(stext) 16 + ENTRY(_text) 17 17 18 18 jiffies = jiffies_64; 19 19
+9 -3
arch/arm64/kvm/hyp.S
··· 630 630 * whole of Stage-1. Weep... 631 631 */ 632 632 tlbi ipas2e1is, x1 633 - dsb sy 633 + /* 634 + * We have to ensure completion of the invalidation at Stage-2, 635 + * since a table walk on another CPU could refill a TLB with a 636 + * complete (S1 + S2) walk based on the old Stage-2 mapping if 637 + * the Stage-1 invalidation happened first. 638 + */ 639 + dsb ish 634 640 tlbi vmalle1is 635 - dsb sy 641 + dsb ish 636 642 isb 637 643 638 644 msr vttbr_el2, xzr ··· 649 643 dsb ishst 650 644 tlbi alle1is 651 645 ic ialluis 652 - dsb sy 646 + dsb ish 653 647 ret 654 648 ENDPROC(__kvm_flush_vm_context) 655 649
+2 -2
arch/arm64/kvm/sys_regs.c
··· 71 71 static void do_dc_cisw(u32 val) 72 72 { 73 73 asm volatile("dc cisw, %x0" : : "r" (val)); 74 - dsb(); 74 + dsb(ish); 75 75 } 76 76 77 77 static void do_dc_csw(u32 val) 78 78 { 79 79 asm volatile("dc csw, %x0" : : "r" (val)); 80 - dsb(); 80 + dsb(ish); 81 81 } 82 82 83 83 /* See note at ARM ARM B1.14.4 */
+1
arch/arm64/lib/Makefile
··· 1 1 lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ 2 2 copy_to_user.o copy_in_user.o copy_page.o \ 3 3 clear_page.o memchr.o memcpy.o memmove.o memset.o \ 4 + memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ 4 5 strchr.o strrchr.o
+258
arch/arm64/lib/memcmp.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * compare memory areas(when two memory areas' offset are different, 30 + * alignment handled by the hardware) 31 + * 32 + * Parameters: 33 + * x0 - const memory area 1 pointer 34 + * x1 - const memory area 2 pointer 35 + * x2 - the maximal compare byte length 36 + * Returns: 37 + * x0 - a compare result, maybe less than, equal to, or greater than ZERO 38 + */ 39 + 40 + /* Parameters and result. */ 41 + src1 .req x0 42 + src2 .req x1 43 + limit .req x2 44 + result .req x0 45 + 46 + /* Internal variables. */ 47 + data1 .req x3 48 + data1w .req w3 49 + data2 .req x4 50 + data2w .req w4 51 + has_nul .req x5 52 + diff .req x6 53 + endloop .req x7 54 + tmp1 .req x8 55 + tmp2 .req x9 56 + tmp3 .req x10 57 + pos .req x11 58 + limit_wd .req x12 59 + mask .req x13 60 + 61 + ENTRY(memcmp) 62 + cbz limit, .Lret0 63 + eor tmp1, src1, src2 64 + tst tmp1, #7 65 + b.ne .Lmisaligned8 66 + ands tmp1, src1, #7 67 + b.ne .Lmutual_align 68 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 69 + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 70 + /* 71 + * The input source addresses are at alignment boundary. 72 + * Directly compare eight bytes each time. 73 + */ 74 + .Lloop_aligned: 75 + ldr data1, [src1], #8 76 + ldr data2, [src2], #8 77 + .Lstart_realigned: 78 + subs limit_wd, limit_wd, #1 79 + eor diff, data1, data2 /* Non-zero if differences found. */ 80 + csinv endloop, diff, xzr, cs /* Last Dword or differences. */ 81 + cbz endloop, .Lloop_aligned 82 + 83 + /* Not reached the limit, must have found a diff. */ 84 + tbz limit_wd, #63, .Lnot_limit 85 + 86 + /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ 87 + ands limit, limit, #7 88 + b.eq .Lnot_limit 89 + /* 90 + * The remained bytes less than 8. It is needed to extract valid data 91 + * from last eight bytes of the intended memory range. 92 + */ 93 + lsl limit, limit, #3 /* bytes-> bits. */ 94 + mov mask, #~0 95 + CPU_BE( lsr mask, mask, limit ) 96 + CPU_LE( lsl mask, mask, limit ) 97 + bic data1, data1, mask 98 + bic data2, data2, mask 99 + 100 + orr diff, diff, mask 101 + b .Lnot_limit 102 + 103 + .Lmutual_align: 104 + /* 105 + * Sources are mutually aligned, but are not currently at an 106 + * alignment boundary. Round down the addresses and then mask off 107 + * the bytes that precede the start point. 
108 + */ 109 + bic src1, src1, #7 110 + bic src2, src2, #7 111 + ldr data1, [src1], #8 112 + ldr data2, [src2], #8 113 + /* 114 + * We can not add limit with alignment offset(tmp1) here. Since the 115 + * addition probably make the limit overflown. 116 + */ 117 + sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ 118 + and tmp3, limit_wd, #7 119 + lsr limit_wd, limit_wd, #3 120 + add tmp3, tmp3, tmp1 121 + add limit_wd, limit_wd, tmp3, lsr #3 122 + add limit, limit, tmp1/* Adjust the limit for the extra. */ 123 + 124 + lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ 125 + neg tmp1, tmp1/* Bits to alignment -64. */ 126 + mov tmp2, #~0 127 + /*mask off the non-intended bytes before the start address.*/ 128 + CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ 129 + /* Little-endian. Early bytes are at LSB. */ 130 + CPU_LE( lsr tmp2, tmp2, tmp1 ) 131 + 132 + orr data1, data1, tmp2 133 + orr data2, data2, tmp2 134 + b .Lstart_realigned 135 + 136 + /*src1 and src2 have different alignment offset.*/ 137 + .Lmisaligned8: 138 + cmp limit, #8 139 + b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ 140 + 141 + and tmp1, src1, #7 142 + neg tmp1, tmp1 143 + add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ 144 + and tmp2, src2, #7 145 + neg tmp2, tmp2 146 + add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ 147 + subs tmp3, tmp1, tmp2 148 + csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ 149 + 150 + sub limit, limit, pos 151 + /*compare the proceeding bytes in the first 8 byte segment.*/ 152 + .Ltinycmp: 153 + ldrb data1w, [src1], #1 154 + ldrb data2w, [src2], #1 155 + subs pos, pos, #1 156 + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ 157 + b.eq .Ltinycmp 158 + cbnz pos, 1f /*diff occurred before the last byte.*/ 159 + cmp data1w, data2w 160 + b.eq .Lstart_align 161 + 1: 162 + sub result, data1, data2 163 + ret 164 + 165 + .Lstart_align: 166 + lsr limit_wd, limit, #3 167 + cbz limit_wd, .Lremain8 168 + 169 + ands xzr, src1, #7 170 + b.eq .Lrecal_offset 171 + /*process more leading bytes to make src1 aligned...*/ 172 + add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ 173 + add src2, src2, tmp3 174 + sub limit, limit, tmp3 175 + lsr limit_wd, limit, #3 176 + cbz limit_wd, .Lremain8 177 + /*load 8 bytes from aligned SRC1..*/ 178 + ldr data1, [src1], #8 179 + ldr data2, [src2], #8 180 + 181 + subs limit_wd, limit_wd, #1 182 + eor diff, data1, data2 /*Non-zero if differences found.*/ 183 + csinv endloop, diff, xzr, ne 184 + cbnz endloop, .Lunequal_proc 185 + /*How far is the current SRC2 from the alignment boundary...*/ 186 + and tmp3, tmp3, #7 187 + 188 + .Lrecal_offset:/*src1 is aligned now..*/ 189 + neg pos, tmp3 190 + .Lloopcmp_proc: 191 + /* 192 + * Divide the eight bytes into two parts. First,backwards the src2 193 + * to an alignment boundary,load eight bytes and compare from 194 + * the SRC2 alignment boundary. If all 8 bytes are equal,then start 195 + * the second part's comparison. Otherwise finish the comparison. 196 + * This special handle can garantee all the accesses are in the 197 + * thread/task space in avoid to overrange access. 198 + */ 199 + ldr data1, [src1,pos] 200 + ldr data2, [src2,pos] 201 + eor diff, data1, data2 /* Non-zero if differences found. */ 202 + cbnz diff, .Lnot_limit 203 + 204 + /*The second part process*/ 205 + ldr data1, [src1], #8 206 + ldr data2, [src2], #8 207 + eor diff, data1, data2 /* Non-zero if differences found. 
*/ 208 + subs limit_wd, limit_wd, #1 209 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ 210 + cbz endloop, .Lloopcmp_proc 211 + .Lunequal_proc: 212 + cbz diff, .Lremain8 213 + 214 + /*There is differnence occured in the latest comparison.*/ 215 + .Lnot_limit: 216 + /* 217 + * For little endian,reverse the low significant equal bits into MSB,then 218 + * following CLZ can find how many equal bits exist. 219 + */ 220 + CPU_LE( rev diff, diff ) 221 + CPU_LE( rev data1, data1 ) 222 + CPU_LE( rev data2, data2 ) 223 + 224 + /* 225 + * The MS-non-zero bit of DIFF marks either the first bit 226 + * that is different, or the end of the significant data. 227 + * Shifting left now will bring the critical information into the 228 + * top bits. 229 + */ 230 + clz pos, diff 231 + lsl data1, data1, pos 232 + lsl data2, data2, pos 233 + /* 234 + * We need to zero-extend (char is unsigned) the value and then 235 + * perform a signed subtraction. 236 + */ 237 + lsr data1, data1, #56 238 + sub result, data1, data2, lsr #56 239 + ret 240 + 241 + .Lremain8: 242 + /* Limit % 8 == 0 =>. all data are equal.*/ 243 + ands limit, limit, #7 244 + b.eq .Lret0 245 + 246 + .Ltiny8proc: 247 + ldrb data1w, [src1], #1 248 + ldrb data2w, [src2], #1 249 + subs limit, limit, #1 250 + 251 + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ 252 + b.eq .Ltiny8proc 253 + sub result, data1, data2 254 + ret 255 + .Lret0: 256 + mov result, #0 257 + ret 258 + ENDPROC(memcmp)
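
The .Lnot_limit epilogue above locates the first differing byte with a byte-reverse, CLZ and shift, then subtracts the zero-extended top bytes. The same arithmetic in C for the little-endian case, purely as an illustration of why the result has the right sign; the GCC builtins stand in for the rev and clz instructions:

#include <stdint.h>

/* Given two 8-byte words known to differ, return a memcmp-style result
 * for the lowest-addressed differing byte (little-endian layout). */
static int first_diff_result(uint64_t data1, uint64_t data2)
{
        uint64_t diff = data1 ^ data2;                       /* eor diff, data1, data2 */
        int pos = __builtin_clzll(__builtin_bswap64(diff));  /* rev + clz              */

        /* Shift the first differing bit up to bit 63 of each operand ... */
        data1 = __builtin_bswap64(data1) << pos;
        data2 = __builtin_bswap64(data2) << pos;

        /* ... then compare the top bytes, zero-extended ("lsr #56"). */
        return (int)(data1 >> 56) - (int)(data2 >> 56);
}
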
+170 -22
arch/arm64/lib/memcpy.S
··· 1 1 /* 2 2 * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 3 11 * 4 12 * This program is free software; you can redistribute it and/or modify 5 13 * it under the terms of the GNU General Public License version 2 as ··· 24 16 25 17 #include <linux/linkage.h> 26 18 #include <asm/assembler.h> 19 + #include <asm/cache.h> 27 20 28 21 /* 29 22 * Copy a buffer from src to dest (alignment handled by the hardware) ··· 36 27 * Returns: 37 28 * x0 - dest 38 29 */ 30 + dstin .req x0 31 + src .req x1 32 + count .req x2 33 + tmp1 .req x3 34 + tmp1w .req w3 35 + tmp2 .req x4 36 + tmp2w .req w4 37 + tmp3 .req x5 38 + tmp3w .req w5 39 + dst .req x6 40 + 41 + A_l .req x7 42 + A_h .req x8 43 + B_l .req x9 44 + B_h .req x10 45 + C_l .req x11 46 + C_h .req x12 47 + D_l .req x13 48 + D_h .req x14 49 + 39 50 ENTRY(memcpy) 40 - mov x4, x0 41 - subs x2, x2, #8 42 - b.mi 2f 43 - 1: ldr x3, [x1], #8 44 - subs x2, x2, #8 45 - str x3, [x4], #8 46 - b.pl 1b 47 - 2: adds x2, x2, #4 48 - b.mi 3f 49 - ldr w3, [x1], #4 50 - sub x2, x2, #4 51 - str w3, [x4], #4 52 - 3: adds x2, x2, #2 53 - b.mi 4f 54 - ldrh w3, [x1], #2 55 - sub x2, x2, #2 56 - strh w3, [x4], #2 57 - 4: adds x2, x2, #1 58 - b.mi 5f 59 - ldrb w3, [x1] 60 - strb w3, [x4] 61 - 5: ret 51 + mov dst, dstin 52 + cmp count, #16 53 + /*When memory length is less than 16, the accessed are not aligned.*/ 54 + b.lo .Ltiny15 55 + 56 + neg tmp2, src 57 + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ 58 + b.eq .LSrcAligned 59 + sub count, count, tmp2 60 + /* 61 + * Copy the leading memory data from src to dst in an increasing 62 + * address order.By this way,the risk of overwritting the source 63 + * memory data is eliminated when the distance between src and 64 + * dst is less than 16. The memory accesses here are alignment. 65 + */ 66 + tbz tmp2, #0, 1f 67 + ldrb tmp1w, [src], #1 68 + strb tmp1w, [dst], #1 69 + 1: 70 + tbz tmp2, #1, 2f 71 + ldrh tmp1w, [src], #2 72 + strh tmp1w, [dst], #2 73 + 2: 74 + tbz tmp2, #2, 3f 75 + ldr tmp1w, [src], #4 76 + str tmp1w, [dst], #4 77 + 3: 78 + tbz tmp2, #3, .LSrcAligned 79 + ldr tmp1, [src],#8 80 + str tmp1, [dst],#8 81 + 82 + .LSrcAligned: 83 + cmp count, #64 84 + b.ge .Lcpy_over64 85 + /* 86 + * Deal with small copies quickly by dropping straight into the 87 + * exit block. 88 + */ 89 + .Ltail63: 90 + /* 91 + * Copy up to 48 bytes of data. At this point we only need the 92 + * bottom 6 bits of count to be accurate. 93 + */ 94 + ands tmp1, count, #0x30 95 + b.eq .Ltiny15 96 + cmp tmp1w, #0x20 97 + b.eq 1f 98 + b.lt 2f 99 + ldp A_l, A_h, [src], #16 100 + stp A_l, A_h, [dst], #16 101 + 1: 102 + ldp A_l, A_h, [src], #16 103 + stp A_l, A_h, [dst], #16 104 + 2: 105 + ldp A_l, A_h, [src], #16 106 + stp A_l, A_h, [dst], #16 107 + .Ltiny15: 108 + /* 109 + * Prefer to break one ldp/stp into several load/store to access 110 + * memory in an increasing address order,rather than to load/store 16 111 + * bytes from (src-16) to (dst-16) and to backward the src to aligned 112 + * address,which way is used in original cortex memcpy. 
If keeping 113 + * the original memcpy process here, memmove need to satisfy the 114 + * precondition that src address is at least 16 bytes bigger than dst 115 + * address,otherwise some source data will be overwritten when memove 116 + * call memcpy directly. To make memmove simpler and decouple the 117 + * memcpy's dependency on memmove, withdrew the original process. 118 + */ 119 + tbz count, #3, 1f 120 + ldr tmp1, [src], #8 121 + str tmp1, [dst], #8 122 + 1: 123 + tbz count, #2, 2f 124 + ldr tmp1w, [src], #4 125 + str tmp1w, [dst], #4 126 + 2: 127 + tbz count, #1, 3f 128 + ldrh tmp1w, [src], #2 129 + strh tmp1w, [dst], #2 130 + 3: 131 + tbz count, #0, .Lexitfunc 132 + ldrb tmp1w, [src] 133 + strb tmp1w, [dst] 134 + 135 + .Lexitfunc: 136 + ret 137 + 138 + .Lcpy_over64: 139 + subs count, count, #128 140 + b.ge .Lcpy_body_large 141 + /* 142 + * Less than 128 bytes to copy, so handle 64 here and then jump 143 + * to the tail. 144 + */ 145 + ldp A_l, A_h, [src],#16 146 + stp A_l, A_h, [dst],#16 147 + ldp B_l, B_h, [src],#16 148 + ldp C_l, C_h, [src],#16 149 + stp B_l, B_h, [dst],#16 150 + stp C_l, C_h, [dst],#16 151 + ldp D_l, D_h, [src],#16 152 + stp D_l, D_h, [dst],#16 153 + 154 + tst count, #0x3f 155 + b.ne .Ltail63 156 + ret 157 + 158 + /* 159 + * Critical loop. Start at a new cache line boundary. Assuming 160 + * 64 bytes per line this ensures the entire loop is in one line. 161 + */ 162 + .p2align L1_CACHE_SHIFT 163 + .Lcpy_body_large: 164 + /* pre-get 64 bytes data. */ 165 + ldp A_l, A_h, [src],#16 166 + ldp B_l, B_h, [src],#16 167 + ldp C_l, C_h, [src],#16 168 + ldp D_l, D_h, [src],#16 169 + 1: 170 + /* 171 + * interlace the load of next 64 bytes data block with store of the last 172 + * loaded 64 bytes data. 173 + */ 174 + stp A_l, A_h, [dst],#16 175 + ldp A_l, A_h, [src],#16 176 + stp B_l, B_h, [dst],#16 177 + ldp B_l, B_h, [src],#16 178 + stp C_l, C_h, [dst],#16 179 + ldp C_l, C_h, [src],#16 180 + stp D_l, D_h, [dst],#16 181 + ldp D_l, D_h, [src],#16 182 + subs count, count, #64 183 + b.ge 1b 184 + stp A_l, A_h, [dst],#16 185 + stp B_l, B_h, [dst],#16 186 + stp C_l, C_h, [dst],#16 187 + stp D_l, D_h, [dst],#16 188 + 189 + tst count, #0x3f 190 + b.ne .Ltail63 191 + ret 62 192 ENDPROC(memcpy)
+165 -25
arch/arm64/lib/memmove.S
··· 1 1 /* 2 2 * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 3 11 * 4 12 * This program is free software; you can redistribute it and/or modify 5 13 * it under the terms of the GNU General Public License version 2 as ··· 24 16 25 17 #include <linux/linkage.h> 26 18 #include <asm/assembler.h> 19 + #include <asm/cache.h> 27 20 28 21 /* 29 22 * Move a buffer from src to test (alignment handled by the hardware). ··· 37 28 * Returns: 38 29 * x0 - dest 39 30 */ 31 + dstin .req x0 32 + src .req x1 33 + count .req x2 34 + tmp1 .req x3 35 + tmp1w .req w3 36 + tmp2 .req x4 37 + tmp2w .req w4 38 + tmp3 .req x5 39 + tmp3w .req w5 40 + dst .req x6 41 + 42 + A_l .req x7 43 + A_h .req x8 44 + B_l .req x9 45 + B_h .req x10 46 + C_l .req x11 47 + C_h .req x12 48 + D_l .req x13 49 + D_h .req x14 50 + 40 51 ENTRY(memmove) 41 - cmp x0, x1 42 - b.ls memcpy 43 - add x4, x0, x2 44 - add x1, x1, x2 45 - subs x2, x2, #8 46 - b.mi 2f 47 - 1: ldr x3, [x1, #-8]! 48 - subs x2, x2, #8 49 - str x3, [x4, #-8]! 50 - b.pl 1b 51 - 2: adds x2, x2, #4 52 - b.mi 3f 53 - ldr w3, [x1, #-4]! 54 - sub x2, x2, #4 55 - str w3, [x4, #-4]! 56 - 3: adds x2, x2, #2 57 - b.mi 4f 58 - ldrh w3, [x1, #-2]! 59 - sub x2, x2, #2 60 - strh w3, [x4, #-2]! 61 - 4: adds x2, x2, #1 62 - b.mi 5f 63 - ldrb w3, [x1, #-1] 64 - strb w3, [x4, #-1] 65 - 5: ret 52 + cmp dstin, src 53 + b.lo memcpy 54 + add tmp1, src, count 55 + cmp dstin, tmp1 56 + b.hs memcpy /* No overlap. */ 57 + 58 + add dst, dstin, count 59 + add src, src, count 60 + cmp count, #16 61 + b.lo .Ltail15 /*probably non-alignment accesses.*/ 62 + 63 + ands tmp2, src, #15 /* Bytes to reach alignment. */ 64 + b.eq .LSrcAligned 65 + sub count, count, tmp2 66 + /* 67 + * process the aligned offset length to make the src aligned firstly. 68 + * those extra instructions' cost is acceptable. It also make the 69 + * coming accesses are based on aligned address. 70 + */ 71 + tbz tmp2, #0, 1f 72 + ldrb tmp1w, [src, #-1]! 73 + strb tmp1w, [dst, #-1]! 74 + 1: 75 + tbz tmp2, #1, 2f 76 + ldrh tmp1w, [src, #-2]! 77 + strh tmp1w, [dst, #-2]! 78 + 2: 79 + tbz tmp2, #2, 3f 80 + ldr tmp1w, [src, #-4]! 81 + str tmp1w, [dst, #-4]! 82 + 3: 83 + tbz tmp2, #3, .LSrcAligned 84 + ldr tmp1, [src, #-8]! 85 + str tmp1, [dst, #-8]! 86 + 87 + .LSrcAligned: 88 + cmp count, #64 89 + b.ge .Lcpy_over64 90 + 91 + /* 92 + * Deal with small copies quickly by dropping straight into the 93 + * exit block. 94 + */ 95 + .Ltail63: 96 + /* 97 + * Copy up to 48 bytes of data. At this point we only need the 98 + * bottom 6 bits of count to be accurate. 99 + */ 100 + ands tmp1, count, #0x30 101 + b.eq .Ltail15 102 + cmp tmp1w, #0x20 103 + b.eq 1f 104 + b.lt 2f 105 + ldp A_l, A_h, [src, #-16]! 106 + stp A_l, A_h, [dst, #-16]! 107 + 1: 108 + ldp A_l, A_h, [src, #-16]! 109 + stp A_l, A_h, [dst, #-16]! 110 + 2: 111 + ldp A_l, A_h, [src, #-16]! 112 + stp A_l, A_h, [dst, #-16]! 113 + 114 + .Ltail15: 115 + tbz count, #3, 1f 116 + ldr tmp1, [src, #-8]! 117 + str tmp1, [dst, #-8]! 118 + 1: 119 + tbz count, #2, 2f 120 + ldr tmp1w, [src, #-4]! 121 + str tmp1w, [dst, #-4]! 122 + 2: 123 + tbz count, #1, 3f 124 + ldrh tmp1w, [src, #-2]! 125 + strh tmp1w, [dst, #-2]! 
126 + 3: 127 + tbz count, #0, .Lexitfunc 128 + ldrb tmp1w, [src, #-1] 129 + strb tmp1w, [dst, #-1] 130 + 131 + .Lexitfunc: 132 + ret 133 + 134 + .Lcpy_over64: 135 + subs count, count, #128 136 + b.ge .Lcpy_body_large 137 + /* 138 + * Less than 128 bytes to copy, so handle 64 bytes here and then jump 139 + * to the tail. 140 + */ 141 + ldp A_l, A_h, [src, #-16] 142 + stp A_l, A_h, [dst, #-16] 143 + ldp B_l, B_h, [src, #-32] 144 + ldp C_l, C_h, [src, #-48] 145 + stp B_l, B_h, [dst, #-32] 146 + stp C_l, C_h, [dst, #-48] 147 + ldp D_l, D_h, [src, #-64]! 148 + stp D_l, D_h, [dst, #-64]! 149 + 150 + tst count, #0x3f 151 + b.ne .Ltail63 152 + ret 153 + 154 + /* 155 + * Critical loop. Start at a new cache line boundary. Assuming 156 + * 64 bytes per line this ensures the entire loop is in one line. 157 + */ 158 + .p2align L1_CACHE_SHIFT 159 + .Lcpy_body_large: 160 + /* pre-load 64 bytes data. */ 161 + ldp A_l, A_h, [src, #-16] 162 + ldp B_l, B_h, [src, #-32] 163 + ldp C_l, C_h, [src, #-48] 164 + ldp D_l, D_h, [src, #-64]! 165 + 1: 166 + /* 167 + * interlace the load of next 64 bytes data block with store of the last 168 + * loaded 64 bytes data. 169 + */ 170 + stp A_l, A_h, [dst, #-16] 171 + ldp A_l, A_h, [src, #-16] 172 + stp B_l, B_h, [dst, #-32] 173 + ldp B_l, B_h, [src, #-32] 174 + stp C_l, C_h, [dst, #-48] 175 + ldp C_l, C_h, [src, #-48] 176 + stp D_l, D_h, [dst, #-64]! 177 + ldp D_l, D_h, [src, #-64]! 178 + subs count, count, #64 179 + b.ge 1b 180 + stp A_l, A_h, [dst, #-16] 181 + stp B_l, B_h, [dst, #-32] 182 + stp C_l, C_h, [dst, #-48] 183 + stp D_l, D_h, [dst, #-64]! 184 + 185 + tst count, #0x3f 186 + b.ne .Ltail63 187 + ret 66 188 ENDPROC(memmove)
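
The new memmove only pays for a backward copy when the destination actually lands inside the source buffer; the two compares at its entry otherwise fall through to memcpy. The same dispatch in C, as a sketch only (the byte-at-a-time backward loop is a stand-in for the optimised tail):

#include <stdint.h>
#include <string.h>

static void *memmove_sketch(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        /* "cmp dstin, src; b.lo memcpy" and
         * "cmp dstin, src + count; b.hs memcpy" above: no harmful overlap. */
        if ((uintptr_t)d < (uintptr_t)s || (uintptr_t)d >= (uintptr_t)s + n)
                return memcpy(dst, src, n);

        /* dst overlaps the tail of src: copy from the end backwards. */
        while (n--)
                d[n] = s[n];
        return dst;
}
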
+185 -22
arch/arm64/lib/memset.S
··· 1 1 /* 2 2 * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 3 11 * 4 12 * This program is free software; you can redistribute it and/or modify 5 13 * it under the terms of the GNU General Public License version 2 as ··· 24 16 25 17 #include <linux/linkage.h> 26 18 #include <asm/assembler.h> 19 + #include <asm/cache.h> 27 20 28 21 /* 29 22 * Fill in the buffer with character c (alignment handled by the hardware) ··· 36 27 * Returns: 37 28 * x0 - buf 38 29 */ 30 + 31 + dstin .req x0 32 + val .req w1 33 + count .req x2 34 + tmp1 .req x3 35 + tmp1w .req w3 36 + tmp2 .req x4 37 + tmp2w .req w4 38 + zva_len_x .req x5 39 + zva_len .req w5 40 + zva_bits_x .req x6 41 + 42 + A_l .req x7 43 + A_lw .req w7 44 + dst .req x8 45 + tmp3w .req w9 46 + tmp3 .req x9 47 + 39 48 ENTRY(memset) 40 - mov x4, x0 41 - and w1, w1, #0xff 42 - orr w1, w1, w1, lsl #8 43 - orr w1, w1, w1, lsl #16 44 - orr x1, x1, x1, lsl #32 45 - subs x2, x2, #8 46 - b.mi 2f 47 - 1: str x1, [x4], #8 48 - subs x2, x2, #8 49 - b.pl 1b 50 - 2: adds x2, x2, #4 51 - b.mi 3f 52 - sub x2, x2, #4 53 - str w1, [x4], #4 54 - 3: adds x2, x2, #2 55 - b.mi 4f 56 - sub x2, x2, #2 57 - strh w1, [x4], #2 58 - 4: adds x2, x2, #1 59 - b.mi 5f 60 - strb w1, [x4] 61 - 5: ret 49 + mov dst, dstin /* Preserve return value. */ 50 + and A_lw, val, #255 51 + orr A_lw, A_lw, A_lw, lsl #8 52 + orr A_lw, A_lw, A_lw, lsl #16 53 + orr A_l, A_l, A_l, lsl #32 54 + 55 + cmp count, #15 56 + b.hi .Lover16_proc 57 + /*All store maybe are non-aligned..*/ 58 + tbz count, #3, 1f 59 + str A_l, [dst], #8 60 + 1: 61 + tbz count, #2, 2f 62 + str A_lw, [dst], #4 63 + 2: 64 + tbz count, #1, 3f 65 + strh A_lw, [dst], #2 66 + 3: 67 + tbz count, #0, 4f 68 + strb A_lw, [dst] 69 + 4: 70 + ret 71 + 72 + .Lover16_proc: 73 + /*Whether the start address is aligned with 16.*/ 74 + neg tmp2, dst 75 + ands tmp2, tmp2, #15 76 + b.eq .Laligned 77 + /* 78 + * The count is not less than 16, we can use stp to store the start 16 bytes, 79 + * then adjust the dst aligned with 16.This process will make the current 80 + * memory address at alignment boundary. 81 + */ 82 + stp A_l, A_l, [dst] /*non-aligned store..*/ 83 + /*make the dst aligned..*/ 84 + sub count, count, tmp2 85 + add dst, dst, tmp2 86 + 87 + .Laligned: 88 + cbz A_l, .Lzero_mem 89 + 90 + .Ltail_maybe_long: 91 + cmp count, #64 92 + b.ge .Lnot_short 93 + .Ltail63: 94 + ands tmp1, count, #0x30 95 + b.eq 3f 96 + cmp tmp1w, #0x20 97 + b.eq 1f 98 + b.lt 2f 99 + stp A_l, A_l, [dst], #16 100 + 1: 101 + stp A_l, A_l, [dst], #16 102 + 2: 103 + stp A_l, A_l, [dst], #16 104 + /* 105 + * The last store length is less than 16,use stp to write last 16 bytes. 106 + * It will lead some bytes written twice and the access is non-aligned. 107 + */ 108 + 3: 109 + ands count, count, #15 110 + cbz count, 4f 111 + add dst, dst, count 112 + stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ 113 + 4: 114 + ret 115 + 116 + /* 117 + * Critical loop. Start at a new cache line boundary. Assuming 118 + * 64 bytes per line, this ensures the entire loop is in one line. 119 + */ 120 + .p2align L1_CACHE_SHIFT 121 + .Lnot_short: 122 + sub dst, dst, #16/* Pre-bias. 
*/ 123 + sub count, count, #64 124 + 1: 125 + stp A_l, A_l, [dst, #16] 126 + stp A_l, A_l, [dst, #32] 127 + stp A_l, A_l, [dst, #48] 128 + stp A_l, A_l, [dst, #64]! 129 + subs count, count, #64 130 + b.ge 1b 131 + tst count, #0x3f 132 + add dst, dst, #16 133 + b.ne .Ltail63 134 + .Lexitfunc: 135 + ret 136 + 137 + /* 138 + * For zeroing memory, check to see if we can use the ZVA feature to 139 + * zero entire 'cache' lines. 140 + */ 141 + .Lzero_mem: 142 + cmp count, #63 143 + b.le .Ltail63 144 + /* 145 + * For zeroing small amounts of memory, it's not worth setting up 146 + * the line-clear code. 147 + */ 148 + cmp count, #128 149 + b.lt .Lnot_short /*count is at least 128 bytes*/ 150 + 151 + mrs tmp1, dczid_el0 152 + tbnz tmp1, #4, .Lnot_short 153 + mov tmp3w, #4 154 + and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ 155 + lsl zva_len, tmp3w, zva_len 156 + 157 + ands tmp3w, zva_len, #63 158 + /* 159 + * ensure the zva_len is not less than 64. 160 + * It is not meaningful to use ZVA if the block size is less than 64. 161 + */ 162 + b.ne .Lnot_short 163 + .Lzero_by_line: 164 + /* 165 + * Compute how far we need to go to become suitably aligned. We're 166 + * already at quad-word alignment. 167 + */ 168 + cmp count, zva_len_x 169 + b.lt .Lnot_short /* Not enough to reach alignment. */ 170 + sub zva_bits_x, zva_len_x, #1 171 + neg tmp2, dst 172 + ands tmp2, tmp2, zva_bits_x 173 + b.eq 2f /* Already aligned. */ 174 + /* Not aligned, check that there's enough to copy after alignment.*/ 175 + sub tmp1, count, tmp2 176 + /* 177 + * grantee the remain length to be ZVA is bigger than 64, 178 + * avoid to make the 2f's process over mem range.*/ 179 + cmp tmp1, #64 180 + ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ 181 + b.lt .Lnot_short 182 + /* 183 + * We know that there's at least 64 bytes to zero and that it's safe 184 + * to overrun by 64 bytes. 185 + */ 186 + mov count, tmp1 187 + 1: 188 + stp A_l, A_l, [dst] 189 + stp A_l, A_l, [dst, #16] 190 + stp A_l, A_l, [dst, #32] 191 + subs tmp2, tmp2, #64 192 + stp A_l, A_l, [dst, #48] 193 + add dst, dst, #64 194 + b.ge 1b 195 + /* We've overrun a bit, so adjust dst downwards.*/ 196 + add dst, dst, tmp2 197 + 2: 198 + sub count, count, zva_len_x 199 + 3: 200 + dc zva, dst 201 + add dst, dst, zva_len_x 202 + subs count, count, zva_len_x 203 + b.ge 3b 204 + ands count, count, zva_bits_x 205 + b.ne .Ltail_maybe_long 206 + ret 62 207 ENDPROC(memset)
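
The .Lzero_mem path only uses DC ZVA when DCZID_EL0 permits it, and takes the block size from its low bits: the mov/lsl pair computes 4 << (dczid & 15) bytes. The same probe as a C sketch; the DZP and BS field names come from the architecture manual, not from this patch:

/* Returns the DC ZVA block size in bytes, or 0 when ZVA is prohibited.
 * Mirrors the "mrs dczid_el0; tbnz #4; 4 << (dczid & 15)" sequence above. */
static inline unsigned int zva_block_bytes(void)
{
        unsigned long dczid;

        asm volatile("mrs %0, dczid_el0" : "=r" (dczid));

        if (dczid & (1UL << 4))         /* DZP: DC ZVA not permitted */
                return 0;

        return 4U << (dczid & 0xf);     /* BS: log2(block size in words) */
}
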
+234
arch/arm64/lib/strcmp.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * compare two strings 30 + * 31 + * Parameters: 32 + * x0 - const string 1 pointer 33 + * x1 - const string 2 pointer 34 + * Returns: 35 + * x0 - an integer less than, equal to, or greater than zero 36 + * if s1 is found, respectively, to be less than, to match, 37 + * or be greater than s2. 38 + */ 39 + 40 + #define REP8_01 0x0101010101010101 41 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 42 + #define REP8_80 0x8080808080808080 43 + 44 + /* Parameters and result. */ 45 + src1 .req x0 46 + src2 .req x1 47 + result .req x0 48 + 49 + /* Internal variables. */ 50 + data1 .req x2 51 + data1w .req w2 52 + data2 .req x3 53 + data2w .req w3 54 + has_nul .req x4 55 + diff .req x5 56 + syndrome .req x6 57 + tmp1 .req x7 58 + tmp2 .req x8 59 + tmp3 .req x9 60 + zeroones .req x10 61 + pos .req x11 62 + 63 + ENTRY(strcmp) 64 + eor tmp1, src1, src2 65 + mov zeroones, #REP8_01 66 + tst tmp1, #7 67 + b.ne .Lmisaligned8 68 + ands tmp1, src1, #7 69 + b.ne .Lmutual_align 70 + 71 + /* 72 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 73 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 74 + * can be done in parallel across the entire word. 75 + */ 76 + .Lloop_aligned: 77 + ldr data1, [src1], #8 78 + ldr data2, [src2], #8 79 + .Lstart_realigned: 80 + sub tmp1, data1, zeroones 81 + orr tmp2, data1, #REP8_7f 82 + eor diff, data1, data2 /* Non-zero if differences found. */ 83 + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 84 + orr syndrome, diff, has_nul 85 + cbz syndrome, .Lloop_aligned 86 + b .Lcal_cmpresult 87 + 88 + .Lmutual_align: 89 + /* 90 + * Sources are mutually aligned, but are not currently at an 91 + * alignment boundary. Round down the addresses and then mask off 92 + * the bytes that preceed the start point. 93 + */ 94 + bic src1, src1, #7 95 + bic src2, src2, #7 96 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ 97 + ldr data1, [src1], #8 98 + neg tmp1, tmp1 /* Bits to alignment -64. */ 99 + ldr data2, [src2], #8 100 + mov tmp2, #~0 101 + /* Big-endian. Early bytes are at MSB. */ 102 + CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 103 + /* Little-endian. Early bytes are at LSB. */ 104 + CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 105 + 106 + orr data1, data1, tmp2 107 + orr data2, data2, tmp2 108 + b .Lstart_realigned 109 + 110 + .Lmisaligned8: 111 + /* 112 + * Get the align offset length to compare per byte first. 
113 + * After this process, one string's address will be aligned. 114 + */ 115 + and tmp1, src1, #7 116 + neg tmp1, tmp1 117 + add tmp1, tmp1, #8 118 + and tmp2, src2, #7 119 + neg tmp2, tmp2 120 + add tmp2, tmp2, #8 121 + subs tmp3, tmp1, tmp2 122 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ 123 + .Ltinycmp: 124 + ldrb data1w, [src1], #1 125 + ldrb data2w, [src2], #1 126 + subs pos, pos, #1 127 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 128 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 129 + b.eq .Ltinycmp 130 + cbnz pos, 1f /*find the null or unequal...*/ 131 + cmp data1w, #1 132 + ccmp data1w, data2w, #0, cs 133 + b.eq .Lstart_align /*the last bytes are equal....*/ 134 + 1: 135 + sub result, data1, data2 136 + ret 137 + 138 + .Lstart_align: 139 + ands xzr, src1, #7 140 + b.eq .Lrecal_offset 141 + /*process more leading bytes to make str1 aligned...*/ 142 + add src1, src1, tmp3 143 + add src2, src2, tmp3 144 + /*load 8 bytes from aligned str1 and non-aligned str2..*/ 145 + ldr data1, [src1], #8 146 + ldr data2, [src2], #8 147 + 148 + sub tmp1, data1, zeroones 149 + orr tmp2, data1, #REP8_7f 150 + bic has_nul, tmp1, tmp2 151 + eor diff, data1, data2 /* Non-zero if differences found. */ 152 + orr syndrome, diff, has_nul 153 + cbnz syndrome, .Lcal_cmpresult 154 + /*How far is the current str2 from the alignment boundary...*/ 155 + and tmp3, tmp3, #7 156 + .Lrecal_offset: 157 + neg pos, tmp3 158 + .Lloopcmp_proc: 159 + /* 160 + * Divide the eight bytes into two parts. First,backwards the src2 161 + * to an alignment boundary,load eight bytes from the SRC2 alignment 162 + * boundary,then compare with the relative bytes from SRC1. 163 + * If all 8 bytes are equal,then start the second part's comparison. 164 + * Otherwise finish the comparison. 165 + * This special handle can garantee all the accesses are in the 166 + * thread/task space in avoid to overrange access. 167 + */ 168 + ldr data1, [src1,pos] 169 + ldr data2, [src2,pos] 170 + sub tmp1, data1, zeroones 171 + orr tmp2, data1, #REP8_7f 172 + bic has_nul, tmp1, tmp2 173 + eor diff, data1, data2 /* Non-zero if differences found. */ 174 + orr syndrome, diff, has_nul 175 + cbnz syndrome, .Lcal_cmpresult 176 + 177 + /*The second part process*/ 178 + ldr data1, [src1], #8 179 + ldr data2, [src2], #8 180 + sub tmp1, data1, zeroones 181 + orr tmp2, data1, #REP8_7f 182 + bic has_nul, tmp1, tmp2 183 + eor diff, data1, data2 /* Non-zero if differences found. */ 184 + orr syndrome, diff, has_nul 185 + cbz syndrome, .Lloopcmp_proc 186 + 187 + .Lcal_cmpresult: 188 + /* 189 + * reversed the byte-order as big-endian,then CLZ can find the most 190 + * significant zero bits. 191 + */ 192 + CPU_LE( rev syndrome, syndrome ) 193 + CPU_LE( rev data1, data1 ) 194 + CPU_LE( rev data2, data2 ) 195 + 196 + /* 197 + * For big-endian we cannot use the trick with the syndrome value 198 + * as carry-propagation can corrupt the upper bits if the trailing 199 + * bytes in the string contain 0x01. 200 + * However, if there is no NUL byte in the dword, we can generate 201 + * the result directly. We ca not just subtract the bytes as the 202 + * MSB might be significant. 203 + */ 204 + CPU_BE( cbnz has_nul, 1f ) 205 + CPU_BE( cmp data1, data2 ) 206 + CPU_BE( cset result, ne ) 207 + CPU_BE( cneg result, result, lo ) 208 + CPU_BE( ret ) 209 + CPU_BE( 1: ) 210 + /*Re-compute the NUL-byte detection, using a byte-reversed value. 
*/ 211 + CPU_BE( rev tmp3, data1 ) 212 + CPU_BE( sub tmp1, tmp3, zeroones ) 213 + CPU_BE( orr tmp2, tmp3, #REP8_7f ) 214 + CPU_BE( bic has_nul, tmp1, tmp2 ) 215 + CPU_BE( rev has_nul, has_nul ) 216 + CPU_BE( orr syndrome, diff, has_nul ) 217 + 218 + clz pos, syndrome 219 + /* 220 + * The MS-non-zero bit of the syndrome marks either the first bit 221 + * that is different, or the top bit of the first zero byte. 222 + * Shifting left now will bring the critical information into the 223 + * top bits. 224 + */ 225 + lsl data1, data1, pos 226 + lsl data2, data2, pos 227 + /* 228 + * But we need to zero-extend (char is unsigned) the value and then 229 + * perform a signed 32-bit subtraction. 230 + */ 231 + lsr data1, data1, #56 232 + sub result, data1, data2, lsr #56 233 + ret 234 + ENDPROC(strcmp)
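
strcmp above (and strncmp/strlen after it) leans on the zero-byte trick stated in its comment: (X - 0x01..01) & ~X & 0x80..80 is non-zero iff some byte of X is zero, computed in the code as (X - REP8_01) & ~(X | REP8_7f). The predicate in C, for reference:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is 0x00: the "has_nul" value computed with
 * "sub tmp1, data1, zeroones; orr tmp2, data1, #REP8_7f; bic has_nul, tmp1, tmp2".
 * Borrow propagation can smear which byte gets flagged, which is why the
 * big-endian paths recompute it on byte-reversed data before locating the NUL. */
static inline uint64_t has_zero_byte(uint64_t x)
{
        return (x - REP8_01) & ~(x | REP8_7f);
}
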
+126
arch/arm64/lib/strlen.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * calculate the length of a string 30 + * 31 + * Parameters: 32 + * x0 - const string pointer 33 + * Returns: 34 + * x0 - the return length of specific string 35 + */ 36 + 37 + /* Arguments and results. */ 38 + srcin .req x0 39 + len .req x0 40 + 41 + /* Locals and temporaries. */ 42 + src .req x1 43 + data1 .req x2 44 + data2 .req x3 45 + data2a .req x4 46 + has_nul1 .req x5 47 + has_nul2 .req x6 48 + tmp1 .req x7 49 + tmp2 .req x8 50 + tmp3 .req x9 51 + tmp4 .req x10 52 + zeroones .req x11 53 + pos .req x12 54 + 55 + #define REP8_01 0x0101010101010101 56 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 57 + #define REP8_80 0x8080808080808080 58 + 59 + ENTRY(strlen) 60 + mov zeroones, #REP8_01 61 + bic src, srcin, #15 62 + ands tmp1, srcin, #15 63 + b.ne .Lmisaligned 64 + /* 65 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 66 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 67 + * can be done in parallel across the entire word. 68 + */ 69 + /* 70 + * The inner loop deals with two Dwords at a time. This has a 71 + * slightly higher start-up cost, but we should win quite quickly, 72 + * especially on cores with a high number of issue slots per 73 + * cycle, as we get much better parallelism out of the operations. 74 + */ 75 + .Lloop: 76 + ldp data1, data2, [src], #16 77 + .Lrealigned: 78 + sub tmp1, data1, zeroones 79 + orr tmp2, data1, #REP8_7f 80 + sub tmp3, data2, zeroones 81 + orr tmp4, data2, #REP8_7f 82 + bic has_nul1, tmp1, tmp2 83 + bics has_nul2, tmp3, tmp4 84 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ 85 + b.eq .Lloop 86 + 87 + sub len, src, srcin 88 + cbz has_nul1, .Lnul_in_data2 89 + CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ 90 + sub len, len, #8 91 + mov has_nul2, has_nul1 92 + .Lnul_in_data2: 93 + /* 94 + * For big-endian, carry propagation (if the final byte in the 95 + * string is 0x01) means we cannot use has_nul directly. The 96 + * easiest way to get the correct byte is to byte-swap the data 97 + * and calculate the syndrome a second time. 98 + */ 99 + CPU_BE( rev data2, data2 ) 100 + CPU_BE( sub tmp1, data2, zeroones ) 101 + CPU_BE( orr tmp2, data2, #REP8_7f ) 102 + CPU_BE( bic has_nul2, tmp1, tmp2 ) 103 + 104 + sub len, len, #8 105 + rev has_nul2, has_nul2 106 + clz pos, has_nul2 107 + add len, len, pos, lsr #3 /* Bits to bytes. 
*/ 108 + ret 109 + 110 + .Lmisaligned: 111 + cmp tmp1, #8 112 + neg tmp1, tmp1 113 + ldp data1, data2, [src], #16 114 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ 115 + mov tmp2, #~0 116 + /* Big-endian. Early bytes are at MSB. */ 117 + CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 118 + /* Little-endian. Early bytes are at LSB. */ 119 + CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ 120 + 121 + orr data1, data1, tmp2 122 + orr data2a, data2, tmp2 123 + csinv data1, data1, xzr, le 124 + csel data2, data2, data2a, le 125 + b .Lrealigned 126 + ENDPROC(strlen)
+310
arch/arm64/lib/strncmp.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * compare two strings 30 + * 31 + * Parameters: 32 + * x0 - const string 1 pointer 33 + * x1 - const string 2 pointer 34 + * x2 - the maximal length to be compared 35 + * Returns: 36 + * x0 - an integer less than, equal to, or greater than zero if s1 is found, 37 + * respectively, to be less than, to match, or be greater than s2. 38 + */ 39 + 40 + #define REP8_01 0x0101010101010101 41 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 42 + #define REP8_80 0x8080808080808080 43 + 44 + /* Parameters and result. */ 45 + src1 .req x0 46 + src2 .req x1 47 + limit .req x2 48 + result .req x0 49 + 50 + /* Internal variables. */ 51 + data1 .req x3 52 + data1w .req w3 53 + data2 .req x4 54 + data2w .req w4 55 + has_nul .req x5 56 + diff .req x6 57 + syndrome .req x7 58 + tmp1 .req x8 59 + tmp2 .req x9 60 + tmp3 .req x10 61 + zeroones .req x11 62 + pos .req x12 63 + limit_wd .req x13 64 + mask .req x14 65 + endloop .req x15 66 + 67 + ENTRY(strncmp) 68 + cbz limit, .Lret0 69 + eor tmp1, src1, src2 70 + mov zeroones, #REP8_01 71 + tst tmp1, #7 72 + b.ne .Lmisaligned8 73 + ands tmp1, src1, #7 74 + b.ne .Lmutual_align 75 + /* Calculate the number of full and partial words -1. */ 76 + /* 77 + * when limit is mulitply of 8, if not sub 1, 78 + * the judgement of last dword will wrong. 79 + */ 80 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 81 + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 82 + 83 + /* 84 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 85 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 86 + * can be done in parallel across the entire word. 87 + */ 88 + .Lloop_aligned: 89 + ldr data1, [src1], #8 90 + ldr data2, [src2], #8 91 + .Lstart_realigned: 92 + subs limit_wd, limit_wd, #1 93 + sub tmp1, data1, zeroones 94 + orr tmp2, data1, #REP8_7f 95 + eor diff, data1, data2 /* Non-zero if differences found. */ 96 + csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ 97 + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 98 + ccmp endloop, #0, #0, eq 99 + b.eq .Lloop_aligned 100 + 101 + /*Not reached the limit, must have found the end or a diff. */ 102 + tbz limit_wd, #63, .Lnot_limit 103 + 104 + /* Limit % 8 == 0 => all bytes significant. */ 105 + ands limit, limit, #7 106 + b.eq .Lnot_limit 107 + 108 + lsl limit, limit, #3 /* Bits -> bytes. 
*/ 109 + mov mask, #~0 110 + CPU_BE( lsr mask, mask, limit ) 111 + CPU_LE( lsl mask, mask, limit ) 112 + bic data1, data1, mask 113 + bic data2, data2, mask 114 + 115 + /* Make sure that the NUL byte is marked in the syndrome. */ 116 + orr has_nul, has_nul, mask 117 + 118 + .Lnot_limit: 119 + orr syndrome, diff, has_nul 120 + b .Lcal_cmpresult 121 + 122 + .Lmutual_align: 123 + /* 124 + * Sources are mutually aligned, but are not currently at an 125 + * alignment boundary. Round down the addresses and then mask off 126 + * the bytes that precede the start point. 127 + * We also need to adjust the limit calculations, but without 128 + * overflowing if the limit is near ULONG_MAX. 129 + */ 130 + bic src1, src1, #7 131 + bic src2, src2, #7 132 + ldr data1, [src1], #8 133 + neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ 134 + ldr data2, [src2], #8 135 + mov tmp2, #~0 136 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 137 + /* Big-endian. Early bytes are at MSB. */ 138 + CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ 139 + /* Little-endian. Early bytes are at LSB. */ 140 + CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ 141 + 142 + and tmp3, limit_wd, #7 143 + lsr limit_wd, limit_wd, #3 144 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ 145 + add limit, limit, tmp1 146 + add tmp3, tmp3, tmp1 147 + orr data1, data1, tmp2 148 + orr data2, data2, tmp2 149 + add limit_wd, limit_wd, tmp3, lsr #3 150 + b .Lstart_realigned 151 + 152 + /*when src1 offset is not equal to src2 offset...*/ 153 + .Lmisaligned8: 154 + cmp limit, #8 155 + b.lo .Ltiny8proc /*limit < 8... */ 156 + /* 157 + * Get the align offset length to compare per byte first. 158 + * After this process, one string's address will be aligned.*/ 159 + and tmp1, src1, #7 160 + neg tmp1, tmp1 161 + add tmp1, tmp1, #8 162 + and tmp2, src2, #7 163 + neg tmp2, tmp2 164 + add tmp2, tmp2, #8 165 + subs tmp3, tmp1, tmp2 166 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ 167 + /* 168 + * Here, limit is not less than 8, so directly run .Ltinycmp 169 + * without checking the limit.*/ 170 + sub limit, limit, pos 171 + .Ltinycmp: 172 + ldrb data1w, [src1], #1 173 + ldrb data2w, [src2], #1 174 + subs pos, pos, #1 175 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 176 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 177 + b.eq .Ltinycmp 178 + cbnz pos, 1f /*find the null or unequal...*/ 179 + cmp data1w, #1 180 + ccmp data1w, data2w, #0, cs 181 + b.eq .Lstart_align /*the last bytes are equal....*/ 182 + 1: 183 + sub result, data1, data2 184 + ret 185 + 186 + .Lstart_align: 187 + lsr limit_wd, limit, #3 188 + cbz limit_wd, .Lremain8 189 + /*process more leading bytes to make str1 aligned...*/ 190 + ands xzr, src1, #7 191 + b.eq .Lrecal_offset 192 + add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ 193 + add src2, src2, tmp3 194 + ldr data1, [src1], #8 195 + ldr data2, [src2], #8 196 + 197 + sub limit, limit, tmp3 198 + lsr limit_wd, limit, #3 199 + subs limit_wd, limit_wd, #1 200 + 201 + sub tmp1, data1, zeroones 202 + orr tmp2, data1, #REP8_7f 203 + eor diff, data1, data2 /* Non-zero if differences found. 
*/ 204 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ 205 + bics has_nul, tmp1, tmp2 206 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ 207 + b.ne .Lunequal_proc 208 + /*How far is the current str2 from the alignment boundary...*/ 209 + and tmp3, tmp3, #7 210 + .Lrecal_offset: 211 + neg pos, tmp3 212 + .Lloopcmp_proc: 213 + /* 214 + * Divide the eight bytes into two parts. First,backwards the src2 215 + * to an alignment boundary,load eight bytes from the SRC2 alignment 216 + * boundary,then compare with the relative bytes from SRC1. 217 + * If all 8 bytes are equal,then start the second part's comparison. 218 + * Otherwise finish the comparison. 219 + * This special handle can garantee all the accesses are in the 220 + * thread/task space in avoid to overrange access. 221 + */ 222 + ldr data1, [src1,pos] 223 + ldr data2, [src2,pos] 224 + sub tmp1, data1, zeroones 225 + orr tmp2, data1, #REP8_7f 226 + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 227 + eor diff, data1, data2 /* Non-zero if differences found. */ 228 + csinv endloop, diff, xzr, eq 229 + cbnz endloop, .Lunequal_proc 230 + 231 + /*The second part process*/ 232 + ldr data1, [src1], #8 233 + ldr data2, [src2], #8 234 + subs limit_wd, limit_wd, #1 235 + sub tmp1, data1, zeroones 236 + orr tmp2, data1, #REP8_7f 237 + eor diff, data1, data2 /* Non-zero if differences found. */ 238 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ 239 + bics has_nul, tmp1, tmp2 240 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ 241 + b.eq .Lloopcmp_proc 242 + 243 + .Lunequal_proc: 244 + orr syndrome, diff, has_nul 245 + cbz syndrome, .Lremain8 246 + .Lcal_cmpresult: 247 + /* 248 + * reversed the byte-order as big-endian,then CLZ can find the most 249 + * significant zero bits. 250 + */ 251 + CPU_LE( rev syndrome, syndrome ) 252 + CPU_LE( rev data1, data1 ) 253 + CPU_LE( rev data2, data2 ) 254 + /* 255 + * For big-endian we cannot use the trick with the syndrome value 256 + * as carry-propagation can corrupt the upper bits if the trailing 257 + * bytes in the string contain 0x01. 258 + * However, if there is no NUL byte in the dword, we can generate 259 + * the result directly. We can't just subtract the bytes as the 260 + * MSB might be significant. 261 + */ 262 + CPU_BE( cbnz has_nul, 1f ) 263 + CPU_BE( cmp data1, data2 ) 264 + CPU_BE( cset result, ne ) 265 + CPU_BE( cneg result, result, lo ) 266 + CPU_BE( ret ) 267 + CPU_BE( 1: ) 268 + /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ 269 + CPU_BE( rev tmp3, data1 ) 270 + CPU_BE( sub tmp1, tmp3, zeroones ) 271 + CPU_BE( orr tmp2, tmp3, #REP8_7f ) 272 + CPU_BE( bic has_nul, tmp1, tmp2 ) 273 + CPU_BE( rev has_nul, has_nul ) 274 + CPU_BE( orr syndrome, diff, has_nul ) 275 + /* 276 + * The MS-non-zero bit of the syndrome marks either the first bit 277 + * that is different, or the top bit of the first zero byte. 278 + * Shifting left now will bring the critical information into the 279 + * top bits. 280 + */ 281 + clz pos, syndrome 282 + lsl data1, data1, pos 283 + lsl data2, data2, pos 284 + /* 285 + * But we need to zero-extend (char is unsigned) the value and then 286 + * perform a signed 32-bit subtraction. 287 + */ 288 + lsr data1, data1, #56 289 + sub result, data1, data2, lsr #56 290 + ret 291 + 292 + .Lremain8: 293 + /* Limit % 8 == 0 => all bytes significant. 
*/ 294 + ands limit, limit, #7 295 + b.eq .Lret0 296 + .Ltiny8proc: 297 + ldrb data1w, [src1], #1 298 + ldrb data2w, [src2], #1 299 + subs limit, limit, #1 300 + 301 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ 302 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 303 + b.eq .Ltiny8proc 304 + sub result, data1, data2 305 + ret 306 + 307 + .Lret0: 308 + mov result, #0 309 + ret 310 + ENDPROC(strncmp)
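The tail of strncmp above folds the difference bits and the NUL syndrome into a single value and uses clz to locate the first interesting byte. The following is a hedged C rendering of the little-endian result path (.Lcal_cmpresult) with invented names; the big-endian path in the assembly instead recomputes has_nul on byte-reversed data as its comments explain.

#include <stdint.h>

static int word_cmp_result_le(uint64_t data1, uint64_t data2,
                              uint64_t diff, uint64_t has_nul)
{
        /* diff = data1 ^ data2; has_nul = NUL syndrome of data1. */
        uint64_t syndrome = diff | has_nul;
        unsigned int pos;

        /* rev: make the lowest-addressed byte the most significant. */
        syndrome = __builtin_bswap64(syndrome);
        data1 = __builtin_bswap64(data1);
        data2 = __builtin_bswap64(data2);

        /* clz + lsl: bring the first differing or NUL byte to the top. */
        pos = __builtin_clzll(syndrome);
        data1 <<= pos;
        data2 <<= pos;

        /* Compare that byte as an unsigned value (lsr #56 / sub). */
        return (int)(data1 >> 56) - (int)(data2 >> 56);
}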
+171
arch/arm64/lib/strnlen.S
··· 1 + /* 2 + * Copyright (C) 2013 ARM Ltd. 3 + * Copyright (C) 2013 Linaro. 4 + * 5 + * This code is based on glibc cortex strings work originally authored by Linaro 6 + * and re-licensed under GPLv2 for the Linux kernel. The original code can 7 + * be found @ 8 + * 9 + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ 10 + * files/head:/src/aarch64/ 11 + * 12 + * This program is free software; you can redistribute it and/or modify 13 + * it under the terms of the GNU General Public License version 2 as 14 + * published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope that it will be useful, 17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 + * GNU General Public License for more details. 20 + * 21 + * You should have received a copy of the GNU General Public License 22 + * along with this program. If not, see <http://www.gnu.org/licenses/>. 23 + */ 24 + 25 + #include <linux/linkage.h> 26 + #include <asm/assembler.h> 27 + 28 + /* 29 + * determine the length of a fixed-size string 30 + * 31 + * Parameters: 32 + * x0 - const string pointer 33 + * x1 - maximal string length 34 + * Returns: 35 + * x0 - the return length of specific string 36 + */ 37 + 38 + /* Arguments and results. */ 39 + srcin .req x0 40 + len .req x0 41 + limit .req x1 42 + 43 + /* Locals and temporaries. */ 44 + src .req x2 45 + data1 .req x3 46 + data2 .req x4 47 + data2a .req x5 48 + has_nul1 .req x6 49 + has_nul2 .req x7 50 + tmp1 .req x8 51 + tmp2 .req x9 52 + tmp3 .req x10 53 + tmp4 .req x11 54 + zeroones .req x12 55 + pos .req x13 56 + limit_wd .req x14 57 + 58 + #define REP8_01 0x0101010101010101 59 + #define REP8_7f 0x7f7f7f7f7f7f7f7f 60 + #define REP8_80 0x8080808080808080 61 + 62 + ENTRY(strnlen) 63 + cbz limit, .Lhit_limit 64 + mov zeroones, #REP8_01 65 + bic src, srcin, #15 66 + ands tmp1, srcin, #15 67 + b.ne .Lmisaligned 68 + /* Calculate the number of full and partial words -1. */ 69 + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ 70 + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ 71 + 72 + /* 73 + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 74 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 75 + * can be done in parallel across the entire word. 76 + */ 77 + /* 78 + * The inner loop deals with two Dwords at a time. This has a 79 + * slightly higher start-up cost, but we should win quite quickly, 80 + * especially on cores with a high number of issue slots per 81 + * cycle, as we get much better parallelism out of the operations. 82 + */ 83 + .Lloop: 84 + ldp data1, data2, [src], #16 85 + .Lrealigned: 86 + sub tmp1, data1, zeroones 87 + orr tmp2, data1, #REP8_7f 88 + sub tmp3, data2, zeroones 89 + orr tmp4, data2, #REP8_7f 90 + bic has_nul1, tmp1, tmp2 91 + bic has_nul2, tmp3, tmp4 92 + subs limit_wd, limit_wd, #1 93 + orr tmp1, has_nul1, has_nul2 94 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ 95 + b.eq .Lloop 96 + 97 + cbz tmp1, .Lhit_limit /* No null in final Qword. */ 98 + 99 + /* 100 + * We know there's a null in the final Qword. The easiest thing 101 + * to do now is work out the length of the string and return 102 + * MIN (len, limit). 
103 + */ 104 + sub len, src, srcin 105 + cbz has_nul1, .Lnul_in_data2 106 + CPU_BE( mov data2, data1 ) /*perpare data to re-calculate the syndrome*/ 107 + 108 + sub len, len, #8 109 + mov has_nul2, has_nul1 110 + .Lnul_in_data2: 111 + /* 112 + * For big-endian, carry propagation (if the final byte in the 113 + * string is 0x01) means we cannot use has_nul directly. The 114 + * easiest way to get the correct byte is to byte-swap the data 115 + * and calculate the syndrome a second time. 116 + */ 117 + CPU_BE( rev data2, data2 ) 118 + CPU_BE( sub tmp1, data2, zeroones ) 119 + CPU_BE( orr tmp2, data2, #REP8_7f ) 120 + CPU_BE( bic has_nul2, tmp1, tmp2 ) 121 + 122 + sub len, len, #8 123 + rev has_nul2, has_nul2 124 + clz pos, has_nul2 125 + add len, len, pos, lsr #3 /* Bits to bytes. */ 126 + cmp len, limit 127 + csel len, len, limit, ls /* Return the lower value. */ 128 + ret 129 + 130 + .Lmisaligned: 131 + /* 132 + * Deal with a partial first word. 133 + * We're doing two things in parallel here; 134 + * 1) Calculate the number of words (but avoiding overflow if 135 + * limit is near ULONG_MAX) - to do this we need to work out 136 + * limit + tmp1 - 1 as a 65-bit value before shifting it; 137 + * 2) Load and mask the initial data words - we force the bytes 138 + * before the ones we are interested in to 0xff - this ensures 139 + * early bytes will not hit any zero detection. 140 + */ 141 + ldp data1, data2, [src], #16 142 + 143 + sub limit_wd, limit, #1 144 + and tmp3, limit_wd, #15 145 + lsr limit_wd, limit_wd, #4 146 + 147 + add tmp3, tmp3, tmp1 148 + add limit_wd, limit_wd, tmp3, lsr #4 149 + 150 + neg tmp4, tmp1 151 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ 152 + 153 + mov tmp2, #~0 154 + /* Big-endian. Early bytes are at MSB. */ 155 + CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ 156 + /* Little-endian. Early bytes are at LSB. */ 157 + CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ 158 + 159 + cmp tmp1, #8 160 + 161 + orr data1, data1, tmp2 162 + orr data2a, data2, tmp2 163 + 164 + csinv data1, data1, xzr, le 165 + csel data2, data2, data2a, le 166 + b .Lrealigned 167 + 168 + .Lhit_limit: 169 + mov len, limit 170 + ret 171 + ENDPROC(strnlen)
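strnlen's .Lmisaligned path (like strlen's above) rounds the load down to an aligned address and then poisons the bytes that precede the real start so they cannot register as NUL. A little-endian C sketch of that masking for the common offset < 8 case, with invented names; the assembly builds the mask with neg/lsl and lsr, and handles offsets of 8-15 with csinv/csel.

#include <stdint.h>

/* offset = src & 7: number of bytes in the loaded word that lie before
 * the string start. Valid for 1..7; offset 0 never takes this path. */
static uint64_t mask_before_start_le(uint64_t data, unsigned int offset)
{
        /* The low `offset` bytes (the earliest addresses on
         * little-endian) become 0xff, so the NUL test cannot fire
         * on anything before the string actually begins. */
        uint64_t mask = ~0ULL >> (64 - 8 * offset);

        return data | mask;
}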
+1 -1
arch/arm64/mm/Makefile
··· 1 1 obj-y := dma-mapping.o extable.o fault.o init.o \ 2 2 cache.o copypage.o flush.o \ 3 3 ioremap.o mmap.o pgd.o mmu.o \ 4 - context.o tlb.o proc.o 4 + context.o proc.o 5 5 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+3 -3
arch/arm64/mm/cache.S
··· 31 31 * Corrupted registers: x0-x7, x9-x11 32 32 */ 33 33 __flush_dcache_all: 34 - dsb sy // ensure ordering with previous memory accesses 34 + dmb sy // ensure ordering with previous memory accesses 35 35 mrs x0, clidr_el1 // read clidr 36 36 and x3, x0, #0x7000000 // extract loc from clidr 37 37 lsr x3, x3, #23 // left align loc bit field ··· 128 128 add x4, x4, x2 129 129 cmp x4, x1 130 130 b.lo 1b 131 - dsb sy 131 + dsb ish 132 132 133 133 icache_line_size x2, x3 134 134 sub x3, x2, #1 ··· 139 139 cmp x4, x1 140 140 b.lo 1b 141 141 9: // ignore any faulting cache operation 142 - dsb sy 142 + dsb ish 143 143 isb 144 144 ret 145 145 ENDPROC(flush_icache_range)
+1 -1
arch/arm64/mm/dma-mapping.c
··· 115 115 for (i = 0; i < (size >> PAGE_SHIFT); i++) 116 116 map[i] = page + i; 117 117 coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, 118 - __get_dma_pgprot(attrs, pgprot_default, false)); 118 + __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); 119 119 kfree(map); 120 120 if (!coherent_ptr) 121 121 goto no_map;
+4 -4
arch/arm64/mm/fault.c
··· 32 32 33 33 #include <asm/exception.h> 34 34 #include <asm/debug-monitors.h> 35 + #include <asm/esr.h> 35 36 #include <asm/system_misc.h> 36 37 #include <asm/pgtable.h> 37 38 #include <asm/tlbflush.h> ··· 124 123 } 125 124 126 125 tsk->thread.fault_address = addr; 126 + tsk->thread.fault_code = esr; 127 127 si.si_signo = sig; 128 128 si.si_errno = 0; 129 129 si.si_code = code; ··· 150 148 #define VM_FAULT_BADMAP 0x010000 151 149 #define VM_FAULT_BADACCESS 0x020000 152 150 153 - #define ESR_WRITE (1 << 6) 154 - #define ESR_CM (1 << 8) 155 151 #define ESR_LNX_EXEC (1 << 24) 156 152 157 153 static int __do_page_fault(struct mm_struct *mm, unsigned long addr, ··· 218 218 219 219 if (esr & ESR_LNX_EXEC) { 220 220 vm_flags = VM_EXEC; 221 - } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { 221 + } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) { 222 222 vm_flags = VM_WRITE; 223 223 mm_flags |= FAULT_FLAG_WRITE; 224 224 } ··· 525 525 info.si_errno = 0; 526 526 info.si_code = inf->code; 527 527 info.si_addr = (void __user *)addr; 528 - arm64_notify_die("", regs, &info, esr); 528 + arm64_notify_die("", regs, &info, 0); 529 529 530 530 return 0; 531 531 }
+30 -37
arch/arm64/mm/mmu.c
··· 43 43 struct page *empty_zero_page; 44 44 EXPORT_SYMBOL(empty_zero_page); 45 45 46 - pgprot_t pgprot_default; 47 - EXPORT_SYMBOL(pgprot_default); 48 - 49 - static pmdval_t prot_sect_kernel; 50 - 51 46 struct cachepolicy { 52 47 const char policy[16]; 53 48 u64 mair; ··· 117 122 } 118 123 early_param("cachepolicy", early_cachepolicy); 119 124 120 - /* 121 - * Adjust the PMD section entries according to the CPU in use. 122 - */ 123 - void __init init_mem_pgprot(void) 124 - { 125 - pteval_t default_pgprot; 126 - int i; 127 - 128 - default_pgprot = PTE_ATTRINDX(MT_NORMAL); 129 - prot_sect_kernel = PMD_TYPE_SECT | PMD_SECT_AF | PMD_ATTRINDX(MT_NORMAL); 130 - 131 - #ifdef CONFIG_SMP 132 - /* 133 - * Mark memory with the "shared" attribute for SMP systems 134 - */ 135 - default_pgprot |= PTE_SHARED; 136 - prot_sect_kernel |= PMD_SECT_S; 137 - #endif 138 - 139 - for (i = 0; i < 16; i++) { 140 - unsigned long v = pgprot_val(protection_map[i]); 141 - protection_map[i] = __pgprot(v | default_pgprot); 142 - } 143 - 144 - pgprot_default = __pgprot(PTE_TYPE_PAGE | PTE_AF | default_pgprot); 145 - } 146 - 147 125 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 148 126 unsigned long size, pgprot_t vma_prot) 149 127 { ··· 164 196 pgprot_t prot_pte; 165 197 166 198 if (map_io) { 167 - prot_sect = PMD_TYPE_SECT | PMD_SECT_AF | 168 - PMD_ATTRINDX(MT_DEVICE_nGnRE); 199 + prot_sect = PROT_SECT_DEVICE_nGnRE; 169 200 prot_pte = __pgprot(PROT_DEVICE_nGnRE); 170 201 } else { 171 - prot_sect = prot_sect_kernel; 202 + prot_sect = PROT_SECT_NORMAL_EXEC; 172 203 prot_pte = PAGE_KERNEL_EXEC; 173 204 } 174 205 ··· 209 242 210 243 do { 211 244 next = pud_addr_end(addr, end); 212 - alloc_init_pmd(pud, addr, next, phys, map_io); 245 + 246 + /* 247 + * For 4K granule only, attempt to put down a 1GB block 248 + */ 249 + if (!map_io && (PAGE_SHIFT == 12) && 250 + ((addr | next | phys) & ~PUD_MASK) == 0) { 251 + pud_t old_pud = *pud; 252 + set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); 253 + 254 + /* 255 + * If we have an old value for a pud, it will 256 + * be pointing to a pmd table that we no longer 257 + * need (from swapper_pg_dir). 258 + * 259 + * Look up the old pmd table and free it. 260 + */ 261 + if (!pud_none(old_pud)) { 262 + phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); 263 + memblock_free(table, PAGE_SIZE); 264 + flush_tlb_all(); 265 + } 266 + } else { 267 + alloc_init_pmd(pud, addr, next, phys, map_io); 268 + } 213 269 phys += next - addr; 214 270 } while (pud++, addr = next, addr != end); 215 271 } ··· 389 399 if (pud_none(*pud)) 390 400 return 0; 391 401 402 + if (pud_sect(*pud)) 403 + return pfn_valid(pud_pfn(*pud)); 404 + 392 405 pmd = pmd_offset(pud, addr); 393 406 if (pmd_none(*pmd)) 394 407 return 0; ··· 439 446 if (!p) 440 447 return -ENOMEM; 441 448 442 - set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel)); 449 + set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL)); 443 450 } else 444 451 vmemmap_verify((pte_t *)pmd, node, addr, next); 445 452 } while (addr = next, addr != end);
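The new PUD-level branch above only fires when an entire naturally-aligned 1GB window can be covered. A small illustrative check in C, assuming the 4K-granule values where PUD_SIZE is 1GB; the constants are restated here for the sketch rather than taken from the patch.

#define SKETCH_PUD_SHIFT        30
#define SKETCH_PUD_SIZE         (1UL << SKETCH_PUD_SHIFT)
#define SKETCH_PUD_MASK         (~(SKETCH_PUD_SIZE - 1))

/* A block mapping is possible only if the virtual range [addr, next)
 * spans exactly one PUD entry and the physical address is equally
 * aligned, i.e. all three values are 1GB-aligned. */
static int can_use_pud_block(unsigned long addr, unsigned long next,
                             unsigned long phys)
{
        return ((addr | next | phys) & ~SKETCH_PUD_MASK) == 0;
}

For example, a 1GB-aligned virtual range backed by a 1GB-aligned physical block qualifies, while a range that starts part-way into the gigabyte falls back to alloc_init_pmd() as before.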
+1 -1
arch/arm64/mm/proc.S
··· 182 182 ENTRY(__cpu_setup) 183 183 ic iallu // I+BTB cache invalidate 184 184 tlbi vmalle1is // invalidate I + D TLBs 185 - dsb sy 185 + dsb ish 186 186 187 187 mov x0, #3 << 20 188 188 msr cpacr_el1, x0 // Enable FP/ASIMD
-71
arch/arm64/mm/tlb.S
··· 1 - /* 2 - * Based on arch/arm/mm/tlb.S 3 - * 4 - * Copyright (C) 1997-2002 Russell King 5 - * Copyright (C) 2012 ARM Ltd. 6 - * Written by Catalin Marinas <catalin.marinas@arm.com> 7 - * 8 - * This program is free software; you can redistribute it and/or modify 9 - * it under the terms of the GNU General Public License version 2 as 10 - * published by the Free Software Foundation. 11 - * 12 - * This program is distributed in the hope that it will be useful, 13 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 - * GNU General Public License for more details. 16 - * 17 - * You should have received a copy of the GNU General Public License 18 - * along with this program. If not, see <http://www.gnu.org/licenses/>. 19 - */ 20 - #include <linux/linkage.h> 21 - #include <asm/assembler.h> 22 - #include <asm/asm-offsets.h> 23 - #include <asm/page.h> 24 - #include <asm/tlbflush.h> 25 - #include "proc-macros.S" 26 - 27 - /* 28 - * __cpu_flush_user_tlb_range(start, end, vma) 29 - * 30 - * Invalidate a range of TLB entries in the specified address space. 31 - * 32 - * - start - start address (may not be aligned) 33 - * - end - end address (exclusive, may not be aligned) 34 - * - vma - vma_struct describing address range 35 - */ 36 - ENTRY(__cpu_flush_user_tlb_range) 37 - vma_vm_mm x3, x2 // get vma->vm_mm 38 - mmid w3, x3 // get vm_mm->context.id 39 - dsb sy 40 - lsr x0, x0, #12 // align address 41 - lsr x1, x1, #12 42 - bfi x0, x3, #48, #16 // start VA and ASID 43 - bfi x1, x3, #48, #16 // end VA and ASID 44 - 1: tlbi vae1is, x0 // TLB invalidate by address and ASID 45 - add x0, x0, #1 46 - cmp x0, x1 47 - b.lo 1b 48 - dsb sy 49 - ret 50 - ENDPROC(__cpu_flush_user_tlb_range) 51 - 52 - /* 53 - * __cpu_flush_kern_tlb_range(start,end) 54 - * 55 - * Invalidate a range of kernel TLB entries. 56 - * 57 - * - start - start address (may not be aligned) 58 - * - end - end address (exclusive, may not be aligned) 59 - */ 60 - ENTRY(__cpu_flush_kern_tlb_range) 61 - dsb sy 62 - lsr x0, x0, #12 // align address 63 - lsr x1, x1, #12 64 - 1: tlbi vaae1is, x0 // TLB invalidate by address 65 - add x0, x0, #1 66 - cmp x0, x1 67 - b.lo 1b 68 - dsb sy 69 - isb 70 - ret 71 - ENDPROC(__cpu_flush_kern_tlb_range)
+1 -10
arch/blackfin/include/asm/ftrace.h
··· 66 66 67 67 #endif /* CONFIG_FRAME_POINTER */ 68 68 69 - #define HAVE_ARCH_CALLER_ADDR 70 - 71 - /* inline function or macro may lead to unexpected result */ 72 - #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 73 - #define CALLER_ADDR1 ((unsigned long)return_address(1)) 74 - #define CALLER_ADDR2 ((unsigned long)return_address(2)) 75 - #define CALLER_ADDR3 ((unsigned long)return_address(3)) 76 - #define CALLER_ADDR4 ((unsigned long)return_address(4)) 77 - #define CALLER_ADDR5 ((unsigned long)return_address(5)) 78 - #define CALLER_ADDR6 ((unsigned long)return_address(6)) 69 + #define ftrace_return_address(n) return_address(n) 79 70 80 71 #endif /* __ASSEMBLY__ */ 81 72
+1 -9
arch/parisc/include/asm/ftrace.h
··· 24 24 25 25 extern unsigned long return_address(unsigned int); 26 26 27 - #define HAVE_ARCH_CALLER_ADDR 28 - 29 - #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 30 - #define CALLER_ADDR1 return_address(1) 31 - #define CALLER_ADDR2 return_address(2) 32 - #define CALLER_ADDR3 return_address(3) 33 - #define CALLER_ADDR4 return_address(4) 34 - #define CALLER_ADDR5 return_address(5) 35 - #define CALLER_ADDR6 return_address(6) 27 + #define ftrace_return_address(n) return_address(n) 36 28 37 29 #endif /* __ASSEMBLY__ */ 38 30
+1 -9
arch/sh/include/asm/ftrace.h
··· 40 40 /* arch/sh/kernel/return_address.c */ 41 41 extern void *return_address(unsigned int); 42 42 43 - #define HAVE_ARCH_CALLER_ADDR 44 - 45 - #define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 46 - #define CALLER_ADDR1 ((unsigned long)return_address(1)) 47 - #define CALLER_ADDR2 ((unsigned long)return_address(2)) 48 - #define CALLER_ADDR3 ((unsigned long)return_address(3)) 49 - #define CALLER_ADDR4 ((unsigned long)return_address(4)) 50 - #define CALLER_ADDR5 ((unsigned long)return_address(5)) 51 - #define CALLER_ADDR6 ((unsigned long)return_address(6)) 43 + #define ftrace_return_address(n) return_address(n) 52 44 53 45 #endif /* __ASSEMBLY__ */ 54 46
+4 -10
arch/xtensa/include/asm/ftrace.h
··· 12 12 13 13 #include <asm/processor.h> 14 14 15 - #define HAVE_ARCH_CALLER_ADDR 16 15 #ifndef __ASSEMBLY__ 17 - #define CALLER_ADDR0 ({ unsigned long a0, a1; \ 16 + #define ftrace_return_address0 ({ unsigned long a0, a1; \ 18 17 __asm__ __volatile__ ( \ 19 18 "mov %0, a0\n" \ 20 19 "mov %1, a1\n" \ 21 20 : "=r"(a0), "=r"(a1)); \ 22 21 MAKE_PC_FROM_RA(a0, a1); }) 22 + 23 23 #ifdef CONFIG_FRAME_POINTER 24 24 extern unsigned long return_address(unsigned level); 25 - #define CALLER_ADDR1 return_address(1) 26 - #define CALLER_ADDR2 return_address(2) 27 - #define CALLER_ADDR3 return_address(3) 28 - #else /* CONFIG_FRAME_POINTER */ 29 - #define CALLER_ADDR1 (0) 30 - #define CALLER_ADDR2 (0) 31 - #define CALLER_ADDR3 (0) 32 - #endif /* CONFIG_FRAME_POINTER */ 25 + #define ftrace_return_address(n) return_address(n) 26 + #endif 33 27 #endif /* __ASSEMBLY__ */ 34 28 35 29 #ifdef CONFIG_FUNCTION_TRACER
+13 -8
include/asm-generic/unaligned.h
··· 4 4 /* 5 5 * This is the most generic implementation of unaligned accesses 6 6 * and should work almost anywhere. 7 - * 8 - * If an architecture can handle unaligned accesses in hardware, 9 - * it may want to use the linux/unaligned/access_ok.h implementation 10 - * instead. 11 7 */ 12 8 #include <asm/byteorder.h> 13 9 10 + /* Set by the arch if it can handle unaligned accesses in hardware. */ 11 + #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 12 + # include <linux/unaligned/access_ok.h> 13 + #endif 14 + 14 15 #if defined(__LITTLE_ENDIAN) 15 - # include <linux/unaligned/le_struct.h> 16 - # include <linux/unaligned/be_byteshift.h> 16 + # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 17 + # include <linux/unaligned/le_struct.h> 18 + # include <linux/unaligned/be_byteshift.h> 19 + # endif 17 20 # include <linux/unaligned/generic.h> 18 21 # define get_unaligned __get_unaligned_le 19 22 # define put_unaligned __put_unaligned_le 20 23 #elif defined(__BIG_ENDIAN) 21 - # include <linux/unaligned/be_struct.h> 22 - # include <linux/unaligned/le_byteshift.h> 24 + # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 25 + # include <linux/unaligned/be_struct.h> 26 + # include <linux/unaligned/le_byteshift.h> 27 + # endif 23 28 # include <linux/unaligned/generic.h> 24 29 # define get_unaligned __get_unaligned_be 25 30 # define put_unaligned __put_unaligned_be
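With the change above, an architecture that declares efficient unaligned access picks up the access_ok.h accessors from the generic header instead of the byte-shift fallbacks. A hedged usage sketch follows: the helper and field offset are invented, while get_unaligned() itself is the existing kernel interface selected by this header.

#include <linux/types.h>
#include <asm/unaligned.h>

/* Read a 32-bit length field that sits at an odd offset in a packed
 * buffer. Where efficient unaligned access is available this compiles
 * to a plain load; strict-alignment architectures keep the byte-wise
 * fallback. */
static u32 read_len_field(const u8 *pkt)
{
        return get_unaligned((const u32 *)(pkt + 2));
}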
+18 -16
include/linux/ftrace.h
··· 616 616 #endif 617 617 } 618 618 619 - #ifndef HAVE_ARCH_CALLER_ADDR 619 + /* All archs should have this, but we define it for consistency */ 620 + #ifndef ftrace_return_address0 621 + # define ftrace_return_address0 __builtin_return_address(0) 622 + #endif 623 + 624 + /* Archs may use other ways for ADDR1 and beyond */ 625 + #ifndef ftrace_return_address 620 626 # ifdef CONFIG_FRAME_POINTER 621 - # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 622 - # define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) 623 - # define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) 624 - # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) 625 - # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) 626 - # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) 627 - # define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) 627 + # define ftrace_return_address(n) __builtin_return_address(n) 628 628 # else 629 - # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) 630 - # define CALLER_ADDR1 0UL 631 - # define CALLER_ADDR2 0UL 632 - # define CALLER_ADDR3 0UL 633 - # define CALLER_ADDR4 0UL 634 - # define CALLER_ADDR5 0UL 635 - # define CALLER_ADDR6 0UL 629 + # define ftrace_return_address(n) 0UL 636 630 # endif 637 - #endif /* ifndef HAVE_ARCH_CALLER_ADDR */ 631 + #endif 632 + 633 + #define CALLER_ADDR0 ((unsigned long)ftrace_return_address0) 634 + #define CALLER_ADDR1 ((unsigned long)ftrace_return_address(1)) 635 + #define CALLER_ADDR2 ((unsigned long)ftrace_return_address(2)) 636 + #define CALLER_ADDR3 ((unsigned long)ftrace_return_address(3)) 637 + #define CALLER_ADDR4 ((unsigned long)ftrace_return_address(4)) 638 + #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) 639 + #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) 638 640 639 641 #ifdef CONFIG_IRQSOFF_TRACER 640 642 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
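After this rework, the per-architecture ftrace headers touched above shrink to a single definition and the generic header derives every CALLER_ADDRx from it. A sketch of what a hypothetical architecture header now needs, mirroring the arm/sh/parisc conversions in this series:

/* arch/<arch>/include/asm/ftrace.h (sketch, names assumed) */
extern void *return_address(unsigned int level);
#define ftrace_return_address(n) return_address(n)

/* include/linux/ftrace.h then expands, for example,
 *   CALLER_ADDR2 == (unsigned long)return_address(2)
 * while architectures that define nothing keep the generic behaviour:
 * __builtin_return_address(n) with CONFIG_FRAME_POINTER, 0UL without. */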
+7
scripts/recordmcount.c
··· 40 40 #define R_METAG_NONE 3 41 41 #endif 42 42 43 + #ifndef EM_AARCH64 44 + #define EM_AARCH64 183 45 + #define R_AARCH64_ABS64 257 46 + #endif 47 + 43 48 static int fd_map; /* File descriptor for file being modified. */ 44 49 static int mmap_failed; /* Boolean flag. */ 45 50 static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */ ··· 352 347 case EM_ARM: reltype = R_ARM_ABS32; 353 348 altmcount = "__gnu_mcount_nc"; 354 349 break; 350 + case EM_AARCH64: 351 + reltype = R_AARCH64_ABS64; gpfx = '_'; break; 355 352 case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; 356 353 case EM_METAG: reltype = R_METAG_ADDR32; 357 354 altmcount = "_mcount_wrapper";
+5
scripts/recordmcount.pl
··· 279 279 $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_ARM_(CALL|PC24|THM_CALL)" . 280 280 "\\s+(__gnu_mcount_nc|mcount)\$"; 281 281 282 + } elsif ($arch eq "arm64") { 283 + $alignment = 3; 284 + $section_type = '%progbits'; 285 + $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$"; 286 + $type = ".quad"; 282 287 } elsif ($arch eq "ia64") { 283 288 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; 284 289 $type = "data8";