Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kasan: speed up mte_set_mem_tag_range

Use DC GVA / DC GZVA to speed up KASAN memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show it is faster than a conditional
branch.

Signed-off-by: Evgenii Stepanov <eugenis@google.com>
Co-developed-by: Peter Collingbourne <pcc@google.com>
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20210521010023.3244784-1-eugenis@google.com
Signed-off-by: Will Deacon <will@kernel.org>

Authored by Evgenii Stepanov and committed by Will Deacon.
3d0cca0b c4681547

+67 -26
+67 -26
arch/arm64/include/asm/mte-kasan.h
··· 48 48 return mte_get_ptr_tag(addr); 49 49 } 50 50 51 + static inline u64 __stg_post(u64 p) 52 + { 53 + asm volatile(__MTE_PREAMBLE "stg %0, [%0], #16" 54 + : "+r"(p) 55 + : 56 + : "memory"); 57 + return p; 58 + } 59 + 60 + static inline u64 __stzg_post(u64 p) 61 + { 62 + asm volatile(__MTE_PREAMBLE "stzg %0, [%0], #16" 63 + : "+r"(p) 64 + : 65 + : "memory"); 66 + return p; 67 + } 68 + 69 + static inline void __dc_gva(u64 p) 70 + { 71 + asm volatile(__MTE_PREAMBLE "dc gva, %0" : : "r"(p) : "memory"); 72 + } 73 + 74 + static inline void __dc_gzva(u64 p) 75 + { 76 + asm volatile(__MTE_PREAMBLE "dc gzva, %0" : : "r"(p) : "memory"); 77 + } 78 + 51 79 /* 52 80 * Assign allocation tags for a region of memory based on the pointer tag. 53 81 * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and 54 - * size must be non-zero and MTE_GRANULE_SIZE aligned. 82 + * size must be MTE_GRANULE_SIZE aligned. 55 83 */ 56 - static inline void mte_set_mem_tag_range(void *addr, size_t size, 57 - u8 tag, bool init) 84 + static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, 85 + bool init) 58 86 { 59 - u64 curr, end; 87 + u64 curr, mask, dczid_bs, end1, end2, end3; 60 88 61 - if (!size) 62 - return; 89 + /* Read DC G(Z)VA block size from the system register. */ 90 + dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf); 63 91 64 92 curr = (u64)__tag_set(addr, tag); 65 - end = curr + size; 93 + mask = dczid_bs - 1; 94 + /* STG/STZG up to the end of the first block. */ 95 + end1 = curr | mask; 96 + end3 = curr + size; 97 + /* DC GVA / GZVA in [end1, end2) */ 98 + end2 = end3 & ~mask; 66 99 67 100 /* 68 - * 'asm volatile' is required to prevent the compiler to move 69 - * the statement outside of the loop. 101 + * The following code uses STG on the first DC GVA block even if the 102 + * start address is aligned - it appears to be faster than an alignment 103 + * check + conditional branch. 
Also, if the range size is at least 2 DC 104 + * GVA blocks, the first two loops can use post-condition to save one 105 + * branch each. 70 106 */ 71 - if (init) { 72 - do { 73 - asm volatile(__MTE_PREAMBLE "stzg %0, [%0]" 74 - : 75 - : "r" (curr) 76 - : "memory"); 77 - curr += MTE_GRANULE_SIZE; 78 - } while (curr != end); 79 - } else { 80 - do { 81 - asm volatile(__MTE_PREAMBLE "stg %0, [%0]" 82 - : 83 - : "r" (curr) 84 - : "memory"); 85 - curr += MTE_GRANULE_SIZE; 86 - } while (curr != end); 87 - } 107 + #define SET_MEMTAG_RANGE(stg_post, dc_gva) \ 108 + do { \ 109 + if (size >= 2 * dczid_bs) { \ 110 + do { \ 111 + curr = stg_post(curr); \ 112 + } while (curr < end1); \ 113 + \ 114 + do { \ 115 + dc_gva(curr); \ 116 + curr += dczid_bs; \ 117 + } while (curr < end2); \ 118 + } \ 119 + \ 120 + while (curr < end3) \ 121 + curr = stg_post(curr); \ 122 + } while (0) 123 + 124 + if (init) 125 + SET_MEMTAG_RANGE(__stzg_post, __dc_gzva); 126 + else 127 + SET_MEMTAG_RANGE(__stg_post, __dc_gva); 128 + #undef SET_MEMTAG_RANGE 88 129 } 89 130 90 131 void mte_enable_kernel_sync(void);