···1010.section .noinstr.text, "ax"11111212/*1313- * We build a jump to memcpy_orig by default which gets NOPped out on1414- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which1515- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs1616- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.1717- */1818-1919-/*2013 * memcpy - Copy a memory block.2114 *2215 * Input:···1926 *2027 * Output:2128 * rax original destination2929+ *3030+ * The FSRM alternative should be done inline (avoiding the call and3131+ * the disgusting return handling), but that would require some help3232+ * from the compiler for better calling conventions.3333+ *3434+ * The 'rep movsb' itself is small enough to replace the call, but the3535+ * two register moves blow up the code. And one of them is "needed"3636+ * only for the return value that is the same as the destination input,3737+ * which the compiler could/should do much better anyway.2238 */2339SYM_TYPED_FUNC_START(__memcpy)2424- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \2525- "jmp memcpy_erms", X86_FEATURE_ERMS4040+ ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM26412742 movq %rdi, %rax2843 movq %rdx, %rcx2929- shrq $3, %rcx3030- andl $7, %edx3131- rep movsq3232- movl %edx, %ecx3344 rep movsb3445 RET3546SYM_FUNC_END(__memcpy)···41444245SYM_FUNC_ALIAS(memcpy, __memcpy)4346EXPORT_SYMBOL(memcpy)4444-4545-/*4646- * memcpy_erms() - enhanced fast string memcpy. This is faster and4747- * simpler than memcpy. Use memcpy_erms when possible.4848- */4949-SYM_FUNC_START_LOCAL(memcpy_erms)5050- movq %rdi, %rax5151- movq %rdx, %rcx5252- rep movsb5353- RET5454-SYM_FUNC_END(memcpy_erms)55475648SYM_FUNC_START_LOCAL(memcpy_orig)5749 movq %rdi, %rax
+11-36
tools/arch/x86/lib/memset_64.S
···1818 * rdx count (bytes)1919 *2020 * rax original destination2121+ *2222+ * The FSRS alternative should be done inline (avoiding the call and2323+ * the disgusting return handling), but that would require some help2424+ * from the compiler for better calling conventions.2525+ *2626+ * The 'rep stosb' itself is small enough to replace the call, but all2727+ * the register moves blow up the code. And two of them are "needed"2828+ * only for the return value that is the same as the destination input,2929+ * which the compiler could/should do much better anyway.2130 */2231SYM_FUNC_START(__memset)2323- /*2424- * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended2525- * to use it when possible. If not available, use fast string instructions.2626- *2727- * Otherwise, use original memset function.2828- */2929- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \3030- "jmp memset_erms", X86_FEATURE_ERMS3232+ ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS31333234 movq %rdi,%r93535+ movb %sil,%al3336 movq %rdx,%rcx3434- andl $7,%edx3535- shrq $3,%rcx3636- /* expand byte value */3737- movzbl %sil,%esi3838- movabs $0x0101010101010101,%rax3939- imulq %rsi,%rax4040- rep stosq4141- movl %edx,%ecx4237 rep stosb4338 movq %r9,%rax4439 RET···42474348SYM_FUNC_ALIAS(memset, __memset)4449EXPORT_SYMBOL(memset)4545-4646-/*4747- * ISO C memset - set a memory block to a byte value. This function uses4848- * enhanced rep stosb to override the fast string function.4949- * The code is simpler and shorter than the fast string function as well.5050- *5151- * rdi destination5252- * rsi value (char)5353- * rdx count (bytes)5454- *5555- * rax original destination5656- */5757-SYM_FUNC_START_LOCAL(memset_erms)5858- movq %rdi,%r95959- movb %sil,%al6060- movq %rdx,%rcx6161- rep stosb6262- movq %r9,%rax6363- RET6464-SYM_FUNC_END(memset_erms)65506651SYM_FUNC_START_LOCAL(memset_orig)6752 movq %rdi,%r10
+1-2
tools/include/asm/alternative.h
···4455/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */6677-#define altinstruction_entry #88-#define ALTERNATIVE_2 #77+#define ALTERNATIVE #98109#endif
···449449444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset450450445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule451451446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self452452-# 447 reserved for memfd_secret452452+447 common memfd_secret sys_memfd_secret sys_memfd_secret453453448 common process_mrelease sys_process_mrelease sys_process_mrelease454454449 common futex_waitv sys_futex_waitv sys_futex_waitv455455450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
-4
tools/perf/bench/mem-memcpy-x86-64-asm-def.h
···77MEMCPY_FN(__memcpy,88 "x86-64-movsq",99 "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")1010-1111-MEMCPY_FN(memcpy_erms,1212- "x86-64-movsb",1313- "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
+1-1
tools/perf/bench/mem-memcpy-x86-64-asm.S
···2233/* Various wrappers to make the kernel .S file build in user-space: */4455-// memcpy_orig and memcpy_erms are being defined as SYM_L_LOCAL but we need it55+// memcpy_orig is being defined as SYM_L_LOCAL but we need it66#define SYM_FUNC_START_LOCAL(name) \77 SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)88#define memcpy MEMCPY /* don't hide glibc's memcpy() */
-4
tools/perf/bench/mem-memset-x86-64-asm-def.h
···77MEMSET_FN(__memset,88 "x86-64-stosq",99 "movsq-based memset() in arch/x86/lib/memset_64.S")1010-1111-MEMSET_FN(memset_erms,1212- "x86-64-stosb",1313- "movsb-based memset() in arch/x86/lib/memset_64.S")
+1-1
tools/perf/bench/mem-memset-x86-64-asm.S
···11/* SPDX-License-Identifier: GPL-2.0 */22-// memset_orig and memset_erms are being defined as SYM_L_LOCAL but we need it22+// memset_orig is being defined as SYM_L_LOCAL but we need it33#define SYM_FUNC_START_LOCAL(name) \44 SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)55#define memset MEMSET /* don't hide glibc's memset() */