Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nds32: support denormalized result through FP emulator

Currently, the nds32 FPU does not support arithmetic on denormalized
numbers. When the nds32 FPU finds that the result of an instruction is a
denormalized number, it treats this as an underflow condition
and rounds the result to an appropriate number. This may cause some loss
of precision. This commit proposes a solution to re-execute the
instruction by the FPU emulator to enhance the precision. To transfer
calculations from user space to kernel space, this feature will enable
the underflow exception trap by default. Enabling this feature may cause
some side effects:
1. Performance loss due to extra FPU exceptions.
2. Another scheme is needed to control the real underflow trap.
A new parameter, UDF_trap, which belongs to the FPU context, is used
to control the underflow trap.

Users can configure this feature via CONFIG_SUPPORT_DENORMAL_ARITHMETIC.

Signed-off-by: Vincent Chen <vincentc@andestech.com>
Acked-by: Greentime Hu <greentime@andestech.com>
Signed-off-by: Greentime Hu <greentime@andestech.com>

authored by

Vincent Chen and committed by
Greentime Hu
44e92e03 1ac83250

+125 -4
+13
arch/nds32/Kconfig.cpu
··· 28 28 29 29 For nomal case, say Y. 30 30 31 + config SUPPORT_DENORMAL_ARITHMETIC 32 + bool "Denormal arithmetic support" 33 + depends on FPU 34 + default n 35 + help 36 + Say Y here to enable arithmetic of denormalized number. Enabling 37 + this feature can enhance the precision for tininess number. 38 + However, performance loss in float pointe calculations is 39 + possibly significant due to additional FPU exception. 40 + 41 + If the calculated tolerance for tininess number is not critical, 42 + say N to prevent performance loss. 43 + 31 44 config HWZOL 32 45 bool "hardware zero overhead loop support" 33 46 depends on CPU_D10 || CPU_D15
+11
arch/nds32/include/asm/elf.h
··· 9 9 */ 10 10 11 11 #include <asm/ptrace.h> 12 + #include <asm/fpu.h> 12 13 13 14 typedef unsigned long elf_greg_t; 14 15 typedef unsigned long elf_freg_t[3]; ··· 160 159 161 160 #endif 162 161 162 + 163 + #if IS_ENABLED(CONFIG_FPU) 164 + #define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPCSR_INIT) 165 + #else 166 + #define FPU_AUX_ENT NEW_AUX_ENT(AT_IGNORE, 0) 167 + #endif 168 + 163 169 #define ARCH_DLINFO \ 164 170 do { \ 171 + /* Optional FPU initialization */ \ 172 + FPU_AUX_ENT; \ 173 + \ 165 174 NEW_AUX_ENT(AT_SYSINFO_EHDR, \ 166 175 (elf_addr_t)current->mm->context.vdso); \ 167 176 } while (0)
+11
arch/nds32/include/asm/fpu.h
··· 28 28 #define sNAN64 0xFFFFFFFFFFFFFFFFULL 29 29 #define sNAN32 0xFFFFFFFFUL 30 30 31 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 32 + /* 33 + * Denormalized number is unsupported by nds32 FPU. Hence the operation 34 + * is treated as underflow cases when the final result is a denormalized 35 + * number. To enhance precision, underflow exception trap should be 36 + * enabled by default and kerenl will re-execute it by fpu emulator 37 + * when getting underflow exception. 38 + */ 39 + #define FPCSR_INIT FPCSR_mskUDFE 40 + #else 31 41 #define FPCSR_INIT 0x0UL 42 + #endif 32 43 33 44 extern const struct fpu_struct init_fpuregs; 34 45
+1
arch/nds32/include/asm/syscalls.h
··· 7 7 asmlinkage long sys_cacheflush(unsigned long addr, unsigned long len, unsigned int op); 8 8 asmlinkage long sys_fadvise64_64_wrapper(int fd, int advice, loff_t offset, loff_t len); 9 9 asmlinkage long sys_rt_sigreturn_wrapper(void); 10 + asmlinkage long sys_udftrap(int option); 10 11 11 12 #include <asm-generic/syscalls.h> 12 13
+7
arch/nds32/include/uapi/asm/auxvec.h
··· 4 4 #ifndef __ASM_AUXVEC_H 5 5 #define __ASM_AUXVEC_H 6 6 7 + /* 8 + * This entry gives some information about the FPU initialization 9 + * performed by the kernel. 10 + */ 11 + #define AT_FPUCW 18 /* Used FPU control word. */ 12 + 13 + 7 14 /* VDSO location */ 8 15 #define AT_SYSINFO_EHDR 33 9 16
+9
arch/nds32/include/uapi/asm/sigcontext.h
··· 12 12 struct fpu_struct { 13 13 unsigned long long fd_regs[32]; 14 14 unsigned long fpcsr; 15 + /* 16 + * UDF_trap is used to recognize whether underflow trap is enabled 17 + * or not. When UDF_trap == 1, this process will be traped and then 18 + * get a SIGFPE signal when encountering an underflow exception. 19 + * UDF_trap is only modified through setfputrap syscall. Therefore, 20 + * UDF_trap needn't be saved or loaded to context in each context 21 + * switch. 22 + */ 23 + unsigned long UDF_trap; 15 24 }; 16 25 17 26 struct zol_struct {
+13
arch/nds32/include/uapi/asm/udftrap.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (C) 2005-2018 Andes Technology Corporation */ 3 + #ifndef _ASM_SETFPUTRAP 4 + #define _ASM_SETFPUTRAP 5 + 6 + /* 7 + * Options for setfputrap system call 8 + */ 9 + #define DISABLE_UDFTRAP 0 /* disable underflow exception trap */ 10 + #define ENABLE_UDFTRAP 1 /* enable undeflos exception trap */ 11 + #define GET_UDFTRAP 2 /* only get undeflos exception trap status */ 12 + 13 + #endif /* _ASM_CACHECTL */
+2
arch/nds32/include/uapi/asm/unistd.h
··· 9 9 10 10 /* Additional NDS32 specific syscalls. */ 11 11 #define __NR_cacheflush (__NR_arch_specific_syscall) 12 + #define __NR_udftrap (__NR_arch_specific_syscall + 1) 12 13 __SYSCALL(__NR_cacheflush, sys_cacheflush) 14 + __SYSCALL(__NR_udftrap, sys_udftrap)
+21 -4
arch/nds32/kernel/fpu.c
··· 12 12 13 13 const struct fpu_struct init_fpuregs = { 14 14 .fd_regs = {[0 ... 31] = sNAN64}, 15 - .fpcsr = FPCSR_INIT 15 + .fpcsr = FPCSR_INIT, 16 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 17 + .UDF_trap = 0 18 + #endif 16 19 }; 17 20 18 21 void save_fpu(struct task_struct *tsk) ··· 177 174 } else { 178 175 /* First time FPU user. */ 179 176 load_fpu(&init_fpuregs); 177 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 178 + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; 179 + #endif 180 180 set_used_math(); 181 181 } 182 182 ··· 189 183 { 190 184 if (fpcsr & FPCSR_mskOVFT) 191 185 *signo = FPE_FLTOVF; 192 - else if (fpcsr & FPCSR_mskIVOT) 193 - *signo = FPE_FLTINV; 186 + #ifndef CONFIG_SUPPORT_DENORMAL_ARITHMETIC 194 187 else if (fpcsr & FPCSR_mskUDFT) 195 188 *signo = FPE_FLTUND; 189 + #endif 190 + else if (fpcsr & FPCSR_mskIVOT) 191 + *signo = FPE_FLTINV; 196 192 else if (fpcsr & FPCSR_mskDBZT) 197 193 *signo = FPE_FLTDIV; 198 194 else if (fpcsr & FPCSR_mskIEXT) ··· 205 197 { 206 198 unsigned int fpcsr; 207 199 int si_code = 0, si_signo = SIGFPE; 200 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 201 + unsigned long redo_except = FPCSR_mskDNIT|FPCSR_mskUDFT; 202 + #else 203 + unsigned long redo_except = FPCSR_mskDNIT; 204 + #endif 208 205 209 206 lose_fpu(); 210 207 fpcsr = current->thread.fpu.fpcsr; 211 208 212 - if (fpcsr & FPCSR_mskDNIT) { 209 + if (fpcsr & redo_except) { 210 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 211 + if (fpcsr & FPCSR_mskUDFT) 212 + current->thread.fpu.fpcsr &= ~FPCSR_mskIEX; 213 + #endif 213 214 si_signo = do_fpuemu(regs, &current->thread.fpu); 214 215 fpcsr = current->thread.fpu.fpcsr; 215 216 if (!si_signo)
+32
arch/nds32/kernel/sys_nds32.c
··· 6 6 7 7 #include <asm/cachectl.h> 8 8 #include <asm/proc-fns.h> 9 + #include <asm/udftrap.h> 10 + #include <asm/fpu.h> 9 11 10 12 SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, 11 13 unsigned long, prot, unsigned long, flags, ··· 49 47 cpu_cache_wbinval_range_check(vma, start, end, flushi, wbd); 50 48 51 49 return 0; 50 + } 51 + 52 + SYSCALL_DEFINE1(udftrap, int, option) 53 + { 54 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 55 + int old_udftrap; 56 + 57 + if (!used_math()) { 58 + load_fpu(&init_fpuregs); 59 + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; 60 + set_used_math(); 61 + } 62 + 63 + old_udftrap = current->thread.fpu.UDF_trap; 64 + switch (option) { 65 + case DISABLE_UDFTRAP: 66 + current->thread.fpu.UDF_trap = 0; 67 + break; 68 + case ENABLE_UDFTRAP: 69 + current->thread.fpu.UDF_trap = FPCSR_mskUDFE; 70 + break; 71 + case GET_UDFTRAP: 72 + break; 73 + default: 74 + return -EINVAL; 75 + } 76 + return old_udftrap; 77 + #else 78 + return -ENOTSUPP; 79 + #endif 52 80 }
+5
arch/nds32/math-emu/fpuemu.c
··· 304 304 /* 305 305 * If an exception is required, generate a tidy SIGFPE exception. 306 306 */ 307 + #if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) 308 + if (((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE_NO_UDFE) || 309 + ((fpu_reg->fpcsr & FPCSR_mskUDF) && (fpu_reg->UDF_trap))) 310 + #else 307 311 if ((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE) 312 + #endif 308 313 return SIGFPE; 309 314 return 0; 310 315 }