Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sh: Minor optimisations to FPU handling

A number of small optimisations to FPU handling, in particular:

- move the task USEDFPU flag from the thread_info flags field (which
is accessed asynchronously to the thread) to a new status field,
which is only accessed by the thread itself. This allows locking to
be removed in most cases, or can be reduced to a
preempt_disable()/preempt_enable() section.
This mimics the i386 behaviour.

- move the modification of regs->sr and thread_info->status flags out
of save_fpu() to __unlazy_fpu(). This gives the compiler a better
chance to optimise things, as well as making save_fpu() symmetrical
with restore_fpu() and init_fpu().

- implement prepare_to_copy(), so that when creating a thread, we can
unlazy the FPU prior to copying the thread data structures.

Also make sure that the FPU is disabled while in the kernel, in
particular while booting, and for newly created kernel threads,

In a very artificial benchmark, the execution time for 2500000
context switches was reduced from 50 to 45 seconds.

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>

authored by

Stuart Menefy and committed by
Paul Mundt
d3ea9fa0 39ac11c1

+48 -42
+15 -11
arch/sh/include/asm/fpu.h
··· 18 18 19 19 struct task_struct; 20 20 21 - extern void save_fpu(struct task_struct *__tsk, struct pt_regs *regs); 21 + extern void save_fpu(struct task_struct *__tsk); 22 22 void fpu_state_restore(struct pt_regs *regs); 23 23 #else 24 24 25 + #define save_fpu(tsk) do { } while (0) 25 26 #define release_fpu(regs) do { } while (0) 26 27 #define grab_fpu(regs) do { } while (0) 27 28 28 - static inline void save_fpu(struct task_struct *tsk, struct pt_regs *regs) 29 - { 30 - clear_tsk_thread_flag(tsk, TIF_USEDFPU); 31 - } 32 29 #endif 33 30 34 31 struct user_regset; ··· 37 40 unsigned int pos, unsigned int count, 38 41 void *kbuf, void __user *ubuf); 39 42 43 + static inline void __unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs) 44 + { 45 + if (task_thread_info(tsk)->status & TS_USEDFPU) { 46 + task_thread_info(tsk)->status &= ~TS_USEDFPU; 47 + save_fpu(tsk); 48 + release_fpu(regs); 49 + } else 50 + tsk->fpu_counter = 0; 51 + } 52 + 40 53 static inline void unlazy_fpu(struct task_struct *tsk, struct pt_regs *regs) 41 54 { 42 55 preempt_disable(); 43 - if (test_tsk_thread_flag(tsk, TIF_USEDFPU)) 44 - save_fpu(tsk, regs); 45 - else 46 - tsk->fpu_counter = 0; 56 + __unlazy_fpu(tsk, regs); 47 57 preempt_enable(); 48 58 } 49 59 50 60 static inline void clear_fpu(struct task_struct *tsk, struct pt_regs *regs) 51 61 { 52 62 preempt_disable(); 53 - if (test_tsk_thread_flag(tsk, TIF_USEDFPU)) { 54 - clear_tsk_thread_flag(tsk, TIF_USEDFPU); 63 + if (task_thread_info(tsk)->status & TS_USEDFPU) { 64 + task_thread_info(tsk)->status &= ~TS_USEDFPU; 55 65 release_fpu(regs); 56 66 } 57 67 preempt_enable();
+2 -1
arch/sh/include/asm/processor_32.h
··· 56 56 #define SR_DSP 0x00001000 57 57 #define SR_IMASK 0x000000f0 58 58 #define SR_FD 0x00008000 59 + #define SR_MD 0x40000000 59 60 60 61 /* 61 62 * DSP structure and data ··· 137 136 extern void release_thread(struct task_struct *); 138 137 139 138 /* Prepare to copy thread state - unlazy all lazy status */ 140 - #define prepare_to_copy(tsk) do { } while (0) 139 + void prepare_to_copy(struct task_struct *tsk); 141 140 142 141 /* 143 142 * create a kernel thread without removing it from tasklists
+2 -2
arch/sh/include/asm/thread_info.h
··· 51 51 .task = &tsk, \ 52 52 .exec_domain = &default_exec_domain, \ 53 53 .flags = 0, \ 54 + .status = 0, \ 54 55 .cpu = 0, \ 55 56 .preempt_count = INIT_PREEMPT_COUNT, \ 56 57 .addr_limit = KERNEL_DS, \ ··· 118 117 #define TIF_SECCOMP 6 /* secure computing */ 119 118 #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ 120 119 #define TIF_SYSCALL_TRACEPOINT 8 /* for ftrace syscall instrumentation */ 121 - #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ 122 120 #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ 123 121 #define TIF_MEMDIE 18 124 122 #define TIF_FREEZE 19 /* Freezing for suspend */ ··· 130 130 #define _TIF_SECCOMP (1 << TIF_SECCOMP) 131 131 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 132 132 #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) 133 - #define _TIF_USEDFPU (1 << TIF_USEDFPU) 134 133 #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) 135 134 #define _TIF_FREEZE (1 << TIF_FREEZE) 136 135 ··· 162 163 * have to worry about atomic accesses. 163 164 */ 164 165 #define TS_RESTORE_SIGMASK 0x0001 /* restore signal mask in do_signal() */ 166 + #define TS_USEDFPU 0x0002 /* FPU used by this task this quantum */ 165 167 166 168 #ifndef __ASSEMBLY__ 167 169 #define HAVE_SET_RESTORE_SIGMASK 1
+2 -2
arch/sh/kernel/cpu/init.c
··· 311 311 if (fpu_disabled) { 312 312 printk("FPU Disabled\n"); 313 313 current_cpu_data.flags &= ~CPU_HAS_FPU; 314 - disable_fpu(); 315 314 } 316 315 317 316 /* FPU initialization */ 317 + disable_fpu(); 318 318 if ((current_cpu_data.flags & CPU_HAS_FPU)) { 319 - clear_thread_flag(TIF_USEDFPU); 319 + current_thread_info()->status &= ~TS_USEDFPU; 320 320 clear_used_math(); 321 321 } 322 322
+4 -7
arch/sh/kernel/cpu/sh2a/fpu.c
··· 25 25 26 26 /* 27 27 * Save FPU registers onto task structure. 28 - * Assume called with FPU enabled (SR.FD=0). 29 28 */ 30 29 void 31 - save_fpu(struct task_struct *tsk, struct pt_regs *regs) 30 + save_fpu(struct task_struct *tsk) 32 31 { 33 32 unsigned long dummy; 34 33 35 - clear_tsk_thread_flag(tsk, TIF_USEDFPU); 36 34 enable_fpu(); 37 35 asm volatile("sts.l fpul, @-%0\n\t" 38 36 "sts.l fpscr, @-%0\n\t" ··· 58 60 : "memory"); 59 61 60 62 disable_fpu(); 61 - release_fpu(regs); 62 63 } 63 64 64 65 static void ··· 595 598 struct task_struct *tsk = current; 596 599 TRAP_HANDLER_DECL; 597 600 598 - save_fpu(tsk, regs); 601 + __unlazy_fpu(tsk, regs); 599 602 if (ieee_fpe_handler(regs)) { 600 603 tsk->thread.fpu.hard.fpscr &= 601 604 ~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK); 602 605 grab_fpu(regs); 603 606 restore_fpu(tsk); 604 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 607 + task_thread_info(tsk)->status |= TS_USEDFPU; 605 608 return; 606 609 } 607 610 ··· 627 630 fpu_init(); 628 631 set_used_math(); 629 632 } 630 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 633 + task_thread_info(tsk)->status |= TS_USEDFPU; 631 634 }
+4 -8
arch/sh/kernel/cpu/sh4/fpu.c
··· 41 41 42 42 /* 43 43 * Save FPU registers onto task structure. 44 - * Assume called with FPU enabled (SR.FD=0). 45 44 */ 46 - void save_fpu(struct task_struct *tsk, struct pt_regs *regs) 45 + void save_fpu(struct task_struct *tsk) 47 46 { 48 47 unsigned long dummy; 49 48 50 - clear_tsk_thread_flag(tsk, TIF_USEDFPU); 51 49 enable_fpu(); 52 50 asm volatile ("sts.l fpul, @-%0\n\t" 53 51 "sts.l fpscr, @-%0\n\t" ··· 90 92 :"memory"); 91 93 92 94 disable_fpu(); 93 - release_fpu(regs); 94 95 } 95 96 96 97 static void restore_fpu(struct task_struct *tsk) ··· 282 285 /* fcnvsd */ 283 286 struct task_struct *tsk = current; 284 287 285 - save_fpu(tsk, regs); 286 288 if ((tsk->thread.fpu.hard.fpscr & FPSCR_CAUSE_ERROR)) 287 289 /* FPU error */ 288 290 denormal_to_double(&tsk->thread.fpu.hard, ··· 458 462 struct task_struct *tsk = current; 459 463 TRAP_HANDLER_DECL; 460 464 461 - save_fpu(tsk, regs); 465 + __unlazy_fpu(tsk, regs); 462 466 fpu_exception_flags = 0; 463 467 if (ieee_fpe_handler(regs)) { 464 468 tsk->thread.fpu.hard.fpscr &= ··· 469 473 tsk->thread.fpu.hard.fpscr |= (fpu_exception_flags >> 10); 470 474 grab_fpu(regs); 471 475 restore_fpu(tsk); 472 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 476 + task_thread_info(tsk)->status |= TS_USEDFPU; 473 477 if ((((tsk->thread.fpu.hard.fpscr & FPSCR_ENABLE_MASK) >> 7) & 474 478 (fpu_exception_flags >> 2)) == 0) { 475 479 return; ··· 498 502 fpu_init(); 499 503 set_used_math(); 500 504 } 501 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 505 + task_thread_info(tsk)->status |= TS_USEDFPU; 502 506 tsk->fpu_counter++; 503 507 } 504 508
+16 -8
arch/sh/kernel/process_32.c
··· 134 134 regs.regs[5] = (unsigned long)fn; 135 135 136 136 regs.pc = (unsigned long)kernel_thread_helper; 137 - regs.sr = (1 << 30); 137 + regs.sr = SR_MD; 138 + #if defined(CONFIG_SH_FPU) 139 + regs.sr |= SR_FD; 140 + #endif 138 141 139 142 /* Ok, create the new process.. */ 140 143 pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ··· 192 189 } 193 190 EXPORT_SYMBOL(dump_fpu); 194 191 192 + /* 193 + * This gets called before we allocate a new thread and copy 194 + * the current task into it. 195 + */ 196 + void prepare_to_copy(struct task_struct *tsk) 197 + { 198 + unlazy_fpu(tsk, task_pt_regs(tsk)); 199 + } 200 + 195 201 asmlinkage void ret_from_fork(void); 196 202 197 203 int copy_thread(unsigned long clone_flags, unsigned long usp, ··· 209 197 { 210 198 struct thread_info *ti = task_thread_info(p); 211 199 struct pt_regs *childregs; 212 - #if defined(CONFIG_SH_FPU) || defined(CONFIG_SH_DSP) 200 + #if defined(CONFIG_SH_DSP) 213 201 struct task_struct *tsk = current; 214 - #endif 215 - 216 - #if defined(CONFIG_SH_FPU) 217 - unlazy_fpu(tsk, regs); 218 - p->thread.fpu = tsk->thread.fpu; 219 - copy_to_stopped_child_used_math(p); 220 202 #endif 221 203 222 204 #if defined(CONFIG_SH_DSP) ··· 232 226 } else { 233 227 childregs->regs[15] = (unsigned long)childregs; 234 228 ti->addr_limit = KERNEL_DS; 229 + ti->status &= ~TS_USEDFPU; 230 + p->fpu_counter = 0; 235 231 } 236 232 237 233 if (clone_flags & CLONE_SETTLS)
+3 -3
arch/sh/math-emu/math.c
··· 558 558 (finsn >> 8) & 0xf); 559 559 tsk->thread.fpu.hard.fpscr &= 560 560 ~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK); 561 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 561 + task_thread_info(tsk)->status |= TS_USEDFPU; 562 562 } else { 563 563 info.si_signo = SIGFPE; 564 564 info.si_errno = 0; ··· 619 619 struct task_struct *tsk = current; 620 620 struct sh_fpu_soft_struct *fpu = &(tsk->thread.fpu.soft); 621 621 622 - if (!test_tsk_thread_flag(tsk, TIF_USEDFPU)) { 622 + if (!(task_thread_info(tsk)->status & TS_USEDFPU)) { 623 623 /* initialize once. */ 624 624 fpu_init(fpu); 625 - set_tsk_thread_flag(tsk, TIF_USEDFPU); 625 + task_thread_info(tsk)->status |= TS_USEDFPU; 626 626 } 627 627 628 628 return fpu_emulate(inst, fpu, regs);