[IA64] - Reduce overhead of FP exception logging messages

Improve the scalability of the fpswa code that rate-limits
logging of messages.

There are 2 distinctly different problems in this code.

1) If prctl is used to disable logging, last_time is never
updated. The result is that fpu_swa_count is zeroed out on
EVERY fp fault. This causes a very very hot cache line.
The fix reduces the wallclock time of a 1024p FP exception test
from 28734 sec to 19 sec!!!

2) On VERY large systems, excessive messages are logged because
multiple cpus can each reset or increment fpu_swa_count at
about the same time. The result is that hundreds of messages
are logged each second. The fix reduces the logging rate
to ~1 per second.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>

authored by Jack Steiner and committed by Tony Luck 1cf24bdb 8b9c1068

+40 -10
+40 -10
arch/ia64/kernel/traps.c
··· 307 307 return ret.status; 308 308 } 309 309 310 + struct fpu_swa_msg { 311 + unsigned long count; 312 + unsigned long time; 313 + }; 314 + static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast); 315 + DECLARE_PER_CPU(struct fpu_swa_msg, cpulast); 316 + static struct fpu_swa_msg last __cacheline_aligned; 317 + 318 + 310 319 /* 311 320 * Handle floating-point assist faults and traps. 312 321 */ ··· 325 316 long exception, bundle[2]; 326 317 unsigned long fault_ip; 327 318 struct siginfo siginfo; 328 - static int fpu_swa_count = 0; 329 - static unsigned long last_time; 330 319 331 320 fault_ip = regs->cr_iip; 332 321 if (!fp_fault && (ia64_psr(regs)->ri == 0)) ··· 332 325 if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) 333 326 return -1; 334 327 335 - if (jiffies - last_time > 5*HZ) 336 - fpu_swa_count = 0; 337 - if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { 338 - last_time = jiffies; 339 - ++fpu_swa_count; 340 - printk(KERN_WARNING 341 - "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", 342 - current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); 328 + if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { 329 + unsigned long count, current_jiffies = jiffies; 330 + struct fpu_swa_msg *cp = &__get_cpu_var(cpulast); 331 + 332 + if (unlikely(current_jiffies > cp->time)) 333 + cp->count = 0; 334 + if (unlikely(cp->count < 5)) { 335 + cp->count++; 336 + cp->time = current_jiffies + 5 * HZ; 337 + 338 + /* minimize races by grabbing a copy of count BEFORE checking last.time. */ 339 + count = last.count; 340 + barrier(); 341 + 342 + /* 343 + * Lower 4 bits are used as a count. Upper bits are a sequence 344 + * number that is updated when count is reset. The cmpxchg will 345 + * fail is seqno has changed. This minimizes mutiple cpus 346 + * reseting the count. 
347 + */ 348 + if (current_jiffies > last.time) 349 + (void) cmpxchg_acq(&last.count, count, 16 + (count & ~15)); 350 + 351 + /* used fetchadd to atomically update the count */ 352 + if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) { 353 + last.time = current_jiffies + 5 * HZ; 354 + printk(KERN_WARNING 355 + "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", 356 + current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); 357 + } 358 + } 343 359 } 344 360 345 361 exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,