Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/oprofile: Fix mutex locking for cell spu-oprofile

The issue is the SPU code is not holding the kernel mutex lock while
adding samples to the kernel buffer.

This patch creates per SPU buffers to hold the data. Data
is added to the buffers in interrupt context. The data
is periodically pushed to the kernel buffer via a new Oprofile
function oprofile_put_buff(). The oprofile_put_buff() function
is called via a work queue enabling the function to acquire the
mutex lock.

The existing user controls for adjusting the per CPU buffer
size are used to control the size of the per SPU buffers.
Similarly, overflows of the SPU buffers are reported by
incrementing the per CPU buffer stats. This eliminates the
need to have architecture specific controls for the per SPU
buffers which is not acceptable to the OProfile user tool
maintainer.

The export of the oprofile add_event_entry() is removed as it
is no longer needed given this patch.

Note, this patch has not addressed the issue of indexing arrays
by the spu number. This still needs to be fixed as the spu
numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

authored by

Carl Love and committed by
Benjamin Herrenschmidt
a5598ca0 bb5e6491

+279 -36
+13
arch/powerpc/oprofile/cell/pr_util.h
··· 24 24 #define SKIP_GENERIC_SYNC 0 25 25 #define SYNC_START_ERROR -1 26 26 #define DO_GENERIC_SYNC 1 27 + #define SPUS_PER_NODE 8 28 + #define DEFAULT_TIMER_EXPIRE (HZ / 10) 29 + 30 + extern struct delayed_work spu_work; 31 + extern int spu_prof_running; 27 32 28 33 struct spu_overlay_info { /* map of sections within an SPU overlay */ 29 34 unsigned int vma; /* SPU virtual memory address from elf */ ··· 66 61 */ 67 62 68 63 }; 64 + 65 + struct spu_buffer { 66 + int last_guard_val; 67 + int ctx_sw_seen; 68 + unsigned long *buff; 69 + unsigned int head, tail; 70 + }; 71 + 69 72 70 73 /* The three functions below are for maintaining and accessing 71 74 * the vma-to-fileoffset map.
+2 -2
arch/powerpc/oprofile/cell/spu_profiler.c
··· 23 23 24 24 static u32 *samples; 25 25 26 - static int spu_prof_running; 26 + int spu_prof_running; 27 27 static unsigned int profiling_interval; 28 28 29 29 #define NUM_SPU_BITS_TRBUF 16 30 30 #define SPUS_PER_TB_ENTRY 4 31 - #define SPUS_PER_NODE 8 32 31 33 32 #define SPU_PC_MASK 0xFFFF 34 33 ··· 207 208 208 209 spu_prof_running = 1; 209 210 hrtimer_start(&timer, kt, HRTIMER_MODE_REL); 211 + schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); 210 212 211 213 return 0; 212 214 }
+210 -26
arch/powerpc/oprofile/cell/spu_task_sync.c
··· 35 35 static DEFINE_SPINLOCK(cache_lock); 36 36 static int num_spu_nodes; 37 37 int spu_prof_num_nodes; 38 - int last_guard_val[MAX_NUMNODES * 8]; 38 + 39 + struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE]; 40 + struct delayed_work spu_work; 41 + static unsigned max_spu_buff; 42 + 43 + static void spu_buff_add(unsigned long int value, int spu) 44 + { 45 + /* spu buff is a circular buffer. Add entries to the 46 + * head. Head is the index to store the next value. 47 + * The buffer is full when there is one available entry 48 + * in the queue, i.e. head and tail can't be equal. 49 + * That way we can tell the difference between the 50 + * buffer being full versus empty. 51 + * 52 + * ASSUPTION: the buffer_lock is held when this function 53 + * is called to lock the buffer, head and tail. 54 + */ 55 + int full = 1; 56 + 57 + if (spu_buff[spu].head >= spu_buff[spu].tail) { 58 + if ((spu_buff[spu].head - spu_buff[spu].tail) 59 + < (max_spu_buff - 1)) 60 + full = 0; 61 + 62 + } else if (spu_buff[spu].tail > spu_buff[spu].head) { 63 + if ((spu_buff[spu].tail - spu_buff[spu].head) 64 + > 1) 65 + full = 0; 66 + } 67 + 68 + if (!full) { 69 + spu_buff[spu].buff[spu_buff[spu].head] = value; 70 + spu_buff[spu].head++; 71 + 72 + if (spu_buff[spu].head >= max_spu_buff) 73 + spu_buff[spu].head = 0; 74 + } else { 75 + /* From the user's perspective make the SPU buffer 76 + * size management/overflow look like we are using 77 + * per cpu buffers. The user uses the same 78 + * per cpu parameter to adjust the SPU buffer size. 79 + * Increment the sample_lost_overflow to inform 80 + * the user the buffer size needs to be increased. 81 + */ 82 + oprofile_cpu_buffer_inc_smpl_lost(); 83 + } 84 + } 85 + 86 + /* This function copies the per SPU buffers to the 87 + * OProfile kernel buffer. 
88 + */ 89 + void sync_spu_buff(void) 90 + { 91 + int spu; 92 + unsigned long flags; 93 + int curr_head; 94 + 95 + for (spu = 0; spu < num_spu_nodes; spu++) { 96 + /* In case there was an issue and the buffer didn't 97 + * get created skip it. 98 + */ 99 + if (spu_buff[spu].buff == NULL) 100 + continue; 101 + 102 + /* Hold the lock to make sure the head/tail 103 + * doesn't change while spu_buff_add() is 104 + * deciding if the buffer is full or not. 105 + * Being a little paranoid. 106 + */ 107 + spin_lock_irqsave(&buffer_lock, flags); 108 + curr_head = spu_buff[spu].head; 109 + spin_unlock_irqrestore(&buffer_lock, flags); 110 + 111 + /* Transfer the current contents to the kernel buffer. 112 + * data can still be added to the head of the buffer. 113 + */ 114 + oprofile_put_buff(spu_buff[spu].buff, 115 + spu_buff[spu].tail, 116 + curr_head, max_spu_buff); 117 + 118 + spin_lock_irqsave(&buffer_lock, flags); 119 + spu_buff[spu].tail = curr_head; 120 + spin_unlock_irqrestore(&buffer_lock, flags); 121 + } 122 + 123 + } 124 + 125 + static void wq_sync_spu_buff(struct work_struct *work) 126 + { 127 + /* move data from spu buffers to kernel buffer */ 128 + sync_spu_buff(); 129 + 130 + /* only reschedule if profiling is not done */ 131 + if (spu_prof_running) 132 + schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); 133 + } 39 134 40 135 /* Container for caching information about an active SPU task. 
*/ 41 136 struct cached_info { ··· 400 305 401 306 /* Record context info in event buffer */ 402 307 spin_lock_irqsave(&buffer_lock, flags); 403 - add_event_entry(ESCAPE_CODE); 404 - add_event_entry(SPU_CTX_SWITCH_CODE); 405 - add_event_entry(spu->number); 406 - add_event_entry(spu->pid); 407 - add_event_entry(spu->tgid); 408 - add_event_entry(app_dcookie); 409 - add_event_entry(spu_cookie); 410 - add_event_entry(offset); 308 + spu_buff_add(ESCAPE_CODE, spu->number); 309 + spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number); 310 + spu_buff_add(spu->number, spu->number); 311 + spu_buff_add(spu->pid, spu->number); 312 + spu_buff_add(spu->tgid, spu->number); 313 + spu_buff_add(app_dcookie, spu->number); 314 + spu_buff_add(spu_cookie, spu->number); 315 + spu_buff_add(offset, spu->number); 316 + 317 + /* Set flag to indicate SPU PC data can now be written out. If 318 + * the SPU program counter data is seen before an SPU context 319 + * record is seen, the postprocessing will fail. 320 + */ 321 + spu_buff[spu->number].ctx_sw_seen = 1; 322 + 411 323 spin_unlock_irqrestore(&buffer_lock, flags); 412 324 smp_wmb(); /* insure spu event buffer updates are written */ 413 325 /* don't want entries intermingled... */ ··· 462 360 return nodes; 463 361 } 464 362 363 + static int oprofile_spu_buff_create(void) 364 + { 365 + int spu; 366 + 367 + max_spu_buff = oprofile_get_cpu_buffer_size(); 368 + 369 + for (spu = 0; spu < num_spu_nodes; spu++) { 370 + /* create circular buffers to store the data in. 371 + * use locks to manage accessing the buffers 372 + */ 373 + spu_buff[spu].head = 0; 374 + spu_buff[spu].tail = 0; 375 + 376 + /* 377 + * Create a buffer for each SPU. Can't reliably 378 + * create a single buffer for all spus due to not 379 + * enough contiguous kernel memory. 
380 + */ 381 + 382 + spu_buff[spu].buff = kzalloc((max_spu_buff 383 + * sizeof(unsigned long)), 384 + GFP_KERNEL); 385 + 386 + if (!spu_buff[spu].buff) { 387 + printk(KERN_ERR "SPU_PROF: " 388 + "%s, line %d: oprofile_spu_buff_create " 389 + "failed to allocate spu buffer %d.\n", 390 + __func__, __LINE__, spu); 391 + 392 + /* release the spu buffers that have been allocated */ 393 + while (spu >= 0) { 394 + kfree(spu_buff[spu].buff); 395 + spu_buff[spu].buff = 0; 396 + spu--; 397 + } 398 + return -ENOMEM; 399 + } 400 + } 401 + return 0; 402 + } 403 + 465 404 /* The main purpose of this function is to synchronize 466 405 * OProfile with SPUFS by registering to be notified of 467 406 * SPU task switches. ··· 515 372 */ 516 373 int spu_sync_start(void) 517 374 { 518 - int k; 375 + int spu; 519 376 int ret = SKIP_GENERIC_SYNC; 520 377 int register_ret; 521 378 unsigned long flags = 0; 522 379 523 380 spu_prof_num_nodes = number_of_online_nodes(); 524 381 num_spu_nodes = spu_prof_num_nodes * 8; 382 + INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff); 383 + 384 + /* create buffer for storing the SPU data to put in 385 + * the kernel buffer. 
386 + */ 387 + ret = oprofile_spu_buff_create(); 388 + if (ret) 389 + goto out; 525 390 526 391 spin_lock_irqsave(&buffer_lock, flags); 527 - add_event_entry(ESCAPE_CODE); 528 - add_event_entry(SPU_PROFILING_CODE); 529 - add_event_entry(num_spu_nodes); 392 + for (spu = 0; spu < num_spu_nodes; spu++) { 393 + spu_buff_add(ESCAPE_CODE, spu); 394 + spu_buff_add(SPU_PROFILING_CODE, spu); 395 + spu_buff_add(num_spu_nodes, spu); 396 + } 530 397 spin_unlock_irqrestore(&buffer_lock, flags); 398 + 399 + for (spu = 0; spu < num_spu_nodes; spu++) { 400 + spu_buff[spu].ctx_sw_seen = 0; 401 + spu_buff[spu].last_guard_val = 0; 402 + } 531 403 532 404 /* Register for SPU events */ 533 405 register_ret = spu_switch_event_register(&spu_active); ··· 551 393 goto out; 552 394 } 553 395 554 - for (k = 0; k < (MAX_NUMNODES * 8); k++) 555 - last_guard_val[k] = 0; 556 396 pr_debug("spu_sync_start -- running.\n"); 557 397 out: 558 398 return ret; ··· 602 446 * use. We need to discard samples taken during the time 603 447 * period which an overlay occurs (i.e., guard value changes). 604 448 */ 605 - if (grd_val && grd_val != last_guard_val[spu_num]) { 606 - last_guard_val[spu_num] = grd_val; 449 + if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) { 450 + spu_buff[spu_num].last_guard_val = grd_val; 607 451 /* Drop the rest of the samples. */ 608 452 break; 609 453 } 610 454 611 - add_event_entry(file_offset | spu_num_shifted); 455 + /* We must ensure that the SPU context switch has been written 456 + * out before samples for the SPU. Otherwise, the SPU context 457 + * information is not available and the postprocessing of the 458 + * SPU PC will fail with no available anonymous map information. 
459 + */ 460 + if (spu_buff[spu_num].ctx_sw_seen) 461 + spu_buff_add((file_offset | spu_num_shifted), 462 + spu_num); 612 463 } 613 464 spin_unlock(&buffer_lock); 614 465 out: ··· 626 463 int spu_sync_stop(void) 627 464 { 628 465 unsigned long flags = 0; 629 - int ret = spu_switch_event_unregister(&spu_active); 630 - if (ret) { 466 + int ret; 467 + int k; 468 + 469 + ret = spu_switch_event_unregister(&spu_active); 470 + 471 + if (ret) 631 472 printk(KERN_ERR "SPU_PROF: " 632 - "%s, line %d: spu_switch_event_unregister returned %d\n", 633 - __func__, __LINE__, ret); 634 - goto out; 635 - } 473 + "%s, line %d: spu_switch_event_unregister " \ 474 + "returned %d\n", 475 + __func__, __LINE__, ret); 476 + 477 + /* flush any remaining data in the per SPU buffers */ 478 + sync_spu_buff(); 636 479 637 480 spin_lock_irqsave(&cache_lock, flags); 638 481 ret = release_cached_info(RELEASE_ALL); 639 482 spin_unlock_irqrestore(&cache_lock, flags); 640 - out: 483 + 484 + /* remove scheduled work queue item rather then waiting 485 + * for every queued entry to execute. Then flush pending 486 + * system wide buffer to event buffer. 487 + */ 488 + cancel_delayed_work(&spu_work); 489 + 490 + for (k = 0; k < num_spu_nodes; k++) { 491 + spu_buff[k].ctx_sw_seen = 0; 492 + 493 + /* 494 + * spu_sys_buff will be null if there was a problem 495 + * allocating the buffer. Only delete if it exists. 496 + */ 497 + kfree(spu_buff[k].buff); 498 + spu_buff[k].buff = 0; 499 + } 641 500 pr_debug("spu_sync_stop -- done.\n"); 642 501 return ret; 643 502 } 644 - 645 503
+24
drivers/oprofile/buffer_sync.c
··· 628 628 629 629 mutex_unlock(&buffer_mutex); 630 630 } 631 + 632 + /* The function can be used to add a buffer worth of data directly to 633 + * the kernel buffer. The buffer is assumed to be a circular buffer. 634 + * Take the entries from index start and end at index end, wrapping 635 + * at max_entries. 636 + */ 637 + void oprofile_put_buff(unsigned long *buf, unsigned int start, 638 + unsigned int stop, unsigned int max) 639 + { 640 + int i; 641 + 642 + i = start; 643 + 644 + mutex_lock(&buffer_mutex); 645 + while (i != stop) { 646 + add_event_entry(buf[i++]); 647 + 648 + if (i >= max) 649 + i = 0; 650 + } 651 + 652 + mutex_unlock(&buffer_mutex); 653 + } 654 +
+14 -1
drivers/oprofile/cpu_buffer.c
··· 38 38 void free_cpu_buffers(void) 39 39 { 40 40 int i; 41 - 41 + 42 42 for_each_online_cpu(i) { 43 43 vfree(per_cpu(cpu_buffer, i).buffer); 44 44 per_cpu(cpu_buffer, i).buffer = NULL; 45 45 } 46 + } 47 + 48 + unsigned long oprofile_get_cpu_buffer_size(void) 49 + { 50 + return fs_cpu_buffer_size; 51 + } 52 + 53 + void oprofile_cpu_buffer_inc_smpl_lost(void) 54 + { 55 + struct oprofile_cpu_buffer *cpu_buf 56 + = &__get_cpu_var(cpu_buffer); 57 + 58 + cpu_buf->sample_lost_overflow++; 46 59 } 47 60 48 61 int alloc_cpu_buffers(void)
+7
drivers/oprofile/event_buffer.h
··· 17 17 18 18 void free_event_buffer(void); 19 19 20 + /** 21 + * Add data to the event buffer. 22 + * The data passed is free-form, but typically consists of 23 + * file offsets, dcookies, context information, and ESCAPE codes. 24 + */ 25 + void add_event_entry(unsigned long data); 26 + 20 27 /* wake up the process sleeping on the event file */ 21 28 void wake_up_buffer_waiter(void); 22 29
+9 -7
include/linux/oprofile.h
··· 86 86 void oprofile_arch_exit(void); 87 87 88 88 /** 89 - * Add data to the event buffer. 90 - * The data passed is free-form, but typically consists of 91 - * file offsets, dcookies, context information, and ESCAPE codes. 92 - */ 93 - void add_event_entry(unsigned long data); 94 - 95 - /** 96 89 * Add a sample. This may be called from any context. Pass 97 90 * smp_processor_id() as cpu. 98 91 */ ··· 155 162 156 163 /** lock for read/write safety */ 157 164 extern spinlock_t oprofilefs_lock; 165 + 166 + /** 167 + * Add the contents of a circular buffer to the event buffer. 168 + */ 169 + void oprofile_put_buff(unsigned long *buf, unsigned int start, 170 + unsigned int stop, unsigned int max); 171 + 172 + unsigned long oprofile_get_cpu_buffer_size(void); 173 + void oprofile_cpu_buffer_inc_smpl_lost(void); 158 174 159 175 #endif /* OPROFILE_H */