Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched:
sched: tweak the sched_runtime_limit tunable
sched: skip updating rq's next_balance under null SD
sched: fix broken SMT/MC optimizations
sched: accounting regression since rc1
sched: fix sysctl directory permissions
sched: sched_clock_idle_[sleep|wakeup]_event()

+110 -43
-1
arch/i386/kernel/tsc.c
··· 292 292 293 293 void mark_tsc_unstable(char *reason) 294 294 { 295 - sched_clock_unstable_event(); 296 295 if (!tsc_unstable) { 297 296 tsc_unstable = 1; 298 297 tsc_enabled = 0;
+25 -7
drivers/acpi/processor_idle.c
··· 63 63 ACPI_MODULE_NAME("processor_idle"); 64 64 #define ACPI_PROCESSOR_FILE_POWER "power" 65 65 #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) 66 + #define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) 66 67 #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 67 68 #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 68 69 static void (*pm_idle_save) (void) __read_mostly; ··· 463 462 * TBD: Can't get time duration while in C1, as resumes 464 463 * go to an ISR rather than here. Need to instrument 465 464 * base interrupt handler. 465 + * 466 + * Note: the TSC better not stop in C1, sched_clock() will 467 + * skew otherwise. 466 468 */ 467 469 sleep_ticks = 0xFFFFFFFF; 468 470 break; ··· 473 469 case ACPI_STATE_C2: 474 470 /* Get start time (ticks) */ 475 471 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); 472 + /* Tell the scheduler that we are going deep-idle: */ 473 + sched_clock_idle_sleep_event(); 476 474 /* Invoke C2 */ 477 475 acpi_state_timer_broadcast(pr, cx, 1); 478 476 acpi_cstate_enter(cx); ··· 485 479 /* TSC halts in C2, so notify users */ 486 480 mark_tsc_unstable("possible TSC halt in C2"); 487 481 #endif 482 + /* Compute time (ticks) that we were actually asleep */ 483 + sleep_ticks = ticks_elapsed(t1, t2); 484 + 485 + /* Tell the scheduler how much we idled: */ 486 + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); 487 + 488 488 /* Re-enable interrupts */ 489 489 local_irq_enable(); 490 + /* Do not account our idle-switching overhead: */ 491 + sleep_ticks -= cx->latency_ticks + C2_OVERHEAD; 492 + 490 493 current_thread_info()->status |= TS_POLLING; 491 - /* Compute time (ticks) that we were actually asleep */ 492 - sleep_ticks = 493 - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; 494 494 acpi_state_timer_broadcast(pr, cx, 0); 495 495 break; 496 496 497 497 case ACPI_STATE_C3: 498 - 499 498 /* 500 499 * disable bus master 501 500 * bm_check implies we need ARB_DIS ··· 529 518 t1 = 
inl(acpi_gbl_FADT.xpm_timer_block.address); 530 519 /* Invoke C3 */ 531 520 acpi_state_timer_broadcast(pr, cx, 1); 521 + /* Tell the scheduler that we are going deep-idle: */ 522 + sched_clock_idle_sleep_event(); 532 523 acpi_cstate_enter(cx); 533 524 /* Get end time (ticks) */ 534 525 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); ··· 544 531 /* TSC halts in C3, so notify users */ 545 532 mark_tsc_unstable("TSC halts in C3"); 546 533 #endif 534 + /* Compute time (ticks) that we were actually asleep */ 535 + sleep_ticks = ticks_elapsed(t1, t2); 536 + /* Tell the scheduler how much we idled: */ 537 + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); 538 + 547 539 /* Re-enable interrupts */ 548 540 local_irq_enable(); 541 + /* Do not account our idle-switching overhead: */ 542 + sleep_ticks -= cx->latency_ticks + C3_OVERHEAD; 543 + 549 544 current_thread_info()->status |= TS_POLLING; 550 - /* Compute time (ticks) that we were actually asleep */ 551 - sleep_ticks = 552 - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; 553 545 acpi_state_timer_broadcast(pr, cx, 0); 554 546 break; 555 547
+29 -15
fs/proc/array.c
··· 320 320 return buffer - orig; 321 321 } 322 322 323 - static clock_t task_utime(struct task_struct *p) 323 + /* 324 + * Use precise platform statistics if available: 325 + */ 326 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING 327 + static cputime_t task_utime(struct task_struct *p) 328 + { 329 + return p->utime; 330 + } 331 + 332 + static cputime_t task_stime(struct task_struct *p) 333 + { 334 + return p->stime; 335 + } 336 + #else 337 + static cputime_t task_utime(struct task_struct *p) 324 338 { 325 339 clock_t utime = cputime_to_clock_t(p->utime), 326 340 total = utime + cputime_to_clock_t(p->stime); ··· 351 337 } 352 338 utime = (clock_t)temp; 353 339 354 - return utime; 340 + return clock_t_to_cputime(utime); 355 341 } 356 342 357 - static clock_t task_stime(struct task_struct *p) 343 + static cputime_t task_stime(struct task_struct *p) 358 344 { 359 345 clock_t stime; 360 346 ··· 363 349 * the total, to make sure the total observed by userspace 364 350 * grows monotonically - apps rely on that): 365 351 */ 366 - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p); 352 + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - 353 + cputime_to_clock_t(task_utime(p)); 367 354 368 - return stime; 355 + return clock_t_to_cputime(stime); 369 356 } 357 + #endif 370 358 371 359 static int do_task_stat(struct task_struct *task, char *buffer, int whole) 372 360 { ··· 384 368 unsigned long long start_time; 385 369 unsigned long cmin_flt = 0, cmaj_flt = 0; 386 370 unsigned long min_flt = 0, maj_flt = 0; 387 - cputime_t cutime, cstime; 388 - clock_t utime, stime; 371 + cputime_t cutime, cstime, utime, stime; 389 372 unsigned long rsslim = 0; 390 373 char tcomm[sizeof(task->comm)]; 391 374 unsigned long flags; ··· 402 387 403 388 sigemptyset(&sigign); 404 389 sigemptyset(&sigcatch); 405 - cutime = cstime = cputime_zero; 406 - utime = stime = 0; 390 + cutime = cstime = utime = stime = cputime_zero; 407 391 408 392 rcu_read_lock(); 409 393 if (lock_task_sighand(task, &flags)) 
{ ··· 428 414 do { 429 415 min_flt += t->min_flt; 430 416 maj_flt += t->maj_flt; 431 - utime += task_utime(t); 432 - stime += task_stime(t); 417 + utime = cputime_add(utime, task_utime(t)); 418 + stime = cputime_add(stime, task_stime(t)); 433 419 t = next_thread(t); 434 420 } while (t != task); 435 421 436 422 min_flt += sig->min_flt; 437 423 maj_flt += sig->maj_flt; 438 - utime += cputime_to_clock_t(sig->utime); 439 - stime += cputime_to_clock_t(sig->stime); 424 + utime = cputime_add(utime, sig->utime); 425 + stime = cputime_add(stime, sig->stime); 440 426 } 441 427 442 428 sid = signal_session(sig); ··· 485 471 cmin_flt, 486 472 maj_flt, 487 473 cmaj_flt, 488 - utime, 489 - stime, 474 + cputime_to_clock_t(utime), 475 + cputime_to_clock_t(stime), 490 476 cputime_to_clock_t(cutime), 491 477 cputime_to_clock_t(cstime), 492 478 priority,
+3 -2
include/linux/sched.h
··· 681 681 #define SCHED_LOAD_SHIFT 10 682 682 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 683 683 684 - #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) 684 + #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE 685 685 686 686 #ifdef CONFIG_SMP 687 687 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ ··· 1388 1388 #define sched_exec() {} 1389 1389 #endif 1390 1390 1391 - extern void sched_clock_unstable_event(void); 1391 + extern void sched_clock_idle_sleep_event(void); 1392 + extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1392 1393 1393 1394 #ifdef CONFIG_HOTPLUG_CPU 1394 1395 extern void idle_task_exit(void);
+51 -17
kernel/sched.c
··· 262 262 s64 clock_max_delta; 263 263 264 264 unsigned int clock_warps, clock_overflows; 265 - unsigned int clock_unstable_events; 265 + u64 idle_clock; 266 + unsigned int clock_deep_idle_events; 266 267 u64 tick_timestamp; 267 268 268 269 atomic_t nr_iowait; ··· 557 556 } 558 557 559 558 /* 560 - * CPU frequency is/was unstable - start new by setting prev_clock_raw: 559 + * We are going deep-idle (irqs are disabled): 561 560 */ 562 - void sched_clock_unstable_event(void) 561 + void sched_clock_idle_sleep_event(void) 563 562 { 564 - unsigned long flags; 565 - struct rq *rq; 563 + struct rq *rq = cpu_rq(smp_processor_id()); 566 564 567 - rq = task_rq_lock(current, &flags); 568 - rq->prev_clock_raw = sched_clock(); 569 - rq->clock_unstable_events++; 570 - task_rq_unlock(rq, &flags); 565 + spin_lock(&rq->lock); 566 + __update_rq_clock(rq); 567 + spin_unlock(&rq->lock); 568 + rq->clock_deep_idle_events++; 571 569 } 570 + EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); 571 + 572 + /* 573 + * We just idled delta nanoseconds (called with irqs disabled): 574 + */ 575 + void sched_clock_idle_wakeup_event(u64 delta_ns) 576 + { 577 + struct rq *rq = cpu_rq(smp_processor_id()); 578 + u64 now = sched_clock(); 579 + 580 + rq->idle_clock += delta_ns; 581 + /* 582 + * Override the previous timestamp and ignore all 583 + * sched_clock() deltas that occurred while we idled, 584 + * and use the PM-provided delta_ns to advance the 585 + * rq clock: 586 + */ 587 + spin_lock(&rq->lock); 588 + rq->prev_clock_raw = now; 589 + rq->clock += delta_ns; 590 + spin_unlock(&rq->lock); 591 + } 592 + EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 572 593 573 594 /* 574 595 * resched_task - mark a task 'to be rescheduled now'. 
··· 2517 2494 * a think about bumping its value to force at least one task to be 2518 2495 * moved 2519 2496 */ 2520 - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { 2497 + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { 2521 2498 unsigned long tmp, pwr_now, pwr_move; 2522 2499 unsigned int imbn; 2523 2500 ··· 3043 3020 struct sched_domain *sd; 3044 3021 /* Earliest time when we have to do rebalance again */ 3045 3022 unsigned long next_balance = jiffies + 60*HZ; 3023 + int update_next_balance = 0; 3046 3024 3047 3025 for_each_domain(cpu, sd) { 3048 3026 if (!(sd->flags & SD_LOAD_BALANCE)) ··· 3080 3056 if (sd->flags & SD_SERIALIZE) 3081 3057 spin_unlock(&balancing); 3082 3058 out: 3083 - if (time_after(next_balance, sd->last_balance + interval)) 3059 + if (time_after(next_balance, sd->last_balance + interval)) { 3084 3060 next_balance = sd->last_balance + interval; 3061 + update_next_balance = 1; 3062 + } 3085 3063 3086 3064 /* 3087 3065 * Stop the load balance at this level. There is another ··· 3093 3067 if (!balance) 3094 3068 break; 3095 3069 } 3096 - rq->next_balance = next_balance; 3070 + 3071 + /* 3072 + * next_balance will be updated only when there is a need. 3073 + * When the cpu is attached to null domain for ex, it will not be 3074 + * updated. 
3075 + */ 3076 + if (likely(update_next_balance)) 3077 + rq->next_balance = next_balance; 3097 3078 } 3098 3079 3099 3080 /* ··· 4923 4890 if (sysctl_sched_granularity > gran_limit) 4924 4891 sysctl_sched_granularity = gran_limit; 4925 4892 4926 - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; 4893 + sysctl_sched_runtime_limit = sysctl_sched_granularity * 8; 4927 4894 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4928 4895 } 4929 4896 ··· 5267 5234 static struct ctl_table sd_ctl_dir[] = { 5268 5235 { 5269 5236 .procname = "sched_domain", 5270 - .mode = 0755, 5237 + .mode = 0555, 5271 5238 }, 5272 5239 {0,}, 5273 5240 }; 5274 5241 5275 5242 static struct ctl_table sd_ctl_root[] = { 5276 5243 { 5244 + .ctl_name = CTL_KERN, 5277 5245 .procname = "kernel", 5278 - .mode = 0755, 5246 + .mode = 0555, 5279 5247 .child = sd_ctl_dir, 5280 5248 }, 5281 5249 {0,}, ··· 5352 5318 for_each_domain(cpu, sd) { 5353 5319 snprintf(buf, 32, "domain%d", i); 5354 5320 entry->procname = kstrdup(buf, GFP_KERNEL); 5355 - entry->mode = 0755; 5321 + entry->mode = 0555; 5356 5322 entry->child = sd_alloc_ctl_domain_table(sd); 5357 5323 entry++; 5358 5324 i++; ··· 5372 5338 for (i = 0; i < cpu_num; i++, entry++) { 5373 5339 snprintf(buf, 32, "cpu%d", i); 5374 5340 entry->procname = kstrdup(buf, GFP_KERNEL); 5375 - entry->mode = 0755; 5341 + entry->mode = 0555; 5376 5342 entry->child = sd_alloc_ctl_cpu_table(i); 5377 5343 } 5378 5344 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+2 -1
kernel/sched_debug.c
··· 154 154 P(next_balance); 155 155 P(curr->pid); 156 156 P(clock); 157 + P(idle_clock); 157 158 P(prev_clock_raw); 158 159 P(clock_warps); 159 160 P(clock_overflows); 160 - P(clock_unstable_events); 161 + P(clock_deep_idle_events); 161 162 P(clock_max_delta); 162 163 P(cpu_load[0]); 163 164 P(cpu_load[1]);