Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
"This contains misc fixes: preempt_schedule_common() and io_schedule()
recursion fixes, sched/dl fixes, a completion_done() revert, two
sched/rt fixes and a comment update patch"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/rt: Avoid obvious configuration fail
sched/autogroup: Fix failure to set cpu.rt_runtime_us
sched/dl: Do update_rq_clock() in yield_task_dl()
sched: Prevent recursion in io_schedule()
sched/completion: Serialize completion_done() with complete()
sched: Fix preempt_schedule_common() triggering tracing recursion
sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
sched: Make dl_task_time() use task_rq_lock()
sched: Clarify ordering between task_rq_lock() and move_queued_task()

+156 -103
+7 -3
include/linux/sched.h
··· 363 363 */ 364 364 extern void show_stack(struct task_struct *task, unsigned long *sp); 365 365 366 - void io_schedule(void); 367 - long io_schedule_timeout(long timeout); 368 - 369 366 extern void cpu_init (void); 370 367 extern void trap_init(void); 371 368 extern void update_process_times(int user); ··· 418 421 extern signed long schedule_timeout_uninterruptible(signed long timeout); 419 422 asmlinkage void schedule(void); 420 423 extern void schedule_preempt_disabled(void); 424 + 425 + extern long io_schedule_timeout(long timeout); 426 + 427 + static inline void io_schedule(void) 428 + { 429 + io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); 430 + } 421 431 422 432 struct nsproxy; 423 433 struct user_namespace;
+1 -5
kernel/sched/auto_group.c
··· 87 87 * so we don't have to move tasks around upon policy change, 88 88 * or flail around trying to allocate bandwidth on the fly. 89 89 * A bandwidth exception in __sched_setscheduler() allows 90 - * the policy change to proceed. Thereafter, task_group() 91 - * returns &root_task_group, so zero bandwidth is required. 90 + * the policy change to proceed. 92 91 */ 93 92 free_rt_sched_group(tg); 94 93 tg->rt_se = root_task_group.rt_se; ··· 112 113 bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) 113 114 { 114 115 if (tg != &root_task_group) 115 - return false; 116 - 117 - if (p->sched_class != &fair_sched_class) 118 116 return false; 119 117 120 118 /*
+17 -2
kernel/sched/completion.c
··· 274 274 * first without taking the lock so we can 275 275 * return early in the blocking case. 276 276 */ 277 - if (!ACCESS_ONCE(x->done)) 277 + if (!READ_ONCE(x->done)) 278 278 return 0; 279 279 280 280 spin_lock_irqsave(&x->wait.lock, flags); ··· 297 297 */ 298 298 bool completion_done(struct completion *x) 299 299 { 300 - return !!ACCESS_ONCE(x->done); 300 + if (!READ_ONCE(x->done)) 301 + return false; 302 + 303 + /* 304 + * If ->done, we need to wait for complete() to release ->wait.lock 305 + * otherwise we can end up freeing the completion before complete() 306 + * is done referencing it. 307 + * 308 + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders 309 + * the loads of ->done and ->wait.lock such that we cannot observe 310 + * the lock before complete() acquires it while observing the ->done 311 + * after it's acquired the lock. 312 + */ 313 + smp_rmb(); 314 + spin_unlock_wait(&x->wait.lock); 315 + return true; 301 316 } 302 317 EXPORT_SYMBOL(completion_done);
+31 -84
kernel/sched/core.c
··· 307 307 int sysctl_sched_rt_runtime = 950000; 308 308 309 309 /* 310 - * __task_rq_lock - lock the rq @p resides on. 311 - */ 312 - static inline struct rq *__task_rq_lock(struct task_struct *p) 313 - __acquires(rq->lock) 314 - { 315 - struct rq *rq; 316 - 317 - lockdep_assert_held(&p->pi_lock); 318 - 319 - for (;;) { 320 - rq = task_rq(p); 321 - raw_spin_lock(&rq->lock); 322 - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) 323 - return rq; 324 - raw_spin_unlock(&rq->lock); 325 - 326 - while (unlikely(task_on_rq_migrating(p))) 327 - cpu_relax(); 328 - } 329 - } 330 - 331 - /* 332 - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 333 - */ 334 - static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 335 - __acquires(p->pi_lock) 336 - __acquires(rq->lock) 337 - { 338 - struct rq *rq; 339 - 340 - for (;;) { 341 - raw_spin_lock_irqsave(&p->pi_lock, *flags); 342 - rq = task_rq(p); 343 - raw_spin_lock(&rq->lock); 344 - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) 345 - return rq; 346 - raw_spin_unlock(&rq->lock); 347 - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 348 - 349 - while (unlikely(task_on_rq_migrating(p))) 350 - cpu_relax(); 351 - } 352 - } 353 - 354 - static void __task_rq_unlock(struct rq *rq) 355 - __releases(rq->lock) 356 - { 357 - raw_spin_unlock(&rq->lock); 358 - } 359 - 360 - static inline void 361 - task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 362 - __releases(rq->lock) 363 - __releases(p->pi_lock) 364 - { 365 - raw_spin_unlock(&rq->lock); 366 - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 367 - } 368 - 369 - /* 370 310 * this_rq_lock - lock this runqueue and disable interrupts. 
371 311 */ 372 312 static struct rq *this_rq_lock(void) ··· 2839 2899 preempt_disable(); 2840 2900 } 2841 2901 2842 - static void preempt_schedule_common(void) 2902 + static void __sched notrace preempt_schedule_common(void) 2843 2903 { 2844 2904 do { 2845 2905 __preempt_count_add(PREEMPT_ACTIVE); ··· 4358 4418 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4359 4419 * that process accounting knows that this is a task in IO wait state. 4360 4420 */ 4361 - void __sched io_schedule(void) 4362 - { 4363 - struct rq *rq = raw_rq(); 4364 - 4365 - delayacct_blkio_start(); 4366 - atomic_inc(&rq->nr_iowait); 4367 - blk_flush_plug(current); 4368 - current->in_iowait = 1; 4369 - schedule(); 4370 - current->in_iowait = 0; 4371 - atomic_dec(&rq->nr_iowait); 4372 - delayacct_blkio_end(); 4373 - } 4374 - EXPORT_SYMBOL(io_schedule); 4375 - 4376 4421 long __sched io_schedule_timeout(long timeout) 4377 4422 { 4378 - struct rq *rq = raw_rq(); 4423 + int old_iowait = current->in_iowait; 4424 + struct rq *rq; 4379 4425 long ret; 4380 4426 4381 - delayacct_blkio_start(); 4382 - atomic_inc(&rq->nr_iowait); 4383 - blk_flush_plug(current); 4384 4427 current->in_iowait = 1; 4428 + if (old_iowait) 4429 + blk_schedule_flush_plug(current); 4430 + else 4431 + blk_flush_plug(current); 4432 + 4433 + delayacct_blkio_start(); 4434 + rq = raw_rq(); 4435 + atomic_inc(&rq->nr_iowait); 4385 4436 ret = schedule_timeout(timeout); 4386 - current->in_iowait = 0; 4437 + current->in_iowait = old_iowait; 4387 4438 atomic_dec(&rq->nr_iowait); 4388 4439 delayacct_blkio_end(); 4440 + 4389 4441 return ret; 4390 4442 } 4443 + EXPORT_SYMBOL(io_schedule_timeout); 4391 4444 4392 4445 /** 4393 4446 * sys_sched_get_priority_max - return maximum RT priority. ··· 7575 7642 { 7576 7643 struct task_struct *g, *p; 7577 7644 7645 + /* 7646 + * Autogroups do not have RT tasks; see autogroup_create(). 
7647 + */ 7648 + if (task_group_is_autogroup(tg)) 7649 + return 0; 7650 + 7578 7651 for_each_process_thread(g, p) { 7579 7652 if (rt_task(p) && task_group(p) == tg) 7580 7653 return 1; ··· 7673 7734 { 7674 7735 int i, err = 0; 7675 7736 7737 + /* 7738 + * Disallowing the root group RT runtime is BAD, it would disallow the 7739 + * kernel creating (and or operating) RT threads. 7740 + */ 7741 + if (tg == &root_task_group && rt_runtime == 0) 7742 + return -EINVAL; 7743 + 7744 + /* No period doesn't make any sense. */ 7745 + if (rt_period == 0) 7746 + return -EINVAL; 7747 + 7676 7748 mutex_lock(&rt_constraints_mutex); 7677 7749 read_lock(&tasklist_lock); 7678 7750 err = __rt_schedulable(tg, rt_period, rt_runtime); ··· 7739 7789 7740 7790 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7741 7791 rt_runtime = tg->rt_bandwidth.rt_runtime; 7742 - 7743 - if (rt_period == 0) 7744 - return -EINVAL; 7745 7792 7746 7793 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7747 7794 }
+24 -9
kernel/sched/deadline.c
··· 511 511 struct sched_dl_entity, 512 512 dl_timer); 513 513 struct task_struct *p = dl_task_of(dl_se); 514 + unsigned long flags; 514 515 struct rq *rq; 515 - again: 516 - rq = task_rq(p); 517 - raw_spin_lock(&rq->lock); 518 516 519 - if (rq != task_rq(p)) { 520 - /* Task was moved, retrying. */ 521 - raw_spin_unlock(&rq->lock); 522 - goto again; 523 - } 517 + rq = task_rq_lock(current, &flags); 524 518 525 519 /* 526 520 * We need to take care of several possible races here: ··· 535 541 536 542 sched_clock_tick(); 537 543 update_rq_clock(rq); 544 + 545 + /* 546 + * If the throttle happened during sched-out; like: 547 + * 548 + * schedule() 549 + * deactivate_task() 550 + * dequeue_task_dl() 551 + * update_curr_dl() 552 + * start_dl_timer() 553 + * __dequeue_task_dl() 554 + * prev->on_rq = 0; 555 + * 556 + * We can be both throttled and !queued. Replenish the counter 557 + * but do not enqueue -- wait for our wakeup to do that. 558 + */ 559 + if (!task_on_rq_queued(p)) { 560 + replenish_dl_entity(dl_se, dl_se); 561 + goto unlock; 562 + } 563 + 538 564 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 539 565 if (dl_task(rq->curr)) 540 566 check_preempt_curr_dl(rq, p, 0); ··· 569 555 push_dl_task(rq); 570 556 #endif 571 557 unlock: 572 - raw_spin_unlock(&rq->lock); 558 + task_rq_unlock(rq, current, &flags); 573 559 574 560 return HRTIMER_NORESTART; 575 561 } ··· 912 898 rq->curr->dl.dl_yielded = 1; 913 899 p->dl.runtime = 0; 914 900 } 901 + update_rq_clock(rq); 915 902 update_curr_dl(rq); 916 903 } 917 904
+76
kernel/sched/sched.h
··· 1380 1380 1381 1381 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); 1382 1382 1383 + /* 1384 + * __task_rq_lock - lock the rq @p resides on. 1385 + */ 1386 + static inline struct rq *__task_rq_lock(struct task_struct *p) 1387 + __acquires(rq->lock) 1388 + { 1389 + struct rq *rq; 1390 + 1391 + lockdep_assert_held(&p->pi_lock); 1392 + 1393 + for (;;) { 1394 + rq = task_rq(p); 1395 + raw_spin_lock(&rq->lock); 1396 + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) 1397 + return rq; 1398 + raw_spin_unlock(&rq->lock); 1399 + 1400 + while (unlikely(task_on_rq_migrating(p))) 1401 + cpu_relax(); 1402 + } 1403 + } 1404 + 1405 + /* 1406 + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 1407 + */ 1408 + static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 1409 + __acquires(p->pi_lock) 1410 + __acquires(rq->lock) 1411 + { 1412 + struct rq *rq; 1413 + 1414 + for (;;) { 1415 + raw_spin_lock_irqsave(&p->pi_lock, *flags); 1416 + rq = task_rq(p); 1417 + raw_spin_lock(&rq->lock); 1418 + /* 1419 + * move_queued_task() task_rq_lock() 1420 + * 1421 + * ACQUIRE (rq->lock) 1422 + * [S] ->on_rq = MIGRATING [L] rq = task_rq() 1423 + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); 1424 + * [S] ->cpu = new_cpu [L] task_rq() 1425 + * [L] ->on_rq 1426 + * RELEASE (rq->lock) 1427 + * 1428 + * If we observe the old cpu in task_rq_lock, the acquire of 1429 + * the old rq->lock will fully serialize against the stores. 1430 + * 1431 + * If we observe the new cpu in task_rq_lock, the acquire will 1432 + * pair with the WMB to ensure we must then also see migrating. 
1433 + */ 1434 + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) 1435 + return rq; 1436 + raw_spin_unlock(&rq->lock); 1437 + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 1438 + 1439 + while (unlikely(task_on_rq_migrating(p))) 1440 + cpu_relax(); 1441 + } 1442 + } 1443 + 1444 + static inline void __task_rq_unlock(struct rq *rq) 1445 + __releases(rq->lock) 1446 + { 1447 + raw_spin_unlock(&rq->lock); 1448 + } 1449 + 1450 + static inline void 1451 + task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 1452 + __releases(rq->lock) 1453 + __releases(p->pi_lock) 1454 + { 1455 + raw_spin_unlock(&rq->lock); 1456 + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 1457 + } 1458 + 1383 1459 #ifdef CONFIG_SMP 1384 1460 #ifdef CONFIG_PREEMPT 1385 1461