Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Fix the relax_domain_level boot parameter
sched: Validate assumptions in sched_init_numa()
sched: Always initialize cpu-power
sched: Fix domain iteration
sched/rt: Fix lockdep annotation within find_lock_lowest_rq()
sched/numa: Load balance between remote nodes
sched/x86: Calculate booted cores after construction of sibling_mask

+179 -39
+9
arch/x86/kernel/smpboot.c
··· 382 382 if ((i == cpu) || (has_mc && match_llc(c, o))) 383 383 link_mask(llc_shared, cpu, i); 384 384 385 + } 386 + 387 + /* 388 + * This needs a separate iteration over the cpus because we rely on all 389 + * cpu_sibling_mask links to be set-up. 390 + */ 391 + for_each_cpu(i, cpu_sibling_setup_mask) { 392 + o = &cpu_data(i); 393 + 385 394 if ((i == cpu) || (has_mc && match_mc(c, o))) { 386 395 link_mask(core, cpu, i); 387 396
+11
include/linux/sched.h
··· 877 877 * Number of busy cpus in this group. 878 878 */ 879 879 atomic_t nr_busy_cpus; 880 + 881 + unsigned long cpumask[0]; /* iteration mask */ 880 882 }; 881 883 882 884 struct sched_group { ··· 901 899 static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 902 900 { 903 901 return to_cpumask(sg->cpumask); 902 + } 903 + 904 + /* 905 + * cpumask masking which cpus in the group are allowed to iterate up the domain 906 + * tree. 907 + */ 908 + static inline struct cpumask *sched_group_mask(struct sched_group *sg) 909 + { 910 + return to_cpumask(sg->sgp->cpumask); 904 911 } 905 912 906 913 /**
+152 -35
kernel/sched/core.c
··· 5556 5556 5557 5557 #ifdef CONFIG_SCHED_DEBUG 5558 5558 5559 - static __read_mostly int sched_domain_debug_enabled; 5559 + static __read_mostly int sched_debug_enabled; 5560 5560 5561 - static int __init sched_domain_debug_setup(char *str) 5561 + static int __init sched_debug_setup(char *str) 5562 5562 { 5563 - sched_domain_debug_enabled = 1; 5563 + sched_debug_enabled = 1; 5564 5564 5565 5565 return 0; 5566 5566 } 5567 - early_param("sched_debug", sched_domain_debug_setup); 5567 + early_param("sched_debug", sched_debug_setup); 5568 + 5569 + static inline bool sched_debug(void) 5570 + { 5571 + return sched_debug_enabled; 5572 + } 5568 5573 5569 5574 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5570 5575 struct cpumask *groupmask) ··· 5609 5604 break; 5610 5605 } 5611 5606 5612 - if (!group->sgp->power) { 5607 + /* 5608 + * Even though we initialize ->power to something semi-sane, 5609 + * we leave power_orig unset. This allows us to detect if 5610 + * domain iteration is still funny without causing /0 traps. 5611 + */ 5612 + if (!group->sgp->power_orig) { 5613 5613 printk(KERN_CONT "\n"); 5614 5614 printk(KERN_ERR "ERROR: domain->cpu_power not " 5615 5615 "set\n"); ··· 5662 5652 { 5663 5653 int level = 0; 5664 5654 5665 - if (!sched_domain_debug_enabled) 5655 + if (!sched_debug_enabled) 5666 5656 return; 5667 5657 5668 5658 if (!sd) { ··· 5683 5673 } 5684 5674 #else /* !CONFIG_SCHED_DEBUG */ 5685 5675 # define sched_domain_debug(sd, cpu) do { } while (0) 5676 + static inline bool sched_debug(void) 5677 + { 5678 + return false; 5679 + } 5686 5680 #endif /* CONFIG_SCHED_DEBUG */ 5687 5681 5688 5682 static int sd_degenerate(struct sched_domain *sd) ··· 6008 5994 struct sd_data data; 6009 5995 }; 6010 5996 5997 + /* 5998 + * Build an iteration mask that can exclude certain CPUs from the upwards 5999 + * domain traversal. 
6000 + * 6001 + * Asymmetric node setups can result in situations where the domain tree is of 6002 + * unequal depth, make sure to skip domains that already cover the entire 6003 + * range. 6004 + * 6005 + * In that case build_sched_domains() will have terminated the iteration early 6006 + * and our sibling sd spans will be empty. Domains should always include the 6007 + * cpu they're built on, so check that. 6008 + * 6009 + */ 6010 + static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 6011 + { 6012 + const struct cpumask *span = sched_domain_span(sd); 6013 + struct sd_data *sdd = sd->private; 6014 + struct sched_domain *sibling; 6015 + int i; 6016 + 6017 + for_each_cpu(i, span) { 6018 + sibling = *per_cpu_ptr(sdd->sd, i); 6019 + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 6020 + continue; 6021 + 6022 + cpumask_set_cpu(i, sched_group_mask(sg)); 6023 + } 6024 + } 6025 + 6026 + /* 6027 + * Return the canonical balance cpu for this group, this is the first cpu 6028 + * of this group that's also in the iteration mask. 6029 + */ 6030 + int group_balance_cpu(struct sched_group *sg) 6031 + { 6032 + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 6033 + } 6034 + 6011 6035 static int 6012 6036 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6013 6037 { ··· 6064 6012 if (cpumask_test_cpu(i, covered)) 6065 6013 continue; 6066 6014 6015 + child = *per_cpu_ptr(sdd->sd, i); 6016 + 6017 + /* See the comment near build_group_mask(). 
*/ 6018 + if (!cpumask_test_cpu(i, sched_domain_span(child))) 6019 + continue; 6020 + 6067 6021 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6068 6022 GFP_KERNEL, cpu_to_node(cpu)); 6069 6023 ··· 6077 6019 goto fail; 6078 6020 6079 6021 sg_span = sched_group_cpus(sg); 6080 - 6081 - child = *per_cpu_ptr(sdd->sd, i); 6082 6022 if (child->child) { 6083 6023 child = child->child; 6084 6024 cpumask_copy(sg_span, sched_domain_span(child)); ··· 6086 6030 cpumask_or(covered, covered, sg_span); 6087 6031 6088 6032 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 6089 - atomic_inc(&sg->sgp->ref); 6033 + if (atomic_inc_return(&sg->sgp->ref) == 1) 6034 + build_group_mask(sd, sg); 6090 6035 6036 + /* 6037 + * Initialize sgp->power such that even if we mess up the 6038 + * domains and no possible iteration will get us here, we won't 6039 + * die on a /0 trap. 6040 + */ 6041 + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 6042 + 6043 + /* 6044 + * Make sure the first group of this domain contains the 6045 + * canonical balance cpu. Otherwise the sched_domain iteration 6046 + * breaks. See update_sg_lb_stats(). 
6047 + */ 6091 6048 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 6092 - cpumask_first(sg_span) == cpu) { 6093 - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); 6049 + group_balance_cpu(sg) == cpu) 6094 6050 groups = sg; 6095 - } 6096 6051 6097 6052 if (!first) 6098 6053 first = sg; ··· 6176 6109 6177 6110 cpumask_clear(sched_group_cpus(sg)); 6178 6111 sg->sgp->power = 0; 6112 + cpumask_setall(sched_group_mask(sg)); 6179 6113 6180 6114 for_each_cpu(j, span) { 6181 6115 if (get_group(j, sdd, NULL) != group) ··· 6218 6150 sg = sg->next; 6219 6151 } while (sg != sd->groups); 6220 6152 6221 - if (cpu != group_first_cpu(sg)) 6153 + if (cpu != group_balance_cpu(sg)) 6222 6154 return; 6223 6155 6224 6156 update_group_power(sd, cpu); ··· 6268 6200 6269 6201 static int __init setup_relax_domain_level(char *str) 6270 6202 { 6271 - unsigned long val; 6272 - 6273 - val = simple_strtoul(str, NULL, 0); 6274 - if (val < sched_domain_level_max) 6275 - default_relax_domain_level = val; 6203 + if (kstrtoint(str, 0, &default_relax_domain_level)) 6204 + pr_warn("Unable to set relax_domain_level\n"); 6276 6205 6277 6206 return 1; 6278 6207 } ··· 6379 6314 #ifdef CONFIG_NUMA 6380 6315 6381 6316 static int sched_domains_numa_levels; 6382 - static int sched_domains_numa_scale; 6383 6317 static int *sched_domains_numa_distance; 6384 6318 static struct cpumask ***sched_domains_numa_masks; 6385 6319 static int sched_domains_curr_level; 6386 6320 6387 6321 static inline int sd_local_flags(int level) 6388 6322 { 6389 - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6323 + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6390 6324 return 0; 6391 6325 6392 6326 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; ··· 6443 6379 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6444 6380 } 6445 6381 6382 + static void sched_numa_warn(const char *str) 6383 + { 6384 + static int done = false; 6385 + int i,j; 6386 + 6387 + if (done) 6388 + 
return; 6389 + 6390 + done = true; 6391 + 6392 + printk(KERN_WARNING "ERROR: %s\n\n", str); 6393 + 6394 + for (i = 0; i < nr_node_ids; i++) { 6395 + printk(KERN_WARNING " "); 6396 + for (j = 0; j < nr_node_ids; j++) 6397 + printk(KERN_CONT "%02d ", node_distance(i,j)); 6398 + printk(KERN_CONT "\n"); 6399 + } 6400 + printk(KERN_WARNING "\n"); 6401 + } 6402 + 6403 + static bool find_numa_distance(int distance) 6404 + { 6405 + int i; 6406 + 6407 + if (distance == node_distance(0, 0)) 6408 + return true; 6409 + 6410 + for (i = 0; i < sched_domains_numa_levels; i++) { 6411 + if (sched_domains_numa_distance[i] == distance) 6412 + return true; 6413 + } 6414 + 6415 + return false; 6416 + } 6417 + 6446 6418 static void sched_init_numa(void) 6447 6419 { 6448 6420 int next_distance, curr_distance = node_distance(0, 0); ··· 6486 6386 int level = 0; 6487 6387 int i, j, k; 6488 6388 6489 - sched_domains_numa_scale = curr_distance; 6490 6389 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6491 6390 if (!sched_domains_numa_distance) 6492 6391 return; ··· 6496 6397 * 6497 6398 * Assumes node_distance(0,j) includes all distances in 6498 6399 * node_distance(i,j) in order to avoid cubic time. 
6499 - * 6500 - * XXX: could be optimized to O(n log n) by using sort() 6501 6400 */ 6502 6401 next_distance = curr_distance; 6503 6402 for (i = 0; i < nr_node_ids; i++) { 6504 6403 for (j = 0; j < nr_node_ids; j++) { 6505 - int distance = node_distance(0, j); 6506 - if (distance > curr_distance && 6507 - (distance < next_distance || 6508 - next_distance == curr_distance)) 6509 - next_distance = distance; 6404 + for (k = 0; k < nr_node_ids; k++) { 6405 + int distance = node_distance(i, k); 6406 + 6407 + if (distance > curr_distance && 6408 + (distance < next_distance || 6409 + next_distance == curr_distance)) 6410 + next_distance = distance; 6411 + 6412 + /* 6413 + * While not a strong assumption it would be nice to know 6414 + * about cases where if node A is connected to B, B is not 6415 + * equally connected to A. 6416 + */ 6417 + if (sched_debug() && node_distance(k, i) != distance) 6418 + sched_numa_warn("Node-distance not symmetric"); 6419 + 6420 + if (sched_debug() && i && !find_numa_distance(distance)) 6421 + sched_numa_warn("Node-0 not representative"); 6422 + } 6423 + if (next_distance != curr_distance) { 6424 + sched_domains_numa_distance[level++] = next_distance; 6425 + sched_domains_numa_levels = level; 6426 + curr_distance = next_distance; 6427 + } else break; 6510 6428 } 6511 - if (next_distance != curr_distance) { 6512 - sched_domains_numa_distance[level++] = next_distance; 6513 - sched_domains_numa_levels = level; 6514 - curr_distance = next_distance; 6515 - } else break; 6429 + 6430 + /* 6431 + * In case of sched_debug() we verify the above assumption. 
6432 + */ 6433 + if (!sched_debug()) 6434 + break; 6516 6435 } 6517 6436 /* 6518 6437 * 'level' contains the number of unique distances, excluding the ··· 6642 6525 6643 6526 *per_cpu_ptr(sdd->sg, j) = sg; 6644 6527 6645 - sgp = kzalloc_node(sizeof(struct sched_group_power), 6528 + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6646 6529 GFP_KERNEL, cpu_to_node(j)); 6647 6530 if (!sgp) 6648 6531 return -ENOMEM; ··· 6695 6578 if (!sd) 6696 6579 return child; 6697 6580 6698 - set_domain_attribute(sd, attr); 6699 6581 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6700 6582 if (child) { 6701 6583 sd->level = child->level + 1; ··· 6702 6586 child->parent = sd; 6703 6587 } 6704 6588 sd->child = child; 6589 + set_domain_attribute(sd, attr); 6705 6590 6706 6591 return sd; 6707 6592 }
+4 -3
kernel/sched/fair.c
··· 3602 3602 } while (group != child->groups); 3603 3603 } 3604 3604 3605 - sdg->sgp->power = power; 3605 + sdg->sgp->power_orig = sdg->sgp->power = power; 3606 3606 } 3607 3607 3608 3608 /* ··· 3652 3652 int i; 3653 3653 3654 3654 if (local_group) 3655 - balance_cpu = group_first_cpu(group); 3655 + balance_cpu = group_balance_cpu(group); 3656 3656 3657 3657 /* Tally up the load of all CPUs in the group */ 3658 3658 max_cpu_load = 0; ··· 3667 3667 3668 3668 /* Bias balancing toward cpus of our domain */ 3669 3669 if (local_group) { 3670 - if (idle_cpu(i) && !first_idle_cpu) { 3670 + if (idle_cpu(i) && !first_idle_cpu && 3671 + cpumask_test_cpu(i, sched_group_mask(group))) { 3671 3672 first_idle_cpu = 1; 3672 3673 balance_cpu = i; 3673 3674 }
+1 -1
kernel/sched/rt.c
··· 1562 1562 task_running(rq, task) || 1563 1563 !task->on_rq)) { 1564 1564 1565 - raw_spin_unlock(&lowest_rq->lock); 1565 + double_unlock_balance(rq, lowest_rq); 1566 1566 lowest_rq = NULL; 1567 1567 break; 1568 1568 }
+2
kernel/sched/sched.h
··· 526 526 DECLARE_PER_CPU(struct sched_domain *, sd_llc); 527 527 DECLARE_PER_CPU(int, sd_llc_id); 528 528 529 + extern int group_balance_cpu(struct sched_group *sg); 530 + 529 531 #endif /* CONFIG_SMP */ 530 532 531 533 #include "stats.h"