Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched: Allow for overlapping sched_domain spans

Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst
each other.

This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the
16 nearest nodes without regard to whether these masks have any overlap.

Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If however
there is overlap, we would need to link the sched_group list in
different ways for each CPU, and hence sharing isn't possible.

In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.

Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Peter Zijlstra and committed by
Ingo Molnar
e3589f6c 9c3f75cb

+132 -29
+2
include/linux/sched.h
··· 844 844 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 845 845 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 846 846 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 847 + #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 847 848 848 849 enum powersavings_balance_level { 849 850 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ ··· 895 894 } 896 895 897 896 struct sched_group_power { 897 + atomic_t ref; 898 898 /* 899 899 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 900 900 * single CPU.
+128 -29
kernel/sched.c
··· 6774 6774 return rd; 6775 6775 } 6776 6776 6777 + static void free_sched_groups(struct sched_group *sg, int free_sgp) 6778 + { 6779 + struct sched_group *tmp, *first; 6780 + 6781 + if (!sg) 6782 + return; 6783 + 6784 + first = sg; 6785 + do { 6786 + tmp = sg->next; 6787 + 6788 + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 6789 + kfree(sg->sgp); 6790 + 6791 + kfree(sg); 6792 + sg = tmp; 6793 + } while (sg != first); 6794 + } 6795 + 6777 6796 static void free_sched_domain(struct rcu_head *rcu) 6778 6797 { 6779 6798 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6780 - if (atomic_dec_and_test(&sd->groups->ref)) { 6799 + 6800 + /* 6801 + * If its an overlapping domain it has private groups, iterate and 6802 + * nuke them all. 6803 + */ 6804 + if (sd->flags & SD_OVERLAP) { 6805 + free_sched_groups(sd->groups, 1); 6806 + } else if (atomic_dec_and_test(&sd->groups->ref)) { 6781 6807 kfree(sd->groups->sgp); 6782 6808 kfree(sd->groups); 6783 6809 } ··· 6993 6967 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 6994 6968 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 6995 6969 6970 + #define SDTL_OVERLAP 0x01 6971 + 6996 6972 struct sched_domain_topology_level { 6997 6973 sched_domain_init_f init; 6998 6974 sched_domain_mask_f mask; 6975 + int flags; 6999 6976 struct sd_data data; 7000 6977 }; 7001 6978 7002 - /* 7003 - * Assumes the sched_domain tree is fully constructed 7004 - */ 6979 + static int 6980 + build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6981 + { 6982 + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 6983 + const struct cpumask *span = sched_domain_span(sd); 6984 + struct cpumask *covered = sched_domains_tmpmask; 6985 + struct sd_data *sdd = sd->private; 6986 + struct sched_domain *child; 6987 + int i; 6988 + 6989 + cpumask_clear(covered); 6990 + 6991 + for_each_cpu(i, span) { 6992 + struct cpumask *sg_span; 6993 + 6994 + 
if (cpumask_test_cpu(i, covered)) 6995 + continue; 6996 + 6997 + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6998 + GFP_KERNEL, cpu_to_node(i)); 6999 + 7000 + if (!sg) 7001 + goto fail; 7002 + 7003 + sg_span = sched_group_cpus(sg); 7004 + 7005 + child = *per_cpu_ptr(sdd->sd, i); 7006 + if (child->child) { 7007 + child = child->child; 7008 + cpumask_copy(sg_span, sched_domain_span(child)); 7009 + } else 7010 + cpumask_set_cpu(i, sg_span); 7011 + 7012 + cpumask_or(covered, covered, sg_span); 7013 + 7014 + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 7015 + atomic_inc(&sg->sgp->ref); 7016 + 7017 + if (cpumask_test_cpu(cpu, sg_span)) 7018 + groups = sg; 7019 + 7020 + if (!first) 7021 + first = sg; 7022 + if (last) 7023 + last->next = sg; 7024 + last = sg; 7025 + last->next = first; 7026 + } 7027 + sd->groups = groups; 7028 + 7029 + return 0; 7030 + 7031 + fail: 7032 + free_sched_groups(first, 0); 7033 + 7034 + return -ENOMEM; 7035 + } 7036 + 7005 7037 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 7006 7038 { 7007 7039 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); ··· 7071 6987 if (sg) { 7072 6988 *sg = *per_cpu_ptr(sdd->sg, cpu); 7073 6989 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 6990 + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 7074 6991 } 7075 6992 7076 6993 return cpu; 7077 6994 } 7078 6995 7079 6996 /* 7080 - * build_sched_groups takes the cpumask we wish to span, and a pointer 7081 - * to a function which identifies what group(along with sched group) a CPU 7082 - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids 7083 - * (due to the fact that we keep track of groups covered with a struct cpumask). 7084 - * 7085 6997 * build_sched_groups will build a circular linked list of the groups 7086 6998 * covered by the given span, and will set each group's ->cpumask correctly, 7087 6999 * and ->cpu_power to 0. 
7000 + * 7001 + * Assumes the sched_domain tree is fully constructed 7088 7002 */ 7089 - static void 7090 - build_sched_groups(struct sched_domain *sd) 7003 + static int 7004 + build_sched_groups(struct sched_domain *sd, int cpu) 7091 7005 { 7092 7006 struct sched_group *first = NULL, *last = NULL; 7093 7007 struct sd_data *sdd = sd->private; 7094 7008 const struct cpumask *span = sched_domain_span(sd); 7095 7009 struct cpumask *covered; 7096 7010 int i; 7011 + 7012 + get_group(cpu, sdd, &sd->groups); 7013 + atomic_inc(&sd->groups->ref); 7014 + 7015 + if (cpu != cpumask_first(sched_domain_span(sd))) 7016 + return 0; 7097 7017 7098 7018 lockdep_assert_held(&sched_domains_mutex); 7099 7019 covered = sched_domains_tmpmask; ··· 7130 7042 last = sg; 7131 7043 } 7132 7044 last->next = first; 7045 + 7046 + return 0; 7133 7047 } 7134 7048 7135 7049 /* ··· 7146 7056 */ 7147 7057 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7148 7058 { 7149 - WARN_ON(!sd || !sd->groups); 7059 + struct sched_group *sg = sd->groups; 7150 7060 7151 - if (cpu != group_first_cpu(sd->groups)) 7061 + WARN_ON(!sd || !sg); 7062 + 7063 + do { 7064 + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 7065 + sg = sg->next; 7066 + } while (sg != sd->groups); 7067 + 7068 + if (cpu != group_first_cpu(sg)) 7152 7069 return; 7153 - 7154 - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7155 7070 7156 7071 update_group_power(sd, cpu); 7157 7072 } ··· 7277 7182 static void claim_allocations(int cpu, struct sched_domain *sd) 7278 7183 { 7279 7184 struct sd_data *sdd = sd->private; 7280 - struct sched_group *sg = sd->groups; 7281 7185 7282 7186 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7283 7187 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7284 7188 7285 - if (cpu == cpumask_first(sched_group_cpus(sg))) { 7286 - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); 7189 + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 7287 7190 *per_cpu_ptr(sdd->sg, cpu) = 
NULL; 7191 + 7192 + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 7288 7193 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 7289 - } 7290 7194 } 7291 7195 7292 7196 #ifdef CONFIG_SCHED_SMT ··· 7310 7216 #endif 7311 7217 { sd_init_CPU, cpu_cpu_mask, }, 7312 7218 #ifdef CONFIG_NUMA 7313 - { sd_init_NODE, cpu_node_mask, }, 7219 + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, 7314 7220 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7315 7221 #endif 7316 7222 { NULL, }, ··· 7378 7284 struct sd_data *sdd = &tl->data; 7379 7285 7380 7286 for_each_cpu(j, cpu_map) { 7381 - kfree(*per_cpu_ptr(sdd->sd, j)); 7287 + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7288 + if (sd && (sd->flags & SD_OVERLAP)) 7289 + free_sched_groups(sd->groups, 0); 7382 7290 kfree(*per_cpu_ptr(sdd->sg, j)); 7383 7291 kfree(*per_cpu_ptr(sdd->sgp, j)); 7384 7292 } ··· 7432 7336 struct sched_domain_topology_level *tl; 7433 7337 7434 7338 sd = NULL; 7435 - for (tl = sched_domain_topology; tl->init; tl++) 7339 + for (tl = sched_domain_topology; tl->init; tl++) { 7436 7340 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7341 + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 7342 + sd->flags |= SD_OVERLAP; 7343 + } 7437 7344 7438 7345 while (sd->child) 7439 7346 sd = sd->child; ··· 7448 7349 for_each_cpu(i, cpu_map) { 7449 7350 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7450 7351 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7451 - get_group(i, sd->private, &sd->groups); 7452 - atomic_inc(&sd->groups->ref); 7453 - 7454 - if (i != cpumask_first(sched_domain_span(sd))) 7455 - continue; 7456 - 7457 - build_sched_groups(sd); 7352 + if (sd->flags & SD_OVERLAP) { 7353 + if (build_overlap_sched_groups(sd, i)) 7354 + goto error; 7355 + } else { 7356 + if (build_sched_groups(sd, i)) 7357 + goto error; 7358 + } 7458 7359 } 7459 7360 } 7460 7361
+2
kernel/sched_features.h
··· 70 70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 71 71 */ 72 72 SCHED_FEAT(TTWU_QUEUE, 1) 73 + 74 + SCHED_FEAT(FORCE_SD_OVERLAP, 0)