Repository: git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (Linux kernel mirror, for testing)

genirq/affinity: Move group_cpus_evenly() into lib/

group_cpus_evenly() has become a generic function that can be used by
subsystems other than the interrupt subsystem, so move it into lib/.
(An illustrative usage sketch follows the tags below.)

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com
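
The helper being moved is now a generic library facility. Purely as an
illustration of the kind of caller the commit message has in mind (the
function below is hypothetical and not part of this patch), a subsystem
could spread its queues across CPU groups along these lines:

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/group_cpus.h>

/* Hypothetical example: map each of @nr_queues queues to one CPU group. */
static int example_map_queues(unsigned int nr_queues)
{
        struct cpumask *grps;
        unsigned int q, cpu;

        grps = group_cpus_evenly(nr_queues);    /* one cpumask per group */
        if (!grps)
                return -ENOMEM;

        for (q = 0; q < nr_queues; q++)
                for_each_cpu(cpu, &grps[q])
                        pr_info("queue %u <- cpu %u\n", q, cpu);

        kfree(grps);    /* caller owns the kcalloc()'ed array */
        return 0;
}

group_cpus_evenly() returns a kcalloc()'ed array of numgrps cpumasks, so the
caller owns the array and frees it with kfree() when done.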

Authored by Ming Lei, committed by Thomas Gleixner
Commit f7b3ea8c (parent 523f1ea7)

Diffstat: 5 files changed, 446 insertions(+), 397 deletions(-)

MAINTAINERS (+2)

 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
 F:	kernel/irq/
+F:	include/linux/group_cpus.h
+F:	lib/group_cpus.c

 IRQCHIP DRIVERS
 M:	Thomas Gleixner <tglx@linutronix.de>
include/linux/group_cpus.h (new file, +14)

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */

#ifndef __LINUX_GROUP_CPUS_H
#define __LINUX_GROUP_CPUS_H
#include <linux/kernel.h>
#include <linux/cpu.h>

struct cpumask *group_cpus_evenly(unsigned int numgrps);

#endif
kernel/irq/affinity.c (+1 -397)

 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
-#include <linux/sort.h>
-
-[395 further lines removed: grp_spread_init_one(), alloc_node_to_cpumask(),
- free_node_to_cpumask(), build_node_to_cpumask(), get_nodes_in_cpumask(),
- struct node_groups, ncpus_cmp_func(), alloc_nodes_groups(),
- __group_cpus_evenly() and the previously static group_cpus_evenly(),
- i.e. the implementation that reappears verbatim in lib/group_cpus.c below]
+#include <linux/group_cpus.h>

 static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
 {
lib/Makefile (+2)

 obj-$(CONFIG_PARMAN) += parman.o

+obj-y += group_cpus.o
+
 # GCC library routines
 obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o
 obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o
lib/group_cpus.c (new file, +427)

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
                                unsigned int cpus_per_grp)
{
        const struct cpumask *siblmsk;
        int cpu, sibl;

        for ( ; cpus_per_grp > 0; ) {
                cpu = cpumask_first(nmsk);

                /* Should not happen, but I'm too lazy to think about it */
                if (cpu >= nr_cpu_ids)
                        return;

                cpumask_clear_cpu(cpu, nmsk);
                cpumask_set_cpu(cpu, irqmsk);
                cpus_per_grp--;

                /* If the cpu has siblings, use them first */
                siblmsk = topology_sibling_cpumask(cpu);
                for (sibl = -1; cpus_per_grp > 0; ) {
                        sibl = cpumask_next(sibl, siblmsk);
                        if (sibl >= nr_cpu_ids)
                                break;
                        if (!cpumask_test_and_clear_cpu(sibl, nmsk))
                                continue;
                        cpumask_set_cpu(sibl, irqmsk);
                        cpus_per_grp--;
                }
        }
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
        cpumask_var_t *masks;
        int node;

        masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
        if (!masks)
                return NULL;

        for (node = 0; node < nr_node_ids; node++) {
                if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
                        goto out_unwind;
        }

        return masks;

out_unwind:
        while (--node >= 0)
                free_cpumask_var(masks[node]);
        kfree(masks);
        return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
        int node;

        for (node = 0; node < nr_node_ids; node++)
                free_cpumask_var(masks[node]);
        kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
        int cpu;

        for_each_possible_cpu(cpu)
                cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
                                const struct cpumask *mask, nodemask_t *nodemsk)
{
        int n, nodes = 0;

        /* Calculate the number of nodes in the supplied affinity mask */
        for_each_node(n) {
                if (cpumask_intersects(mask, node_to_cpumask[n])) {
                        node_set(n, *nodemsk);
                        nodes++;
                }
        }
        return nodes;
}

struct node_groups {
        unsigned id;

        union {
                unsigned ngroups;
                unsigned ncpus;
        };
};

static int ncpus_cmp_func(const void *l, const void *r)
{
        const struct node_groups *ln = l;
        const struct node_groups *rn = r;

        return ln->ncpus - rn->ncpus;
}

/*
 * Allocate group number for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= active CPU number of this node
 *
 * The actual allocated total groups may be less than @numgrps when
 * active total CPU number is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
static void alloc_nodes_groups(unsigned int numgrps,
                               cpumask_var_t *node_to_cpumask,
                               const struct cpumask *cpu_mask,
                               const nodemask_t nodemsk,
                               struct cpumask *nmsk,
                               struct node_groups *node_groups)
{
        unsigned n, remaining_ncpus = 0;

        for (n = 0; n < nr_node_ids; n++) {
                node_groups[n].id = n;
                node_groups[n].ncpus = UINT_MAX;
        }

        for_each_node_mask(n, nodemsk) {
                unsigned ncpus;

                cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
                ncpus = cpumask_weight(nmsk);

                if (!ncpus)
                        continue;
                remaining_ncpus += ncpus;
                node_groups[n].ncpus = ncpus;
        }

        numgrps = min_t(unsigned, remaining_ncpus, numgrps);

        sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
             ncpus_cmp_func, NULL);

        /*
         * Allocate groups for each node according to the ratio of this
         * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
         * bigger than number of active numa nodes. Always start the
         * allocation from the node with minimized nr_cpus.
         *
         * This way guarantees that each active node gets allocated at
         * least one group, and the theory is simple: over-allocation
         * is only done when this node is assigned by one group, so
         * other nodes will be allocated >= 1 groups, since 'numgrps' is
         * bigger than number of numa nodes.
         *
         * One perfect invariant is that number of allocated groups for
         * each node is <= CPU count of this node:
         *
         * 1) suppose there are two nodes: A and B
         *      ncpu(X) is CPU count of node X
         *      grps(X) is the group count allocated to node X via this
         *      algorithm
         *
         *      ncpu(A) <= ncpu(B)
         *      ncpu(A) + ncpu(B) = N
         *      grps(A) + grps(B) = G
         *
         *      grps(A) = max(1, round_down(G * ncpu(A) / N))
         *      grps(B) = G - grps(A)
         *
         *      both N and G are integer, and 2 <= G <= N, suppose
         *      G = N - delta, and 0 <= delta <= N - 2
         *
         * 2) obviously grps(A) <= ncpu(A) because:
         *
         *      if grps(A) is 1, then grps(A) <= ncpu(A) given
         *      ncpu(A) >= 1
         *
         *      otherwise,
         *              grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
         *
         * 3) prove how grps(B) <= ncpu(B):
         *
         *      if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
         *      over-allocated, so grps(B) <= ncpu(B),
         *
         *      otherwise:
         *
         *      grps(A) =
         *              round_down(G * ncpu(A) / N) =
         *              round_down((N - delta) * ncpu(A) / N) =
         *              round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
         *              round_down((N * ncpu(A) - delta * N) / N) =
         *              cpu(A) - delta
         *
         *      then:
         *
         *      grps(A) - G >= ncpu(A) - delta - G
         *      =>
         *      G - grps(A) <= G + delta - ncpu(A)
         *      =>
         *      grps(B) <= N - ncpu(A)
         *      =>
         *      grps(B) <= cpu(B)
         *
         * For nodes >= 3, it can be thought as one node and another big
         * node given that is exactly what this algorithm is implemented,
         * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
         * finally for each node X: grps(X) <= ncpu(X).
         *
         */
        for (n = 0; n < nr_node_ids; n++) {
                unsigned ngroups, ncpus;

                if (node_groups[n].ncpus == UINT_MAX)
                        continue;

                WARN_ON_ONCE(numgrps == 0);

                ncpus = node_groups[n].ncpus;
                ngroups = max_t(unsigned, 1,
                                numgrps * ncpus / remaining_ncpus);
                WARN_ON_ONCE(ngroups > ncpus);

                node_groups[n].ngroups = ngroups;

                remaining_ncpus -= ncpus;
                numgrps -= ngroups;
        }
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
                               cpumask_var_t *node_to_cpumask,
                               const struct cpumask *cpu_mask,
                               struct cpumask *nmsk, struct cpumask *masks)
{
        unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
        unsigned int last_grp = numgrps;
        unsigned int curgrp = startgrp;
        nodemask_t nodemsk = NODE_MASK_NONE;
        struct node_groups *node_groups;

        if (cpumask_empty(cpu_mask))
                return 0;

        nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

        /*
         * If the number of nodes in the mask is greater than or equal the
         * number of groups we just spread the groups across the nodes.
         */
        if (numgrps <= nodes) {
                for_each_node_mask(n, nodemsk) {
                        /* Ensure that only CPUs which are in both masks are set */
                        cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
                        cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
                        if (++curgrp == last_grp)
                                curgrp = 0;
                }
                return numgrps;
        }

        node_groups = kcalloc(nr_node_ids,
                              sizeof(struct node_groups),
                              GFP_KERNEL);
        if (!node_groups)
                return -ENOMEM;

        /* allocate group number for each node */
        alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
                           nodemsk, nmsk, node_groups);
        for (i = 0; i < nr_node_ids; i++) {
                unsigned int ncpus, v;
                struct node_groups *nv = &node_groups[i];

                if (nv->ngroups == UINT_MAX)
                        continue;

                /* Get the cpus on this node which are in the mask */
                cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
                ncpus = cpumask_weight(nmsk);
                if (!ncpus)
                        continue;

                WARN_ON_ONCE(nv->ngroups > ncpus);

                /* Account for rounding errors */
                extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

                /* Spread allocated groups on CPUs of the current node */
                for (v = 0; v < nv->ngroups; v++, curgrp++) {
                        cpus_per_grp = ncpus / nv->ngroups;

                        /* Account for extra groups to compensate rounding errors */
                        if (extra_grps) {
                                cpus_per_grp++;
                                --extra_grps;
                        }

                        /*
                         * wrapping has to be considered given 'startgrp'
                         * may start anywhere
                         */
                        if (curgrp >= last_grp)
                                curgrp = 0;
                        grp_spread_init_one(&masks[curgrp], nmsk,
                                            cpus_per_grp);
                }
                done += nv->ngroups;
        }
        kfree(node_groups);
        return done;
}

#ifdef CONFIG_SMP
/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise. And each element
 * includes CPUs assigned to this group
 *
 * Try to put close CPUs from viewpoint of CPU and NUMA locality into
 * same group, and run two-stage grouping:
 *      1) allocate present CPUs on these groups evenly first
 *      2) allocate other possible CPUs on these groups evenly
 *
 * We guarantee in the resulted grouping that all CPUs are covered, and
 * no same CPU is assigned to multiple groups
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
        unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
        cpumask_var_t *node_to_cpumask;
        cpumask_var_t nmsk, npresmsk;
        int ret = -ENOMEM;
        struct cpumask *masks = NULL;

        if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
                return NULL;

        if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
                goto fail_nmsk;

        node_to_cpumask = alloc_node_to_cpumask();
        if (!node_to_cpumask)
                goto fail_npresmsk;

        masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                goto fail_node_to_cpumask;

        /* Stabilize the cpumasks */
        cpus_read_lock();
        build_node_to_cpumask(node_to_cpumask);

        /* grouping present CPUs first */
        ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
                                  cpu_present_mask, nmsk, masks);
        if (ret < 0)
                goto fail_build_affinity;
        nr_present = ret;

        /*
         * Allocate non present CPUs starting from the next group to be
         * handled. If the grouping of present CPUs already exhausted the
         * group space, assign the non present CPUs to the already
         * allocated out groups.
         */
        if (nr_present >= numgrps)
                curgrp = 0;
        else
                curgrp = nr_present;
        cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
        ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
                                  npresmsk, nmsk, masks);
        if (ret >= 0)
                nr_others = ret;

 fail_build_affinity:
        cpus_read_unlock();

        if (ret >= 0)
                WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
        free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
        free_cpumask_var(npresmsk);

 fail_nmsk:
        free_cpumask_var(nmsk);
        if (ret < 0) {
                kfree(masks);
                return NULL;
        }
        return masks;
}
#else
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
        struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

        if (!masks)
                return NULL;

        /* assign all CPUs(cpu 0) to the 1st group only */
        cpumask_copy(&masks[0], cpu_possible_mask);
        return masks;
}
#endif
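
The kernel-doc above guarantees that the returned groups cover all CPUs and
that no CPU is assigned to more than one group. As an illustrative sketch
only (this check is not part of the patch and the function name is invented),
a caller could verify that contract as follows:

#include <linux/cpumask.h>
#include <linux/slab.h>

/* Hypothetical sanity check of the group_cpus_evenly() contract. */
static bool example_groups_cover_all_cpus(struct cpumask *grps,
                                          unsigned int numgrps)
{
        cpumask_var_t seen;
        unsigned int i;
        bool ok = true;

        if (!zalloc_cpumask_var(&seen, GFP_KERNEL))
                return false;

        for (i = 0; i < numgrps; i++) {
                /* no CPU may appear in two groups */
                if (cpumask_intersects(seen, &grps[i]))
                        ok = false;
                cpumask_or(seen, seen, &grps[i]);
        }

        /* every possible CPU must be covered by some group */
        if (!cpumask_equal(seen, cpu_possible_mask))
                ok = false;

        free_cpumask_var(seen);
        return ok;
}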