kernel/cpuset.c

···
  * 2003-10-22 Updates by Stephen Hemminger.
  * 2004 May-July Rework by Paul Jackson.
  * 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ *      by Max Krasnyansky
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file COPYING in the main directory of the Linux
···
 static DEFINE_MUTEX(callback_mutex);

-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data, struct vfsmount *mnt)
···
 }

 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
  */
-
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
 	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
···
 }

 /*
- * rebuild_sched_domains()
+ * generate_sched_domains()
  *
- * This routine will be called to rebuild the scheduler's dynamic
- * sched domains:
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
- *   'cpus' changes,
- * - or if the 'cpus' allowed changes in any cpuset which has that
- *   flag enabled,
- * - or if the 'sched_relax_domain_level' of any cpuset which has
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition. A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
+ * This function builds a partial partition of the system's CPUs.
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to the kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
  *
  * See "What is sched_load_balance" in Documentation/cpusets.txt
  * for a background explanation of this.
···
  * domains when operating in the severe memory shortage situations
  * that could cause allocation failures below.
  *
- * Call with cgroup_mutex held.  May take callback_mutex during
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
+ * Must be called with cgroup_lock held.
  *
  * The three key local variables below are:
  *  q  - a linked-list queue of cpuset pointers, used to implement a
···
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-
-void rebuild_sched_domains(void)
+static int generate_sched_domains(cpumask_t **domains,
+			struct sched_domain_attr **attributes)
 {
-	LIST_HEAD(q);	/* queue of cpusets to be scanned*/
+	LIST_HEAD(q);		/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
···
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */

-	csa = NULL;
+	ndoms = 0;
 	doms = NULL;
 	dattr = NULL;
+	csa = NULL;

 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		ndoms = 1;
 		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 		if (!doms)
-			goto rebuild;
+			goto done;
+
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
-		goto rebuild;
+
+		ndoms = 1;
+		goto done;
 	}

 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
···
 		}
 	}

-	/* Convert <csn, csa> to <ndoms, doms> */
+	/*
+	 * Now we know how many domains to create.
+	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+	 */
 	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-	if (!doms)
-		goto rebuild;
+	if (!doms) {
+		ndoms = 0;
+		goto done;
+	}
+
+	/*
+	 * The rest of the code, including the scheduler, can deal with
+	 * dattr==NULL case. No need to abort if alloc fails.
+	 */
 	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
+		cpumask_t *dp;
 		int apn = a->pn;

-		if (apn >= 0) {
-			cpumask_t *dp = doms + nslot;
-
-			if (nslot == ndoms) {
-				static int warnings = 10;
-				if (warnings) {
-					printk(KERN_WARNING
-					 "rebuild_sched_domains confused:"
-					  " nslot %d, ndoms %d, csn %d, i %d,"
-					  " apn %d\n",
-					  nslot, ndoms, csn, i, apn);
-					warnings--;
-				}
-				continue;
-			}
-
-			cpus_clear(*dp);
-			if (dattr)
-				*(dattr + nslot) = SD_ATTR_INIT;
-			for (j = i; j < csn; j++) {
-				struct cpuset *b = csa[j];
-
-				if (apn == b->pn) {
-					cpus_or(*dp, *dp, b->cpus_allowed);
-					b->pn = -1;
-					if (dattr)
-						update_domain_attr_tree(dattr
-								   + nslot, b);
-				}
-			}
-			nslot++;
+		if (apn < 0) {
+			/* Skip completed partitions */
+			continue;
 		}
+
+		dp = doms + nslot;
+
+		if (nslot == ndoms) {
+			static int warnings = 10;
+			if (warnings) {
+				printk(KERN_WARNING
+					"rebuild_sched_domains confused:"
+					" nslot %d, ndoms %d, csn %d, i %d,"
+					" apn %d\n",
+					nslot, ndoms, csn, i, apn);
+				warnings--;
+			}
+			continue;
+		}
+
+		cpus_clear(*dp);
+		if (dattr)
+			*(dattr + nslot) = SD_ATTR_INIT;
+		for (j = i; j < csn; j++) {
+			struct cpuset *b = csa[j];
+
+			if (apn == b->pn) {
+				cpus_or(*dp, *dp, b->cpus_allowed);
+				if (dattr)
+					update_domain_attr_tree(dattr + nslot, b);
+
+				/* Done with this partition */
+				b->pn = -1;
+			}
+		}
+		nslot++;
 	}
 	BUG_ON(nslot != ndoms);

-rebuild:
-	/* Have scheduler rebuild sched domains */
-	get_online_cpus();
-	partition_sched_domains(ndoms, doms, dattr);
-	put_online_cpus();
-
 done:
 	kfree(csa);
-	/* Don't kfree(doms) -- partition_sched_domains() does that. */
-	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
+
+	*domains    = doms;
+	*attributes = dattr;
+	return ndoms;
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
+	get_online_cpus();
+
+	/* Generate domain masks and attrs */
+	cgroup_lock();
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);
+
+	put_online_cpus();
+}
+
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
+
+/*
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+	schedule_work(&rebuild_sched_domains_work);
+}
+
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+	do_rebuild_sched_domains(NULL);
 }

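The ABBA avoidance described in the comment above is the crux of this rework: the rebuild path must take cgroup_lock() inside get_online_cpus(), while the cpuset write paths already hold cgroup_lock() when they decide a rebuild is needed. Below is a minimal sketch of the same deferral pattern in isolation, with hypothetical locks lock_a/lock_b and a hypothetical request_rebuild() standing in for get_online_cpus()/cgroup_lock() and async_rebuild_sched_domains(); it is not code from this patch.

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(lock_a);	/* stands in for get_online_cpus() */
static DEFINE_MUTEX(lock_b);	/* stands in for cgroup_lock() */

/* Runs in workqueue context, so it is free to take A then B. */
static void rebuild_fn(struct work_struct *unused)
{
	mutex_lock(&lock_a);
	mutex_lock(&lock_b);
	/* ... regenerate and install state here ... */
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
}

static DECLARE_WORK(rebuild_work, rebuild_fn);

/*
 * Called with lock_b already held.  Taking lock_a here would create
 * a B->A ordering that can deadlock against rebuild_fn()'s A->B
 * ordering, so defer the rebuild to the work item instead.
 */
static void request_rebuild(void)
{
	schedule_work(&rebuild_work);
}

Note that schedule_work() makes the rebuild asynchronous: by the time the work item runs, the cpuset change that triggered it has already dropped its locks and returned.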
 /**
···
 		return retval;

 	if (is_load_balanced)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();
 	return 0;
 }
···
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
 		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-			rebuild_sched_domains();
+			async_rebuild_sched_domains();
 	}

 	return 0;
···
 	mutex_unlock(&callback_mutex);

 	if (cpus_nonempty && balance_flag_changed)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();

 	return 0;
 }
···
 	default:
 		BUG();
 	}
+
+	/* Unreachable but makes gcc happy */
+	return 0;
 }

 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
···
 	default:
 		BUG();
 	}
+
+	/* Unreachable but makes gcc happy */
+	return 0;
 }

···
 }

 /*
- * Locking note on the strange update_flag() call below:
- *
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains(). The get_online_cpus()
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex. Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls. So the reverse nesting would risk an
- * ABBA deadlock.
+ * will call async_rebuild_sched_domains().
  */

 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
···
 struct cgroup_subsys cpuset_subsys = {
 	.name = "cpuset",
 	.create = cpuset_create,
-	.destroy  = cpuset_destroy,
+	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
···
 }

 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets. If this removes the
  * last CPU or node from a cpuset, then move the tasks in the empty
···
 }

 /*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here. We code it as a single common routine
- * in order to minimize text size.
- */
-
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-	cgroup_lock();
-
-	top_cpuset.cpus_allowed = cpu_online_map;
-	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-	scan_for_empty_cpusets(&top_cpuset);
-
-	/*
-	 * Scheduler destroys domains on hotplug events.
-	 * Rebuild them based on the current settings.
-	 */
-	if (rebuild_sd)
-		rebuild_sched_domains();
-
-	cgroup_unlock();
-}
-
-/*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period. This is necessary in order to make cpusets transparent
  * (of no affect) on systems that are actively using CPU hotplug
···
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
  */
-
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
 	switch (phase) {
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		common_cpu_mem_hotplug_unplug(1);
 		break;
+
 	default:
 		return NOTIFY_DONE;
 	}
+
+	cgroup_lock();
+	top_cpuset.cpus_allowed = cpu_online_map;
+	scan_for_empty_cpusets(&top_cpuset);
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);

 	return NOTIFY_OK;
 }
···
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
- * node_states[N_HIGH_MEMORY].
- * See also the previous routine cpuset_handle_cpuhp().
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
  */
-
 void cpuset_track_online_nodes(void)
 {
-	common_cpu_mem_hotplug_unplug(0);
+	cgroup_lock();
+	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
+	cgroup_unlock();
 }
 #endif
···
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

-	hotcpu_notifier(cpuset_handle_cpuhp, 0);
+	hotcpu_notifier(cpuset_track_online_cpus, 0);
 }

 /**
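To make the 'partial partition' construction above concrete: suppose three cpusets have sched_load_balance enabled, with cpus_allowed masks {0,1}, {1,2} and {4,5}. The first two overlap, so they must share one sched domain; the result is two disjoint domains, {0,1,2} and {4,5}. CPU 3 belongs to no load-balanced cpuset and is left out of the union, which is what makes the partition partial. Below is a standalone sketch of that merge in plain userspace C with hypothetical data, mirroring the pn partition numbers used by generate_sched_domains(); it is not code from this patch.

#include <stdio.h>

int main(void)
{
	/* cpus_allowed of three load-balanced cpusets, as bitmasks */
	unsigned long cs[] = { 0x03, 0x06, 0x30 };	/* {0,1} {1,2} {4,5} */
	int n = 3, pn[3], i, j, k, merged;

	for (i = 0; i < n; i++)
		pn[i] = i;	/* each cpuset starts as its own partition */

	do {			/* merge partitions whose masks overlap */
		merged = 0;
		for (i = 0; i < n; i++)
			for (j = 0; j < n; j++)
				if (pn[i] != pn[j] && (cs[i] & cs[j])) {
					int old = pn[j];

					for (k = 0; k < n; k++)
						if (pn[k] == old)
							pn[k] = pn[i];
					merged = 1;
				}
	} while (merged);

	/* one sched domain per partition: prints 0x07, then 0x30 */
	for (i = 0; i < n; i++) {
		unsigned long dom = 0;

		if (pn[i] != i)	/* emit each partition at its label only */
			continue;
		for (j = 0; j < n; j++)
			if (pn[j] == pn[i])
				dom |= cs[j];
		printf("domain: 0x%02lx\n", dom);
	}
	return 0;
}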
kernel/sched.c (+13, -6)
···
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
-	int i, j;
+	int i, j, n;

 	mutex_lock(&sched_domains_mutex);

 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();

-	if (doms_new == NULL)
-		ndoms_new = 0;
+	n = doms_new ? ndoms_new : 0;

 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < ndoms_new; j++) {
+		for (j = 0; j < n; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
···

 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		ndoms_new = 1;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 		dattr_new = NULL;
···
 int arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
 	rebuild_sched_domains();
 	put_online_cpus();
+
 	return 0;
 }
···
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		partition_sched_domains(0, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;

 	default:
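Taken together, these changes give partition_sched_domains() three distinct calling conventions. The sketch below summarizes them; it is illustrative only, since generate_sched_domains() is static to kernel/cpuset.c, so the third pairing can only occur inside the cpuset code, and the hypothetical wrapper function is not part of this patch.

static void sketch_of_rebuild_paths(void)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	/* 1) ndoms_new == 0: destroy all current domains and create
	 *    nothing; arch_reinit_sched_domains() uses this to force
	 *    a full rebuild. */
	partition_sched_domains(0, NULL, NULL);

	/* 2) doms_new == NULL with ndoms_new != 0: fall back to one
	 *    domain covering cpu_online_map minus cpu_isolated_map;
	 *    this is what the sched.c hotplug notifier now requests. */
	partition_sched_domains(1, NULL, NULL);

	/* 3) The cpuset path: install a generated partial partition.
	 *    The scheduler takes ownership of doms/attr (see the
	 *    deleted "Don't kfree(doms)" comments above), so the
	 *    caller must not free them. */
	ndoms = generate_sched_domains(&doms, &attr);
	partition_sched_domains(ndoms, doms, attr);
}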