Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Fix cpusets update_cpumask

Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call
set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection
of batches of tasks to be processed without duplicating or missing any
tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed

- fix race between update_cpumask() and sched_setaffinity() by making
sched_setaffinity() post-check that it's not running on any cpus outside
cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Paul Menage; committed by Linus Torvalds. Commit 8707d8b8 (parent 020958b6).

+243 -5
+58
include/linux/prio_heap.h
··· 1 + #ifndef _LINUX_PRIO_HEAP_H 2 + #define _LINUX_PRIO_HEAP_H 3 + 4 + /* 5 + * Simple insertion-only static-sized priority heap containing 6 + * pointers, based on CLR, chapter 7 7 + */ 8 + 9 + #include <linux/gfp.h> 10 + 11 + /** 12 + * struct ptr_heap - simple static-sized priority heap 13 + * @ptrs - pointer to data area 14 + * @max - max number of elements that can be stored in @ptrs 15 + * @size - current number of valid elements in @ptrs (in the range 0..@size-1 16 + * @gt: comparison operator, which should implement "greater than" 17 + */ 18 + struct ptr_heap { 19 + void **ptrs; 20 + int max; 21 + int size; 22 + int (*gt)(void *, void *); 23 + }; 24 + 25 + /** 26 + * heap_init - initialize an empty heap with a given memory size 27 + * @heap: the heap structure to be initialized 28 + * @size: amount of memory to use in bytes 29 + * @gfp_mask: mask to pass to kmalloc() 30 + * @gt: comparison operator, which should implement "greater than" 31 + */ 32 + extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, 33 + int (*gt)(void *, void *)); 34 + 35 + /** 36 + * heap_free - release a heap's storage 37 + * @heap: the heap structure whose data should be released 38 + */ 39 + void heap_free(struct ptr_heap *heap); 40 + 41 + /** 42 + * heap_insert - insert a value into the heap and return any overflowed value 43 + * @heap: the heap to be operated on 44 + * @p: the pointer to be inserted 45 + * 46 + * Attempts to insert the given value into the priority heap. If the 47 + * heap is full prior to the insertion, then the resulting heap will 48 + * consist of the smallest @max elements of the original heap and the 49 + * new element; the greatest element will be removed from the heap and 50 + * returned. Note that the returned element will be the new element 51 + * (i.e. no change to the heap) if the new element is greater than all 52 + * elements currently in the heap. 
53 + */ 54 + extern void *heap_insert(struct ptr_heap *heap, void *p); 55 + 56 + 57 + 58 + #endif /* _LINUX_PRIO_HEAP_H */
+101 -4
kernel/cpuset.c
··· 38 38 #include <linux/mount.h> 39 39 #include <linux/namei.h> 40 40 #include <linux/pagemap.h> 41 + #include <linux/prio_heap.h> 41 42 #include <linux/proc_fs.h> 42 43 #include <linux/rcupdate.h> 43 44 #include <linux/sched.h> ··· 702 701 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 703 702 } 704 703 704 + static inline int started_after_time(struct task_struct *t1, 705 + struct timespec *time, 706 + struct task_struct *t2) 707 + { 708 + int start_diff = timespec_compare(&t1->start_time, time); 709 + if (start_diff > 0) { 710 + return 1; 711 + } else if (start_diff < 0) { 712 + return 0; 713 + } else { 714 + /* 715 + * Arbitrarily, if two processes started at the same 716 + * time, we'll say that the lower pointer value 717 + * started first. Note that t2 may have exited by now 718 + * so this may not be a valid pointer any longer, but 719 + * that's fine - it still serves to distinguish 720 + * between two tasks started (effectively) 721 + * simultaneously. 722 + */ 723 + return t1 > t2; 724 + } 725 + } 726 + 727 + static inline int started_after(void *p1, void *p2) 728 + { 729 + struct task_struct *t1 = p1; 730 + struct task_struct *t2 = p2; 731 + return started_after_time(t1, &t2->start_time, t2); 732 + } 733 + 705 734 /* 706 735 * Call with manage_mutex held. May take callback_mutex during call. 
707 736 */ ··· 739 708 static int update_cpumask(struct cpuset *cs, char *buf) 740 709 { 741 710 struct cpuset trialcs; 742 - int retval; 743 - int cpus_changed, is_load_balanced; 711 + int retval, i; 712 + int is_load_balanced; 713 + struct cgroup_iter it; 714 + struct cgroup *cgrp = cs->css.cgroup; 715 + struct task_struct *p, *dropped; 716 + /* Never dereference latest_task, since it's not refcounted */ 717 + struct task_struct *latest_task = NULL; 718 + struct ptr_heap heap; 719 + struct timespec latest_time = { 0, 0 }; 744 720 745 721 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 746 722 if (cs == &top_cpuset) ··· 774 736 if (retval < 0) 775 737 return retval; 776 738 777 - cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 739 + /* Nothing to do if the cpus didn't change */ 740 + if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 741 + return 0; 742 + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 743 + if (retval) 744 + return retval; 745 + 778 746 is_load_balanced = is_sched_load_balance(&trialcs); 779 747 780 748 mutex_lock(&callback_mutex); 781 749 cs->cpus_allowed = trialcs.cpus_allowed; 782 750 mutex_unlock(&callback_mutex); 783 751 784 - if (cpus_changed && is_load_balanced) 752 + again: 753 + /* 754 + * Scan tasks in the cpuset, and update the cpumasks of any 755 + * that need an update. Since we can't call set_cpus_allowed() 756 + * while holding tasklist_lock, gather tasks to be processed 757 + * in a heap structure. If the statically-sized heap fills up, 758 + * overflow tasks that started later, and in future iterations 759 + * only consider tasks that started after the latest task in 760 + * the previous pass. 
This guarantees forward progress and 761 + * that we don't miss any tasks 762 + */ 763 + heap.size = 0; 764 + cgroup_iter_start(cgrp, &it); 765 + while ((p = cgroup_iter_next(cgrp, &it))) { 766 + /* Only affect tasks that don't have the right cpus_allowed */ 767 + if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) 768 + continue; 769 + /* 770 + * Only process tasks that started after the last task 771 + * we processed 772 + */ 773 + if (!started_after_time(p, &latest_time, latest_task)) 774 + continue; 775 + dropped = heap_insert(&heap, p); 776 + if (dropped == NULL) { 777 + get_task_struct(p); 778 + } else if (dropped != p) { 779 + get_task_struct(p); 780 + put_task_struct(dropped); 781 + } 782 + } 783 + cgroup_iter_end(cgrp, &it); 784 + if (heap.size) { 785 + for (i = 0; i < heap.size; i++) { 786 + struct task_struct *p = heap.ptrs[i]; 787 + if (i == 0) { 788 + latest_time = p->start_time; 789 + latest_task = p; 790 + } 791 + set_cpus_allowed(p, cs->cpus_allowed); 792 + put_task_struct(p); 793 + } 794 + /* 795 + * If we had to process any tasks at all, scan again 796 + * in case some of them were in the middle of forking 797 + * children that didn't notice the new cpumask 798 + * restriction. Not the most efficient way to do it, 799 + * but it avoids having to take callback_mutex in the 800 + * fork path 801 + */ 802 + goto again; 803 + } 804 + heap_free(&heap); 805 + if (is_load_balanced) 785 806 rebuild_sched_domains(); 786 807 787 808 return 0;
+13
kernel/sched.c
··· 4471 4471 4472 4472 cpus_allowed = cpuset_cpus_allowed(p); 4473 4473 cpus_and(new_mask, new_mask, cpus_allowed); 4474 + again: 4474 4475 retval = set_cpus_allowed(p, new_mask); 4475 4476 4477 + if (!retval) { 4478 + cpus_allowed = cpuset_cpus_allowed(p); 4479 + if (!cpus_subset(new_mask, cpus_allowed)) { 4480 + /* 4481 + * We must have raced with a concurrent cpuset 4482 + * update. Just reset the cpus_allowed to the 4483 + * cpuset's cpus_allowed 4484 + */ 4485 + new_mask = cpus_allowed; 4486 + goto again; 4487 + } 4488 + } 4476 4489 out_unlock: 4477 4490 put_task_struct(p); 4478 4491 mutex_unlock(&sched_hotcpu_mutex);
+1 -1
lib/Makefile
··· 6 6 rbtree.o radix-tree.o dump_stack.o \ 7 7 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 8 8 sha1.o irq_regs.o reciprocal_div.o argv_split.o \ 9 - proportions.o 9 + proportions.o prio_heap.o 10 10 11 11 lib-$(CONFIG_MMU) += ioremap.o 12 12 lib-$(CONFIG_SMP) += cpumask.o
+70
lib/prio_heap.c
··· 1 + /* 2 + * Simple insertion-only static-sized priority heap containing 3 + * pointers, based on CLR, chapter 7 4 + */ 5 + 6 + #include <linux/slab.h> 7 + #include <linux/prio_heap.h> 8 + 9 + int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, 10 + int (*gt)(void *, void *)) 11 + { 12 + heap->ptrs = kmalloc(size, gfp_mask); 13 + if (!heap->ptrs) 14 + return -ENOMEM; 15 + heap->size = 0; 16 + heap->max = size / sizeof(void *); 17 + heap->gt = gt; 18 + return 0; 19 + } 20 + 21 + void heap_free(struct ptr_heap *heap) 22 + { 23 + kfree(heap->ptrs); 24 + } 25 + 26 + void *heap_insert(struct ptr_heap *heap, void *p) 27 + { 28 + void *res; 29 + void **ptrs = heap->ptrs; 30 + int pos; 31 + 32 + if (heap->size < heap->max) { 33 + /* Heap insertion */ 34 + int pos = heap->size++; 35 + while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) { 36 + ptrs[pos] = ptrs[(pos-1)/2]; 37 + pos = (pos-1)/2; 38 + } 39 + ptrs[pos] = p; 40 + return NULL; 41 + } 42 + 43 + /* The heap is full, so something will have to be dropped */ 44 + 45 + /* If the new pointer is greater than the current max, drop it */ 46 + if (heap->gt(p, ptrs[0])) 47 + return p; 48 + 49 + /* Replace the current max and heapify */ 50 + res = ptrs[0]; 51 + ptrs[0] = p; 52 + pos = 0; 53 + 54 + while (1) { 55 + int left = 2 * pos + 1; 56 + int right = 2 * pos + 2; 57 + int largest = pos; 58 + if (left < heap->size && heap->gt(ptrs[left], p)) 59 + largest = left; 60 + if (right < heap->size && heap->gt(ptrs[right], ptrs[largest])) 61 + largest = right; 62 + if (largest == pos) 63 + break; 64 + /* Push p down the heap one level and bump one up */ 65 + ptrs[pos] = ptrs[largest]; 66 + ptrs[largest] = p; 67 + pos = largest; 68 + } 69 + return res; 70 + }