Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull percpu updates from Tejun Heo:
"A lot of activity on the percpu front. Notable changes are...

- percpu allocator now can take @gfp. If @gfp doesn't contain
GFP_KERNEL, it tries to allocate from what's already available to
the allocator and a work item tries to keep the reserve around
certain level so that these atomic allocations usually succeed.

This will replace the ad-hoc percpu memory pool used by
blk-throttle and also be used by the planned blkcg support for
writeback IOs.

Please note that I noticed a bug in how @gfp is interpreted while
preparing this pull request and applied the fix 6ae833c7fe0c
("percpu: fix how @gfp is interpreted by the percpu allocator")
just now.

- percpu_ref now uses longs for percpu and global counters instead of
ints. It leads to more sparse packing of the percpu counters on
64bit machines but the overhead should be negligible and this
allows using percpu_ref for refcounting pages and in-memory objects
directly.

- The switching between percpu and single counter modes of a
percpu_ref is made independent of putting the base ref and a
percpu_ref can now optionally be initialized in single or killed
mode. This allows avoiding percpu shutdown latency for cases where
the refcounted objects may be synchronously created and destroyed
in rapid succession with only a fraction of them reaching fully
operational status (SCSI probing does this when combined with
blk-mq support). It's also planned to be used to implement forced
single mode to detect underflow more timely for debugging.

There's a separate branch percpu/for-3.18-consistent-ops which cleans
up the duplicate percpu accessors. That branch causes a number of
conflicts with s390 and other trees. I'll send a separate pull
request w/ resolutions once other branches are merged"

* 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (33 commits)
percpu: fix how @gfp is interpreted by the percpu allocator
blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode
percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky
percpu_ref: add PERCPU_REF_INIT_* flags
percpu_ref: decouple switching to percpu mode and reinit
percpu_ref: decouple switching to atomic mode and killing
percpu_ref: add PCPU_REF_DEAD
percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch
percpu_ref: replace pcpu_ prefix with percpu_
percpu_ref: minor code and comment updates
percpu_ref: relocate percpu_ref_reinit()
Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"
Revert "percpu: free percpu allocation info for uniprocessor system"
percpu-refcount: make percpu_ref based on longs instead of ints
percpu-refcount: improve WARN messages
percpu: fix locking regression in the failure path of pcpu_alloc()
percpu-refcount: add @gfp to percpu_ref_init()
proportions: add @gfp to init functions
percpu_counter: add @gfp to percpu_counter_init()
percpu_counter: make percpu_counters_lock irq-safe
...

+883 -448
+1 -1
arch/x86/kvm/mmu.c
··· 4549 4549 if (!mmu_page_header_cache) 4550 4550 goto nomem; 4551 4551 4552 - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) 4552 + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) 4553 4553 goto nomem; 4554 4554 4555 4555 register_shrinker(&mmu_shrinker);
+6
block/blk-mq-sysfs.c
··· 402 402 } 403 403 } 404 404 405 + /* see blk_register_queue() */ 406 + void blk_mq_finish_init(struct request_queue *q) 407 + { 408 + percpu_ref_switch_to_percpu(&q->mq_usage_counter); 409 + } 410 + 405 411 int blk_mq_register_disk(struct gendisk *disk) 406 412 { 407 413 struct device *dev = disk_to_dev(disk);
+7 -11
block/blk-mq.c
··· 119 119 spin_unlock_irq(q->queue_lock); 120 120 121 121 if (freeze) { 122 - /* 123 - * XXX: Temporary kludge to work around SCSI blk-mq stall. 124 - * SCSI synchronously creates and destroys many queues 125 - * back-to-back during probe leading to lengthy stalls. 126 - * This will be fixed by keeping ->mq_usage_counter in 127 - * atomic mode until genhd registration, but, for now, 128 - * let's work around using expedited synchronization. 129 - */ 130 - __percpu_ref_kill_expedited(&q->mq_usage_counter); 131 - 122 + percpu_ref_kill(&q->mq_usage_counter); 132 123 blk_mq_run_queues(q, false); 133 124 } 134 125 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); ··· 1795 1804 if (!q) 1796 1805 goto err_hctxs; 1797 1806 1798 - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) 1807 + /* 1808 + * Init percpu_ref in atomic mode so that it's faster to shutdown. 1809 + * See blk_register_queue() for details. 1810 + */ 1811 + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, 1812 + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 1799 1813 goto err_map; 1800 1814 1801 1815 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+9 -2
block/blk-sysfs.c
··· 551 551 return -ENXIO; 552 552 553 553 /* 554 - * Initialization must be complete by now. Finish the initial 555 - * bypass from queue allocation. 554 + * SCSI probing may synchronously create and destroy a lot of 555 + * request_queues for non-existent devices. Shutting down a fully 556 + * functional queue takes measureable wallclock time as RCU grace 557 + * periods are involved. To avoid excessive latency in these 558 + * cases, a request_queue starts out in a degraded mode which is 559 + * faster to shut down and is made fully functional here as 560 + * request_queues for non-existent devices never get registered. 556 561 */ 557 562 if (!blk_queue_init_done(q)) { 558 563 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 559 564 blk_queue_bypass_end(q); 565 + if (q->mq_ops) 566 + blk_mq_finish_init(q); 560 567 } 561 568 562 569 ret = blk_trace_init_sysfs(dev);
+2 -1
drivers/target/target_core_tpg.c
··· 819 819 { 820 820 int ret; 821 821 822 - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); 822 + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, 823 + GFP_KERNEL); 823 824 if (ret < 0) 824 825 return ret; 825 826
+2 -2
fs/aio.c
··· 661 661 662 662 INIT_LIST_HEAD(&ctx->active_reqs); 663 663 664 - if (percpu_ref_init(&ctx->users, free_ioctx_users)) 664 + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) 665 665 goto err; 666 666 667 - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) 667 + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) 668 668 goto err; 669 669 670 670 ctx->cpu = alloc_percpu(struct kioctx_cpu);
+4 -4
fs/btrfs/disk-io.c
··· 1183 1183 if (!writers) 1184 1184 return ERR_PTR(-ENOMEM); 1185 1185 1186 - ret = percpu_counter_init(&writers->counter, 0); 1186 + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); 1187 1187 if (ret < 0) { 1188 1188 kfree(writers); 1189 1189 return ERR_PTR(ret); ··· 2188 2188 goto fail_srcu; 2189 2189 } 2190 2190 2191 - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); 2191 + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); 2192 2192 if (ret) { 2193 2193 err = ret; 2194 2194 goto fail_bdi; ··· 2196 2196 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * 2197 2197 (1 + ilog2(nr_cpu_ids)); 2198 2198 2199 - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); 2199 + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); 2200 2200 if (ret) { 2201 2201 err = ret; 2202 2202 goto fail_dirty_metadata_bytes; 2203 2203 } 2204 2204 2205 - ret = percpu_counter_init(&fs_info->bio_counter, 0); 2205 + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); 2206 2206 if (ret) { 2207 2207 err = ret; 2208 2208 goto fail_delalloc_bytes;
+1 -1
fs/btrfs/extent-tree.c
··· 3494 3494 if (!found) 3495 3495 return -ENOMEM; 3496 3496 3497 - ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3497 + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3498 3498 if (ret) { 3499 3499 kfree(found); 3500 3500 return ret;
+3 -3
fs/ext2/super.c
··· 1067 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1068 1068 1069 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1070 - ext2_count_free_blocks(sb)); 1070 + ext2_count_free_blocks(sb), GFP_KERNEL); 1071 1071 if (!err) { 1072 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1073 - ext2_count_free_inodes(sb)); 1073 + ext2_count_free_inodes(sb), GFP_KERNEL); 1074 1074 } 1075 1075 if (!err) { 1076 1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1077 - ext2_count_dirs(sb)); 1077 + ext2_count_dirs(sb), GFP_KERNEL); 1078 1078 } 1079 1079 if (err) { 1080 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
+3 -3
fs/ext3/super.c
··· 2039 2039 goto failed_mount2; 2040 2040 } 2041 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2042 - ext3_count_free_blocks(sb)); 2042 + ext3_count_free_blocks(sb), GFP_KERNEL); 2043 2043 if (!err) { 2044 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2045 - ext3_count_free_inodes(sb)); 2045 + ext3_count_free_inodes(sb), GFP_KERNEL); 2046 2046 } 2047 2047 if (!err) { 2048 2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2049 - ext3_count_dirs(sb)); 2049 + ext3_count_dirs(sb), GFP_KERNEL); 2050 2050 } 2051 2051 if (err) { 2052 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+9 -5
fs/ext4/super.c
··· 3892 3892 /* Register extent status tree shrinker */ 3893 3893 ext4_es_register_shrinker(sbi); 3894 3894 3895 - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3895 + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); 3896 + if (err) { 3896 3897 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 3898 goto failed_mount3; 3898 3899 } ··· 4107 4106 block = ext4_count_free_clusters(sb); 4108 4107 ext4_free_blocks_count_set(sbi->s_es, 4109 4108 EXT4_C2B(sbi, block)); 4110 - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); 4109 + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 4110 + GFP_KERNEL); 4111 4111 if (!err) { 4112 4112 unsigned long freei = ext4_count_free_inodes(sb); 4113 4113 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4114 - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); 4114 + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 4115 + GFP_KERNEL); 4115 4116 } 4116 4117 if (!err) 4117 4118 err = percpu_counter_init(&sbi->s_dirs_counter, 4118 - ext4_count_dirs(sb)); 4119 + ext4_count_dirs(sb), GFP_KERNEL); 4119 4120 if (!err) 4120 - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); 4121 + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 4122 + GFP_KERNEL); 4121 4123 if (err) { 4122 4124 ext4_msg(sb, KERN_ERR, "insufficient memory"); 4123 4125 goto failed_mount6;
+1 -1
fs/file_table.c
··· 331 331 332 332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 333 333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 334 - percpu_counter_init(&nr_files, 0); 334 + percpu_counter_init(&nr_files, 0, GFP_KERNEL); 335 335 }
+1 -1
fs/quota/dquot.c
··· 2725 2725 panic("Cannot create dquot hash table"); 2726 2726 2727 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2728 - ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); 2729 2729 if (ret) 2730 2730 panic("Cannot create dquot stat counters"); 2731 2731 }
+2 -1
fs/super.c
··· 175 175 goto fail; 176 176 177 177 for (i = 0; i < SB_FREEZE_LEVELS; i++) { 178 - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) 178 + if (percpu_counter_init(&s->s_writers.counter[i], 0, 179 + GFP_KERNEL) < 0) 179 180 goto fail; 180 181 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], 181 182 &type->s_writers_key[i], 0);
+1
include/linux/blk-mq.h
··· 140 140 }; 141 141 142 142 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 143 + void blk_mq_finish_init(struct request_queue *q); 143 144 int blk_mq_register_disk(struct gendisk *); 144 145 void blk_mq_unregister_disk(struct gendisk *); 145 146
+3 -2
include/linux/flex_proportions.h
··· 10 10 #include <linux/percpu_counter.h> 11 11 #include <linux/spinlock.h> 12 12 #include <linux/seqlock.h> 13 + #include <linux/gfp.h> 13 14 14 15 /* 15 16 * When maximum proportion of some event type is specified, this is the ··· 33 32 seqcount_t sequence; 34 33 }; 35 34 36 - int fprop_global_init(struct fprop_global *p); 35 + int fprop_global_init(struct fprop_global *p, gfp_t gfp); 37 36 void fprop_global_destroy(struct fprop_global *p); 38 37 bool fprop_new_period(struct fprop_global *p, int periods); 39 38 ··· 80 79 raw_spinlock_t lock; /* Protect period and numerator */ 81 80 }; 82 81 83 - int fprop_local_init_percpu(struct fprop_local_percpu *pl); 82 + int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); 84 83 void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); 85 84 void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); 86 85 void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+80 -42
include/linux/percpu-refcount.h
··· 13 13 * 14 14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less 15 15 * than an atomic_t - this is because of the way shutdown works, see 16 - * percpu_ref_kill()/PCPU_COUNT_BIAS. 16 + * percpu_ref_kill()/PERCPU_COUNT_BIAS. 17 17 * 18 18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the 19 19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() ··· 49 49 #include <linux/kernel.h> 50 50 #include <linux/percpu.h> 51 51 #include <linux/rcupdate.h> 52 + #include <linux/gfp.h> 52 53 53 54 struct percpu_ref; 54 55 typedef void (percpu_ref_func_t)(struct percpu_ref *); 55 56 57 + /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ 58 + enum { 59 + __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ 60 + __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ 61 + __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, 62 + 63 + __PERCPU_REF_FLAG_BITS = 2, 64 + }; 65 + 66 + /* @flags for percpu_ref_init() */ 67 + enum { 68 + /* 69 + * Start w/ ref == 1 in atomic mode. Can be switched to percpu 70 + * operation using percpu_ref_switch_to_percpu(). If initialized 71 + * with this flag, the ref will stay in atomic mode until 72 + * percpu_ref_switch_to_percpu() is invoked on it. 73 + */ 74 + PERCPU_REF_INIT_ATOMIC = 1 << 0, 75 + 76 + /* 77 + * Start dead w/ ref == 0 in atomic mode. Must be revived with 78 + * percpu_ref_reinit() before used. Implies INIT_ATOMIC. 79 + */ 80 + PERCPU_REF_INIT_DEAD = 1 << 1, 81 + }; 82 + 56 83 struct percpu_ref { 57 - atomic_t count; 84 + atomic_long_t count; 58 85 /* 59 86 * The low bit of the pointer indicates whether the ref is in percpu 60 87 * mode; if set, then get/put will manipulate the atomic_t. 
61 88 */ 62 - unsigned long pcpu_count_ptr; 89 + unsigned long percpu_count_ptr; 63 90 percpu_ref_func_t *release; 64 - percpu_ref_func_t *confirm_kill; 91 + percpu_ref_func_t *confirm_switch; 92 + bool force_atomic:1; 65 93 struct rcu_head rcu; 66 94 }; 67 95 68 96 int __must_check percpu_ref_init(struct percpu_ref *ref, 69 - percpu_ref_func_t *release); 70 - void percpu_ref_reinit(struct percpu_ref *ref); 97 + percpu_ref_func_t *release, unsigned int flags, 98 + gfp_t gfp); 71 99 void percpu_ref_exit(struct percpu_ref *ref); 100 + void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 101 + percpu_ref_func_t *confirm_switch); 102 + void percpu_ref_switch_to_percpu(struct percpu_ref *ref); 72 103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 73 104 percpu_ref_func_t *confirm_kill); 74 - void __percpu_ref_kill_expedited(struct percpu_ref *ref); 105 + void percpu_ref_reinit(struct percpu_ref *ref); 75 106 76 107 /** 77 108 * percpu_ref_kill - drop the initial ref ··· 119 88 return percpu_ref_kill_and_confirm(ref, NULL); 120 89 } 121 90 122 - #define PCPU_REF_DEAD 1 123 - 124 91 /* 125 92 * Internal helper. Don't use outside percpu-refcount proper. The 126 93 * function doesn't return the pointer and let the caller test it for NULL 127 94 * because doing so forces the compiler to generate two conditional 128 - * branches as it can't assume that @ref->pcpu_count is not NULL. 95 + * branches as it can't assume that @ref->percpu_count is not NULL. 
129 96 */ 130 - static inline bool __pcpu_ref_alive(struct percpu_ref *ref, 131 - unsigned __percpu **pcpu_countp) 97 + static inline bool __ref_is_percpu(struct percpu_ref *ref, 98 + unsigned long __percpu **percpu_countp) 132 99 { 133 - unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); 100 + unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); 134 101 135 102 /* paired with smp_store_release() in percpu_ref_reinit() */ 136 103 smp_read_barrier_depends(); 137 104 138 - if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) 105 + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) 139 106 return false; 140 107 141 - *pcpu_countp = (unsigned __percpu *)pcpu_ptr; 108 + *percpu_countp = (unsigned long __percpu *)percpu_ptr; 142 109 return true; 143 110 } 144 111 ··· 144 115 * percpu_ref_get - increment a percpu refcount 145 116 * @ref: percpu_ref to get 146 117 * 147 - * Analagous to atomic_inc(). 148 - */ 118 + * Analagous to atomic_long_inc(). 119 + * 120 + * This function is safe to call as long as @ref is between init and exit. 121 + */ 149 122 static inline void percpu_ref_get(struct percpu_ref *ref) 150 123 { 151 - unsigned __percpu *pcpu_count; 124 + unsigned long __percpu *percpu_count; 152 125 153 126 rcu_read_lock_sched(); 154 127 155 - if (__pcpu_ref_alive(ref, &pcpu_count)) 156 - this_cpu_inc(*pcpu_count); 128 + if (__ref_is_percpu(ref, &percpu_count)) 129 + this_cpu_inc(*percpu_count); 157 130 else 158 - atomic_inc(&ref->count); 131 + atomic_long_inc(&ref->count); 159 132 160 133 rcu_read_unlock_sched(); 161 134 } ··· 169 138 * Increment a percpu refcount unless its count already reached zero. 170 139 * Returns %true on success; %false on failure. 171 140 * 172 - * The caller is responsible for ensuring that @ref stays accessible. 141 + * This function is safe to call as long as @ref is between init and exit. 
173 142 */ 174 143 static inline bool percpu_ref_tryget(struct percpu_ref *ref) 175 144 { 176 - unsigned __percpu *pcpu_count; 177 - int ret = false; 145 + unsigned long __percpu *percpu_count; 146 + int ret; 178 147 179 148 rcu_read_lock_sched(); 180 149 181 - if (__pcpu_ref_alive(ref, &pcpu_count)) { 182 - this_cpu_inc(*pcpu_count); 150 + if (__ref_is_percpu(ref, &percpu_count)) { 151 + this_cpu_inc(*percpu_count); 183 152 ret = true; 184 153 } else { 185 - ret = atomic_inc_not_zero(&ref->count); 154 + ret = atomic_long_inc_not_zero(&ref->count); 186 155 } 187 156 188 157 rcu_read_unlock_sched(); ··· 197 166 * Increment a percpu refcount unless it has already been killed. Returns 198 167 * %true on success; %false on failure. 199 168 * 200 - * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget 201 - * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be 202 - * used. After the confirm_kill callback is invoked, it's guaranteed that 203 - * no new reference will be given out by percpu_ref_tryget(). 169 + * Completion of percpu_ref_kill() in itself doesn't guarantee that this 170 + * function will fail. For such guarantee, percpu_ref_kill_and_confirm() 171 + * should be used. After the confirm_kill callback is invoked, it's 172 + * guaranteed that no new reference will be given out by 173 + * percpu_ref_tryget_live(). 204 174 * 205 - * The caller is responsible for ensuring that @ref stays accessible. 175 + * This function is safe to call as long as @ref is between init and exit. 
206 176 */ 207 177 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) 208 178 { 209 - unsigned __percpu *pcpu_count; 179 + unsigned long __percpu *percpu_count; 210 180 int ret = false; 211 181 212 182 rcu_read_lock_sched(); 213 183 214 - if (__pcpu_ref_alive(ref, &pcpu_count)) { 215 - this_cpu_inc(*pcpu_count); 184 + if (__ref_is_percpu(ref, &percpu_count)) { 185 + this_cpu_inc(*percpu_count); 216 186 ret = true; 187 + } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { 188 + ret = atomic_long_inc_not_zero(&ref->count); 217 189 } 218 190 219 191 rcu_read_unlock_sched(); ··· 230 196 * 231 197 * Decrement the refcount, and if 0, call the release function (which was passed 232 198 * to percpu_ref_init()) 199 + * 200 + * This function is safe to call as long as @ref is between init and exit. 233 201 */ 234 202 static inline void percpu_ref_put(struct percpu_ref *ref) 235 203 { 236 - unsigned __percpu *pcpu_count; 204 + unsigned long __percpu *percpu_count; 237 205 238 206 rcu_read_lock_sched(); 239 207 240 - if (__pcpu_ref_alive(ref, &pcpu_count)) 241 - this_cpu_dec(*pcpu_count); 242 - else if (unlikely(atomic_dec_and_test(&ref->count))) 208 + if (__ref_is_percpu(ref, &percpu_count)) 209 + this_cpu_dec(*percpu_count); 210 + else if (unlikely(atomic_long_dec_and_test(&ref->count))) 243 211 ref->release(ref); 244 212 245 213 rcu_read_unlock_sched(); ··· 252 216 * @ref: percpu_ref to test 253 217 * 254 218 * Returns %true if @ref reached zero. 219 + * 220 + * This function is safe to call as long as @ref is between init and exit. 255 221 */ 256 222 static inline bool percpu_ref_is_zero(struct percpu_ref *ref) 257 223 { 258 - unsigned __percpu *pcpu_count; 224 + unsigned long __percpu *percpu_count; 259 225 260 - if (__pcpu_ref_alive(ref, &pcpu_count)) 226 + if (__ref_is_percpu(ref, &percpu_count)) 261 227 return false; 262 - return !atomic_read(&ref->count); 228 + return !atomic_long_read(&ref->count); 263 229 } 264 230 265 231 #endif
+9 -4
include/linux/percpu.h
··· 48 48 * intelligent way to determine this would be nice. 49 49 */ 50 50 #if BITS_PER_LONG > 32 51 - #define PERCPU_DYNAMIC_RESERVE (20 << 10) 51 + #define PERCPU_DYNAMIC_RESERVE (28 << 10) 52 52 #else 53 - #define PERCPU_DYNAMIC_RESERVE (12 << 10) 53 + #define PERCPU_DYNAMIC_RESERVE (20 << 10) 54 54 #endif 55 55 56 56 extern void *pcpu_base_addr; ··· 122 122 #endif 123 123 extern void __init percpu_init_late(void); 124 124 125 + extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); 125 126 extern void __percpu *__alloc_percpu(size_t size, size_t align); 126 127 extern void free_percpu(void __percpu *__pdata); 127 128 extern phys_addr_t per_cpu_ptr_to_phys(void *addr); 128 129 129 - #define alloc_percpu(type) \ 130 - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) 130 + #define alloc_percpu_gfp(type, gfp) \ 131 + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ 132 + __alignof__(type), gfp) 133 + #define alloc_percpu(type) \ 134 + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ 135 + __alignof__(type)) 131 136 132 137 #endif /* __LINUX_PERCPU_H */
+6 -4
include/linux/percpu_counter.h
··· 12 12 #include <linux/threads.h> 13 13 #include <linux/percpu.h> 14 14 #include <linux/types.h> 15 + #include <linux/gfp.h> 15 16 16 17 #ifdef CONFIG_SMP 17 18 ··· 27 26 28 27 extern int percpu_counter_batch; 29 28 30 - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 29 + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, 31 30 struct lock_class_key *key); 32 31 33 - #define percpu_counter_init(fbc, value) \ 32 + #define percpu_counter_init(fbc, value, gfp) \ 34 33 ({ \ 35 34 static struct lock_class_key __key; \ 36 35 \ 37 - __percpu_counter_init(fbc, value, &__key); \ 36 + __percpu_counter_init(fbc, value, gfp, &__key); \ 38 37 }) 39 38 40 39 void percpu_counter_destroy(struct percpu_counter *fbc); ··· 90 89 s64 count; 91 90 }; 92 91 93 - static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) 92 + static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, 93 + gfp_t gfp) 94 94 { 95 95 fbc->count = amount; 96 96 return 0;
+3 -2
include/linux/proportions.h
··· 12 12 #include <linux/percpu_counter.h> 13 13 #include <linux/spinlock.h> 14 14 #include <linux/mutex.h> 15 + #include <linux/gfp.h> 15 16 16 17 struct prop_global { 17 18 /* ··· 41 40 struct mutex mutex; /* serialize the prop_global switch */ 42 41 }; 43 42 44 - int prop_descriptor_init(struct prop_descriptor *pd, int shift); 43 + int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); 45 44 void prop_change_shift(struct prop_descriptor *pd, int new_shift); 46 45 47 46 /* ··· 62 61 raw_spinlock_t lock; /* protect the snapshot state */ 63 62 }; 64 63 65 - int prop_local_init_percpu(struct prop_local_percpu *pl); 64 + int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); 66 65 void prop_local_destroy_percpu(struct prop_local_percpu *pl); 67 66 void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); 68 67 void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
+1 -1
include/net/dst_ops.h
··· 63 63 64 64 static inline int dst_entries_init(struct dst_ops *dst) 65 65 { 66 - return percpu_counter_init(&dst->pcpuc_entries, 0); 66 + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); 67 67 } 68 68 69 69 static inline void dst_entries_destroy(struct dst_ops *dst)
+1 -1
include/net/inet_frag.h
··· 151 151 152 152 static inline void init_frag_mem_limit(struct netns_frags *nf) 153 153 { 154 - percpu_counter_init(&nf->mem, 0); 154 + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); 155 155 } 156 156 157 157 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
+4 -3
kernel/cgroup.c
··· 1607 1607 goto out; 1608 1608 root_cgrp->id = ret; 1609 1609 1610 - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1610 + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, 1611 + GFP_KERNEL); 1611 1612 if (ret) 1612 1613 goto out; 1613 1614 ··· 4483 4482 4484 4483 init_and_link_css(css, ss, cgrp); 4485 4484 4486 - err = percpu_ref_init(&css->refcnt, css_release); 4485 + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); 4487 4486 if (err) 4488 4487 goto err_free_css; 4489 4488 ··· 4556 4555 goto out_unlock; 4557 4556 } 4558 4557 4559 - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4558 + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); 4560 4559 if (ret) 4561 4560 goto out_free_cgrp; 4562 4561
+4 -4
lib/flex_proportions.c
··· 34 34 */ 35 35 #include <linux/flex_proportions.h> 36 36 37 - int fprop_global_init(struct fprop_global *p) 37 + int fprop_global_init(struct fprop_global *p, gfp_t gfp) 38 38 { 39 39 int err; 40 40 41 41 p->period = 0; 42 42 /* Use 1 to avoid dealing with periods with 0 events... */ 43 - err = percpu_counter_init(&p->events, 1); 43 + err = percpu_counter_init(&p->events, 1, gfp); 44 44 if (err) 45 45 return err; 46 46 seqcount_init(&p->sequence); ··· 168 168 */ 169 169 #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 170 170 171 - int fprop_local_init_percpu(struct fprop_local_percpu *pl) 171 + int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) 172 172 { 173 173 int err; 174 174 175 - err = percpu_counter_init(&pl->events, 0); 175 + err = percpu_counter_init(&pl->events, 0, gfp); 176 176 if (err) 177 177 return err; 178 178 pl->period = 0;
+218 -89
lib/percpu-refcount.c
··· 1 1 #define pr_fmt(fmt) "%s: " fmt "\n", __func__ 2 2 3 3 #include <linux/kernel.h> 4 + #include <linux/sched.h> 5 + #include <linux/wait.h> 4 6 #include <linux/percpu-refcount.h> 5 7 6 8 /* ··· 13 11 * percpu counters will all sum to the correct value 14 12 * 15 13 * (More precisely: because moduler arithmatic is commutative the sum of all the 16 - * pcpu_count vars will be equal to what it would have been if all the gets and 17 - * puts were done to a single integer, even if some of the percpu integers 14 + * percpu_count vars will be equal to what it would have been if all the gets 15 + * and puts were done to a single integer, even if some of the percpu integers 18 16 * overflow or underflow). 19 17 * 20 18 * The real trick to implementing percpu refcounts is shutdown. We can't detect ··· 27 25 * works. 28 26 * 29 27 * Converting to non percpu mode is done with some RCUish stuff in 30 - * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t 31 - * can't hit 0 before we've added up all the percpu refs. 28 + * percpu_ref_kill. Additionally, we need a bias value so that the 29 + * atomic_long_t can't hit 0 before we've added up all the percpu refs. 
32 30 */ 33 31 34 - #define PCPU_COUNT_BIAS (1U << 31) 32 + #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) 35 33 36 - static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) 34 + static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); 35 + 36 + static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) 37 37 { 38 - return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 38 + return (unsigned long __percpu *) 39 + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); 39 40 } 40 41 41 42 /** 42 43 * percpu_ref_init - initialize a percpu refcount 43 44 * @ref: percpu_ref to initialize 44 45 * @release: function which will be called when refcount hits 0 46 + * @flags: PERCPU_REF_INIT_* flags 47 + * @gfp: allocation mask to use 45 48 * 46 - * Initializes the refcount in single atomic counter mode with a refcount of 1; 47 - * analagous to atomic_set(ref, 1). 49 + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a 50 + * refcount of 1; analagous to atomic_long_set(ref, 1). See the 51 + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. 48 52 * 49 53 * Note that @release must not sleep - it may potentially be called from RCU 50 54 * callback context by percpu_ref_kill(). 
51 55 */ 52 - int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) 56 + int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, 57 + unsigned int flags, gfp_t gfp) 53 58 { 54 - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 59 + size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, 60 + __alignof__(unsigned long)); 61 + unsigned long start_count = 0; 55 62 56 - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); 57 - if (!ref->pcpu_count_ptr) 63 + ref->percpu_count_ptr = (unsigned long) 64 + __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); 65 + if (!ref->percpu_count_ptr) 58 66 return -ENOMEM; 67 + 68 + ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; 69 + 70 + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) 71 + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; 72 + else 73 + start_count += PERCPU_COUNT_BIAS; 74 + 75 + if (flags & PERCPU_REF_INIT_DEAD) 76 + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; 77 + else 78 + start_count++; 79 + 80 + atomic_long_set(&ref->count, start_count); 59 81 60 82 ref->release = release; 61 83 return 0; 62 84 } 63 85 EXPORT_SYMBOL_GPL(percpu_ref_init); 64 - 65 - /** 66 - * percpu_ref_reinit - re-initialize a percpu refcount 67 - * @ref: perpcu_ref to re-initialize 68 - * 69 - * Re-initialize @ref so that it's in the same state as when it finished 70 - * percpu_ref_init(). @ref must have been initialized successfully, killed 71 - * and reached 0 but not exited. 72 - * 73 - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while 74 - * this function is in progress. 75 - */ 76 - void percpu_ref_reinit(struct percpu_ref *ref) 77 - { 78 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 79 - int cpu; 80 - 81 - BUG_ON(!pcpu_count); 82 - WARN_ON(!percpu_ref_is_zero(ref)); 83 - 84 - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 85 - 86 - /* 87 - * Restore per-cpu operation. 
smp_store_release() is paired with 88 - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees 89 - * that the zeroing is visible to all percpu accesses which can see 90 - * the following PCPU_REF_DEAD clearing. 91 - */ 92 - for_each_possible_cpu(cpu) 93 - *per_cpu_ptr(pcpu_count, cpu) = 0; 94 - 95 - smp_store_release(&ref->pcpu_count_ptr, 96 - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 97 - } 98 - EXPORT_SYMBOL_GPL(percpu_ref_reinit); 99 86 100 87 /** 101 88 * percpu_ref_exit - undo percpu_ref_init() ··· 98 107 */ 99 108 void percpu_ref_exit(struct percpu_ref *ref) 100 109 { 101 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 110 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 102 111 103 - if (pcpu_count) { 104 - free_percpu(pcpu_count); 105 - ref->pcpu_count_ptr = PCPU_REF_DEAD; 112 + if (percpu_count) { 113 + free_percpu(percpu_count); 114 + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; 106 115 } 107 116 } 108 117 EXPORT_SYMBOL_GPL(percpu_ref_exit); 109 118 110 - static void percpu_ref_kill_rcu(struct rcu_head *rcu) 119 + static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) 111 120 { 112 121 struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); 113 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 114 - unsigned count = 0; 122 + 123 + ref->confirm_switch(ref); 124 + ref->confirm_switch = NULL; 125 + wake_up_all(&percpu_ref_switch_waitq); 126 + 127 + /* drop ref from percpu_ref_switch_to_atomic() */ 128 + percpu_ref_put(ref); 129 + } 130 + 131 + static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) 132 + { 133 + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); 134 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 135 + unsigned long count = 0; 115 136 int cpu; 116 137 117 138 for_each_possible_cpu(cpu) 118 - count += *per_cpu_ptr(pcpu_count, cpu); 139 + count += *per_cpu_ptr(percpu_count, cpu); 119 140 120 - pr_debug("global %i pcpu %i", 
atomic_read(&ref->count), (int) count); 141 + pr_debug("global %ld percpu %ld", 142 + atomic_long_read(&ref->count), (long)count); 121 143 122 144 /* 123 145 * It's crucial that we sum the percpu counters _before_ adding the sum ··· 144 140 * reaching 0 before we add the percpu counts. But doing it at the same 145 141 * time is equivalent and saves us atomic operations: 146 142 */ 143 + atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); 147 144 148 - atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); 145 + WARN_ONCE(atomic_long_read(&ref->count) <= 0, 146 + "percpu ref (%pf) <= 0 (%ld) after switching to atomic", 147 + ref->release, atomic_long_read(&ref->count)); 149 148 150 - WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", 151 - atomic_read(&ref->count)); 149 + /* @ref is viewed as dead on all CPUs, send out switch confirmation */ 150 + percpu_ref_call_confirm_rcu(rcu); 151 + } 152 152 153 - /* @ref is viewed as dead on all CPUs, send out kill confirmation */ 154 - if (ref->confirm_kill) 155 - ref->confirm_kill(ref); 153 + static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) 154 + { 155 + } 156 + 157 + static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, 158 + percpu_ref_func_t *confirm_switch) 159 + { 160 + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { 161 + /* switching from percpu to atomic */ 162 + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; 163 + 164 + /* 165 + * Non-NULL ->confirm_switch is used to indicate that 166 + * switching is in progress. Use noop one if unspecified. 167 + */ 168 + WARN_ON_ONCE(ref->confirm_switch); 169 + ref->confirm_switch = 170 + confirm_switch ?: percpu_ref_noop_confirm_switch; 171 + 172 + percpu_ref_get(ref); /* put after confirmation */ 173 + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); 174 + } else if (confirm_switch) { 175 + /* 176 + * Somebody already set ATOMIC. Switching may still be in 177 + * progress. 
@confirm_switch must be invoked after the 178 + * switching is complete and a full sched RCU grace period 179 + * has passed. Wait synchronously for the previous 180 + * switching and schedule @confirm_switch invocation. 181 + */ 182 + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); 183 + ref->confirm_switch = confirm_switch; 184 + 185 + percpu_ref_get(ref); /* put after confirmation */ 186 + call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); 187 + } 188 + } 189 + 190 + /** 191 + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode 192 + * @ref: percpu_ref to switch to atomic mode 193 + * @confirm_switch: optional confirmation callback 194 + * 195 + * There's no reason to use this function for the usual reference counting. 196 + * Use percpu_ref_kill[_and_confirm](). 197 + * 198 + * Schedule switching of @ref to atomic mode. All its percpu counts will 199 + * be collected to the main atomic counter. On completion, when all CPUs 200 + * are guaraneed to be in atomic mode, @confirm_switch, which may not 201 + * block, is invoked. This function may be invoked concurrently with all 202 + * the get/put operations and can safely be mixed with kill and reinit 203 + * operations. Note that @ref will stay in atomic mode across kill/reinit 204 + * cycles until percpu_ref_switch_to_percpu() is called. 205 + * 206 + * This function normally doesn't block and can be called from any context 207 + * but it may block if @confirm_kill is specified and @ref is already in 208 + * the process of switching to atomic mode. In such cases, @confirm_switch 209 + * will be invoked after the switching is complete. 210 + * 211 + * Due to the way percpu_ref is implemented, @confirm_switch will be called 212 + * after at least one full sched RCU grace period has passed but this is an 213 + * implementation detail and must not be depended upon. 
214 + */ 215 + void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 216 + percpu_ref_func_t *confirm_switch) 217 + { 218 + ref->force_atomic = true; 219 + __percpu_ref_switch_to_atomic(ref, confirm_switch); 220 + } 221 + 222 + static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) 223 + { 224 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 225 + int cpu; 226 + 227 + BUG_ON(!percpu_count); 228 + 229 + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) 230 + return; 231 + 232 + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); 233 + 234 + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); 156 235 157 236 /* 158 - * Now we're in single atomic_t mode with a consistent refcount, so it's 159 - * safe to drop our initial ref: 237 + * Restore per-cpu operation. smp_store_release() is paired with 238 + * smp_read_barrier_depends() in __ref_is_percpu() and guarantees 239 + * that the zeroing is visible to all percpu accesses which can see 240 + * the following __PERCPU_REF_ATOMIC clearing. 160 241 */ 161 - percpu_ref_put(ref); 242 + for_each_possible_cpu(cpu) 243 + *per_cpu_ptr(percpu_count, cpu) = 0; 244 + 245 + smp_store_release(&ref->percpu_count_ptr, 246 + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); 247 + } 248 + 249 + /** 250 + * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode 251 + * @ref: percpu_ref to switch to percpu mode 252 + * 253 + * There's no reason to use this function for the usual reference counting. 254 + * To re-use an expired ref, use percpu_ref_reinit(). 255 + * 256 + * Switch @ref to percpu mode. This function may be invoked concurrently 257 + * with all the get/put operations and can safely be mixed with kill and 258 + * reinit operations. This function reverses the sticky atomic state set 259 + * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is 260 + * dying or dead, the actual switching takes place on the following 261 + * percpu_ref_reinit(). 
262 + * 263 + * This function normally doesn't block and can be called from any context 264 + * but it may block if @ref is in the process of switching to atomic mode 265 + * by percpu_ref_switch_atomic(). 266 + */ 267 + void percpu_ref_switch_to_percpu(struct percpu_ref *ref) 268 + { 269 + ref->force_atomic = false; 270 + 271 + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ 272 + if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) 273 + __percpu_ref_switch_to_percpu(ref); 162 274 } 163 275 164 276 /** ··· 284 164 * 285 165 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if 286 166 * @confirm_kill is not NULL. @confirm_kill, which may not block, will be 287 - * called after @ref is seen as dead from all CPUs - all further 288 - * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() 289 - * for more details. 167 + * called after @ref is seen as dead from all CPUs at which point all 168 + * further invocations of percpu_ref_tryget_live() will fail. See 169 + * percpu_ref_tryget_live() for details. 290 170 * 291 - * Due to the way percpu_ref is implemented, @confirm_kill will be called 292 - * after at least one full RCU grace period has passed but this is an 293 - * implementation detail and callers must not depend on it. 171 + * This function normally doesn't block and can be called from any context 172 + * but it may block if @confirm_kill is specified and @ref is in the 173 + * process of switching to atomic mode by percpu_ref_switch_atomic(). 174 + * 175 + * Due to the way percpu_ref is implemented, @confirm_switch will be called 176 + * after at least one full sched RCU grace period has passed but this is an 177 + * implementation detail and must not be depended upon. 
294 178 */ 295 179 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 296 180 percpu_ref_func_t *confirm_kill) 297 181 { 298 - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 299 - "percpu_ref_kill() called more than once!\n"); 182 + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, 183 + "%s called more than once on %pf!", __func__, ref->release); 300 184 301 - ref->pcpu_count_ptr |= PCPU_REF_DEAD; 302 - ref->confirm_kill = confirm_kill; 303 - 304 - call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); 185 + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; 186 + __percpu_ref_switch_to_atomic(ref, confirm_kill); 187 + percpu_ref_put(ref); 305 188 } 306 189 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); 307 190 308 - /* 309 - * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by 310 - * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 311 - * devel cycle. Do not use anywhere else. 191 + /** 192 + * percpu_ref_reinit - re-initialize a percpu refcount 193 + * @ref: perpcu_ref to re-initialize 194 + * 195 + * Re-initialize @ref so that it's in the same state as when it finished 196 + * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been 197 + * initialized successfully and reached 0 but not exited. 198 + * 199 + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while 200 + * this function is in progress. 
312 201 */ 313 - void __percpu_ref_kill_expedited(struct percpu_ref *ref) 202 + void percpu_ref_reinit(struct percpu_ref *ref) 314 203 { 315 - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 316 - "percpu_ref_kill() called more than once on %pf!", 317 - ref->release); 204 + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); 318 205 319 - ref->pcpu_count_ptr |= PCPU_REF_DEAD; 320 - synchronize_sched_expedited(); 321 - percpu_ref_kill_rcu(&ref->rcu); 206 + ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; 207 + percpu_ref_get(ref); 208 + if (!ref->force_atomic) 209 + __percpu_ref_switch_to_percpu(ref); 322 210 } 211 + EXPORT_SYMBOL_GPL(percpu_ref_reinit);
+12 -8
lib/percpu_counter.c
··· 112 112 } 113 113 EXPORT_SYMBOL(__percpu_counter_sum); 114 114 115 - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 115 + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, 116 116 struct lock_class_key *key) 117 117 { 118 + unsigned long flags __maybe_unused; 119 + 118 120 raw_spin_lock_init(&fbc->lock); 119 121 lockdep_set_class(&fbc->lock, key); 120 122 fbc->count = amount; 121 - fbc->counters = alloc_percpu(s32); 123 + fbc->counters = alloc_percpu_gfp(s32, gfp); 122 124 if (!fbc->counters) 123 125 return -ENOMEM; 124 126 ··· 128 126 129 127 #ifdef CONFIG_HOTPLUG_CPU 130 128 INIT_LIST_HEAD(&fbc->list); 131 - spin_lock(&percpu_counters_lock); 129 + spin_lock_irqsave(&percpu_counters_lock, flags); 132 130 list_add(&fbc->list, &percpu_counters); 133 - spin_unlock(&percpu_counters_lock); 131 + spin_unlock_irqrestore(&percpu_counters_lock, flags); 134 132 #endif 135 133 return 0; 136 134 } ··· 138 136 139 137 void percpu_counter_destroy(struct percpu_counter *fbc) 140 138 { 139 + unsigned long flags __maybe_unused; 140 + 141 141 if (!fbc->counters) 142 142 return; 143 143 144 144 debug_percpu_counter_deactivate(fbc); 145 145 146 146 #ifdef CONFIG_HOTPLUG_CPU 147 - spin_lock(&percpu_counters_lock); 147 + spin_lock_irqsave(&percpu_counters_lock, flags); 148 148 list_del(&fbc->list); 149 - spin_unlock(&percpu_counters_lock); 149 + spin_unlock_irqrestore(&percpu_counters_lock, flags); 150 150 #endif 151 151 free_percpu(fbc->counters); 152 152 fbc->counters = NULL; ··· 177 173 return NOTIFY_OK; 178 174 179 175 cpu = (unsigned long)hcpu; 180 - spin_lock(&percpu_counters_lock); 176 + spin_lock_irq(&percpu_counters_lock); 181 177 list_for_each_entry(fbc, &percpu_counters, list) { 182 178 s32 *pcount; 183 179 unsigned long flags; ··· 188 184 *pcount = 0; 189 185 raw_spin_unlock_irqrestore(&fbc->lock, flags); 190 186 } 191 - spin_unlock(&percpu_counters_lock); 187 + spin_unlock_irq(&percpu_counters_lock); 192 188 #endif 193 189 
return NOTIFY_OK; 194 190 }
+5 -5
lib/proportions.c
··· 73 73 #include <linux/proportions.h> 74 74 #include <linux/rcupdate.h> 75 75 76 - int prop_descriptor_init(struct prop_descriptor *pd, int shift) 76 + int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) 77 77 { 78 78 int err; 79 79 ··· 83 83 pd->index = 0; 84 84 pd->pg[0].shift = shift; 85 85 mutex_init(&pd->mutex); 86 - err = percpu_counter_init(&pd->pg[0].events, 0); 86 + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); 87 87 if (err) 88 88 goto out; 89 89 90 - err = percpu_counter_init(&pd->pg[1].events, 0); 90 + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); 91 91 if (err) 92 92 percpu_counter_destroy(&pd->pg[0].events); 93 93 ··· 188 188 189 189 #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 190 190 191 - int prop_local_init_percpu(struct prop_local_percpu *pl) 191 + int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) 192 192 { 193 193 raw_spin_lock_init(&pl->lock); 194 194 pl->shift = 0; 195 195 pl->period = 0; 196 - return percpu_counter_init(&pl->events, 0); 196 + return percpu_counter_init(&pl->events, 0, gfp); 197 197 } 198 198 199 199 void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+2 -2
mm/backing-dev.c
··· 455 455 bdi_wb_init(&bdi->wb, bdi); 456 456 457 457 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 458 - err = percpu_counter_init(&bdi->bdi_stat[i], 0); 458 + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); 459 459 if (err) 460 460 goto err; 461 461 } ··· 470 470 bdi->write_bandwidth = INIT_BW; 471 471 bdi->avg_write_bandwidth = INIT_BW; 472 472 473 - err = fprop_local_init_percpu(&bdi->completions); 473 + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); 474 474 475 475 if (err) { 476 476 err:
+1 -1
mm/mmap.c
··· 3202 3202 { 3203 3203 int ret; 3204 3204 3205 - ret = percpu_counter_init(&vm_committed_as, 0); 3205 + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 3206 3206 VM_BUG_ON(ret); 3207 3207 } 3208 3208
+1 -1
mm/nommu.c
··· 539 539 { 540 540 int ret; 541 541 542 - ret = percpu_counter_init(&vm_committed_as, 0); 542 + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 543 543 VM_BUG_ON(ret); 544 544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 545 545 }
+1 -1
mm/page-writeback.c
··· 1777 1777 writeback_set_ratelimit(); 1778 1778 register_cpu_notifier(&ratelimit_nb); 1779 1779 1780 - fprop_global_init(&writeout_completions); 1780 + fprop_global_init(&writeout_completions, GFP_KERNEL); 1781 1781 } 1782 1782 1783 1783 /**
+9 -7
mm/percpu-km.c
··· 33 33 34 34 #include <linux/log2.h> 35 35 36 - static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36 + static int pcpu_populate_chunk(struct pcpu_chunk *chunk, 37 + int page_start, int page_end) 37 38 { 38 - unsigned int cpu; 39 - 40 - for_each_possible_cpu(cpu) 41 - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 42 - 43 39 return 0; 44 40 } 45 41 46 - static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 42 + static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, 43 + int page_start, int page_end) 47 44 { 48 45 /* nada */ 49 46 } ··· 67 70 68 71 chunk->data = pages; 69 72 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 73 + 74 + spin_lock_irq(&pcpu_lock); 75 + pcpu_chunk_populated(chunk, 0, nr_pages); 76 + spin_unlock_irq(&pcpu_lock); 77 + 70 78 return chunk; 71 79 } 72 80
+35 -127
mm/percpu-vm.c
··· 20 20 } 21 21 22 22 /** 23 - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap 23 + * pcpu_get_pages - get temp pages array 24 24 * @chunk: chunk of interest 25 - * @bitmapp: output parameter for bitmap 26 - * @may_alloc: may allocate the array 27 25 * 28 - * Returns pointer to array of pointers to struct page and bitmap, 29 - * both of which can be indexed with pcpu_page_idx(). The returned 30 - * array is cleared to zero and *@bitmapp is copied from 31 - * @chunk->populated. Note that there is only one array and bitmap 32 - * and access exclusion is the caller's responsibility. 33 - * 34 - * CONTEXT: 35 - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. 36 - * Otherwise, don't care. 26 + * Returns pointer to array of pointers to struct page which can be indexed 27 + * with pcpu_page_idx(). Note that there is only one array and accesses 28 + * should be serialized by pcpu_alloc_mutex. 37 29 * 38 30 * RETURNS: 39 - * Pointer to temp pages array on success, NULL on failure. 31 + * Pointer to temp pages array on success. 
40 32 */ 41 - static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, 42 - unsigned long **bitmapp, 43 - bool may_alloc) 33 + static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) 44 34 { 45 35 static struct page **pages; 46 - static unsigned long *bitmap; 47 36 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 48 - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * 49 - sizeof(unsigned long); 50 37 51 - if (!pages || !bitmap) { 52 - if (may_alloc && !pages) 53 - pages = pcpu_mem_zalloc(pages_size); 54 - if (may_alloc && !bitmap) 55 - bitmap = pcpu_mem_zalloc(bitmap_size); 56 - if (!pages || !bitmap) 57 - return NULL; 58 - } 38 + lockdep_assert_held(&pcpu_alloc_mutex); 59 39 60 - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); 61 - 62 - *bitmapp = bitmap; 40 + if (!pages) 41 + pages = pcpu_mem_zalloc(pages_size); 63 42 return pages; 64 43 } 65 44 ··· 46 67 * pcpu_free_pages - free pages which were allocated for @chunk 47 68 * @chunk: chunk pages were allocated for 48 69 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 49 - * @populated: populated bitmap 50 70 * @page_start: page index of the first page to be freed 51 71 * @page_end: page index of the last page to be freed + 1 52 72 * ··· 53 75 * The pages were allocated for @chunk. 
54 76 */ 55 77 static void pcpu_free_pages(struct pcpu_chunk *chunk, 56 - struct page **pages, unsigned long *populated, 57 - int page_start, int page_end) 78 + struct page **pages, int page_start, int page_end) 58 79 { 59 80 unsigned int cpu; 60 81 int i; ··· 72 95 * pcpu_alloc_pages - allocates pages for @chunk 73 96 * @chunk: target chunk 74 97 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 75 - * @populated: populated bitmap 76 98 * @page_start: page index of the first page to be allocated 77 99 * @page_end: page index of the last page to be allocated + 1 78 100 * ··· 80 104 * content of @pages and will pass it verbatim to pcpu_map_pages(). 81 105 */ 82 106 static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 83 - struct page **pages, unsigned long *populated, 84 - int page_start, int page_end) 107 + struct page **pages, int page_start, int page_end) 85 108 { 86 109 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 87 110 unsigned int cpu, tcpu; ··· 139 164 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 140 165 * @chunk: chunk of interest 141 166 * @pages: pages array which can be used to pass information to free 142 - * @populated: populated bitmap 143 167 * @page_start: page index of the first page to unmap 144 168 * @page_end: page index of the last page to unmap + 1 145 169 * ··· 149 175 * proper pre/post flush functions. 
150 176 */ 151 177 static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 152 - struct page **pages, unsigned long *populated, 153 - int page_start, int page_end) 178 + struct page **pages, int page_start, int page_end) 154 179 { 155 180 unsigned int cpu; 156 181 int i; ··· 165 192 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 166 193 page_end - page_start); 167 194 } 168 - 169 - bitmap_clear(populated, page_start, page_end - page_start); 170 195 } 171 196 172 197 /** ··· 199 228 * pcpu_map_pages - map pages into a pcpu_chunk 200 229 * @chunk: chunk of interest 201 230 * @pages: pages array containing pages to be mapped 202 - * @populated: populated bitmap 203 231 * @page_start: page index of the first page to map 204 232 * @page_end: page index of the last page to map + 1 205 233 * ··· 206 236 * caller is responsible for calling pcpu_post_map_flush() after all 207 237 * mappings are complete. 208 238 * 209 - * This function is responsible for setting corresponding bits in 210 - * @chunk->populated bitmap and whatever is necessary for reverse 211 - * lookup (addr -> chunk). 239 + * This function is responsible for setting up whatever is necessary for 240 + * reverse lookup (addr -> chunk). 
212 241 */ 213 242 static int pcpu_map_pages(struct pcpu_chunk *chunk, 214 - struct page **pages, unsigned long *populated, 215 - int page_start, int page_end) 243 + struct page **pages, int page_start, int page_end) 216 244 { 217 245 unsigned int cpu, tcpu; 218 246 int i, err; ··· 221 253 page_end - page_start); 222 254 if (err < 0) 223 255 goto err; 224 - } 225 256 226 - /* mapping successful, link chunk and mark populated */ 227 - for (i = page_start; i < page_end; i++) { 228 - for_each_possible_cpu(cpu) 257 + for (i = page_start; i < page_end; i++) 229 258 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 230 259 chunk); 231 - __set_bit(i, populated); 232 260 } 233 - 234 261 return 0; 235 - 236 262 err: 237 263 for_each_possible_cpu(tcpu) { 238 264 if (tcpu == cpu) ··· 261 299 /** 262 300 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 263 301 * @chunk: chunk of interest 264 - * @off: offset to the area to populate 265 - * @size: size of the area to populate in bytes 302 + * @page_start: the start page 303 + * @page_end: the end page 266 304 * 267 305 * For each cpu, populate and map pages [@page_start,@page_end) into 268 - * @chunk. The area is cleared on return. 306 + * @chunk. 269 307 * 270 308 * CONTEXT: 271 309 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
272 310 */ 273 - static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 311 + static int pcpu_populate_chunk(struct pcpu_chunk *chunk, 312 + int page_start, int page_end) 274 313 { 275 - int page_start = PFN_DOWN(off); 276 - int page_end = PFN_UP(off + size); 277 - int free_end = page_start, unmap_end = page_start; 278 314 struct page **pages; 279 - unsigned long *populated; 280 - unsigned int cpu; 281 - int rs, re, rc; 282 315 283 - /* quick path, check whether all pages are already there */ 284 - rs = page_start; 285 - pcpu_next_pop(chunk, &rs, &re, page_end); 286 - if (rs == page_start && re == page_end) 287 - goto clear; 288 - 289 - /* need to allocate and map pages, this chunk can't be immutable */ 290 - WARN_ON(chunk->immutable); 291 - 292 - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); 316 + pages = pcpu_get_pages(chunk); 293 317 if (!pages) 294 318 return -ENOMEM; 295 319 296 - /* alloc and map */ 297 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 298 - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); 299 - if (rc) 300 - goto err_free; 301 - free_end = re; 302 - } 320 + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) 321 + return -ENOMEM; 303 322 304 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 305 - rc = pcpu_map_pages(chunk, pages, populated, rs, re); 306 - if (rc) 307 - goto err_unmap; 308 - unmap_end = re; 323 + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { 324 + pcpu_free_pages(chunk, pages, page_start, page_end); 325 + return -ENOMEM; 309 326 } 310 327 pcpu_post_map_flush(chunk, page_start, page_end); 311 328 312 - /* commit new bitmap */ 313 - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); 314 - clear: 315 - for_each_possible_cpu(cpu) 316 - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 317 329 return 0; 318 - 319 - err_unmap: 320 - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); 321 - 
pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) 322 - pcpu_unmap_pages(chunk, pages, populated, rs, re); 323 - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); 324 - err_free: 325 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) 326 - pcpu_free_pages(chunk, pages, populated, rs, re); 327 - return rc; 328 330 } 329 331 330 332 /** 331 333 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 332 334 * @chunk: chunk to depopulate 333 - * @off: offset to the area to depopulate 334 - * @size: size of the area to depopulate in bytes 335 + * @page_start: the start page 336 + * @page_end: the end page 335 337 * 336 338 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 337 - * from @chunk. If @flush is true, vcache is flushed before unmapping 338 - * and tlb after. 339 + * from @chunk. 339 340 * 340 341 * CONTEXT: 341 342 * pcpu_alloc_mutex. 342 343 */ 343 - static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 344 + static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, 345 + int page_start, int page_end) 344 346 { 345 - int page_start = PFN_DOWN(off); 346 - int page_end = PFN_UP(off + size); 347 347 struct page **pages; 348 - unsigned long *populated; 349 - int rs, re; 350 - 351 - /* quick path, check whether it's empty already */ 352 - rs = page_start; 353 - pcpu_next_unpop(chunk, &rs, &re, page_end); 354 - if (rs == page_start && re == page_end) 355 - return; 356 - 357 - /* immutable chunks can't be depopulated */ 358 - WARN_ON(chunk->immutable); 359 348 360 349 /* 361 350 * If control reaches here, there must have been at least one 362 351 * successful population attempt so the temp pages array must 363 352 * be available now. 
364 353 */ 365 - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 354 + pages = pcpu_get_pages(chunk); 366 355 BUG_ON(!pages); 367 356 368 357 /* unmap and free */ 369 358 pcpu_pre_unmap_flush(chunk, page_start, page_end); 370 359 371 - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 372 - pcpu_unmap_pages(chunk, pages, populated, rs, re); 360 + pcpu_unmap_pages(chunk, pages, page_start, page_end); 373 361 374 362 /* no need to flush tlb, vmalloc will handle it lazily */ 375 363 376 - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 377 - pcpu_free_pages(chunk, pages, populated, rs, re); 378 - 379 - /* commit new bitmap */ 380 - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); 364 + pcpu_free_pages(chunk, pages, page_start, page_end); 381 365 } 382 366 383 367 static struct pcpu_chunk *pcpu_create_chunk(void)
+430 -102
mm/percpu.c
··· 76 76 77 77 #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 78 78 #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 79 + #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 80 + #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 81 + #define PCPU_EMPTY_POP_PAGES_LOW 2 82 + #define PCPU_EMPTY_POP_PAGES_HIGH 4 79 83 80 84 #ifdef CONFIG_SMP 81 85 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ ··· 106 102 int free_size; /* free bytes in the chunk */ 107 103 int contig_hint; /* max contiguous size hint */ 108 104 void *base_addr; /* base address of this chunk */ 105 + 109 106 int map_used; /* # of map entries used before the sentry */ 110 107 int map_alloc; /* # of map entries allocated */ 111 108 int *map; /* allocation map */ 109 + struct work_struct map_extend_work;/* async ->map[] extension */ 110 + 112 111 void *data; /* chunk data */ 113 112 int first_free; /* no free below this */ 114 113 bool immutable; /* no [de]population allowed */ 114 + int nr_populated; /* # of populated pages */ 115 115 unsigned long populated[]; /* populated bitmap */ 116 116 }; 117 117 ··· 159 151 static struct pcpu_chunk *pcpu_reserved_chunk; 160 152 static int pcpu_reserved_chunk_limit; 161 153 162 - /* 163 - * Synchronization rules. 164 - * 165 - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 166 - * protects allocation/reclaim paths, chunks, populated bitmap and 167 - * vmalloc mapping. The latter is a spinlock and protects the index 168 - * data structures - chunk slots, chunks and area maps in chunks. 169 - * 170 - * During allocation, pcpu_alloc_mutex is kept locked all the time and 171 - * pcpu_lock is grabbed and released as necessary. All actual memory 172 - * allocations are done using GFP_KERNEL with pcpu_lock released. In 173 - * general, percpu memory can't be allocated with irq off but 174 - * irqsave/restore are still used in alloc path so that it can be used 175 - * from early init path - sched_init() specifically. 
176 - * 177 - * Free path accesses and alters only the index data structures, so it 178 - * can be safely called from atomic context. When memory needs to be 179 - * returned to the system, free path schedules reclaim_work which 180 - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be 181 - * reclaimed, release both locks and frees the chunks. Note that it's 182 - * necessary to grab both locks to remove a chunk from circulation as 183 - * allocation path might be referencing the chunk with only 184 - * pcpu_alloc_mutex locked. 185 - */ 186 - static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 187 - static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 154 + static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ 155 + static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ 188 156 189 157 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 190 158 191 - /* reclaim work to release fully free chunks, scheduled from free path */ 192 - static void pcpu_reclaim(struct work_struct *work); 193 - static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 159 + /* 160 + * The number of empty populated pages, protected by pcpu_lock. The 161 + * reserved chunk doesn't contribute to the count. 162 + */ 163 + static int pcpu_nr_empty_pop_pages; 164 + 165 + /* 166 + * Balance work is used to populate or destroy chunks asynchronously. We 167 + * try to keep the number of populated free pages between 168 + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one 169 + * empty chunk. 
170 + */ 171 + static void pcpu_balance_workfn(struct work_struct *work); 172 + static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); 173 + static bool pcpu_async_enabled __read_mostly; 174 + static bool pcpu_atomic_alloc_failed; 175 + 176 + static void pcpu_schedule_balance_work(void) 177 + { 178 + if (pcpu_async_enabled) 179 + schedule_work(&pcpu_balance_work); 180 + } 194 181 195 182 static bool pcpu_addr_in_first_chunk(void *addr) 196 183 { ··· 318 315 } 319 316 320 317 /** 318 + * pcpu_count_occupied_pages - count the number of pages an area occupies 319 + * @chunk: chunk of interest 320 + * @i: index of the area in question 321 + * 322 + * Count the number of pages chunk's @i'th area occupies. When the area's 323 + * start and/or end address isn't aligned to page boundary, the straddled 324 + * page is included in the count iff the rest of the page is free. 325 + */ 326 + static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) 327 + { 328 + int off = chunk->map[i] & ~1; 329 + int end = chunk->map[i + 1] & ~1; 330 + 331 + if (!PAGE_ALIGNED(off) && i > 0) { 332 + int prev = chunk->map[i - 1]; 333 + 334 + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) 335 + off = round_down(off, PAGE_SIZE); 336 + } 337 + 338 + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { 339 + int next = chunk->map[i + 1]; 340 + int nend = chunk->map[i + 2] & ~1; 341 + 342 + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) 343 + end = round_up(end, PAGE_SIZE); 344 + } 345 + 346 + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); 347 + } 348 + 349 + /** 321 350 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 322 351 * @chunk: chunk of interest 323 352 * @oslot: the previous slot it was on ··· 377 342 /** 378 343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 379 344 * @chunk: chunk of interest 345 + * @is_atomic: the allocation context 380 346 * 381 - * Determine whether area map of @chunk needs to be 
extended to 382 - * accommodate a new allocation. 347 + * Determine whether area map of @chunk needs to be extended. If 348 + * @is_atomic, only the amount necessary for a new allocation is 349 + * considered; however, async extension is scheduled if the left amount is 350 + * low. If !@is_atomic, it aims for more empty space. Combined, this 351 + * ensures that the map is likely to have enough available space to 352 + * accomodate atomic allocations which can't extend maps directly. 383 353 * 384 354 * CONTEXT: 385 355 * pcpu_lock. ··· 393 353 * New target map allocation length if extension is necessary, 0 394 354 * otherwise. 395 355 */ 396 - static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 356 + static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) 397 357 { 398 - int new_alloc; 358 + int margin, new_alloc; 399 359 400 - if (chunk->map_alloc >= chunk->map_used + 3) 360 + if (is_atomic) { 361 + margin = 3; 362 + 363 + if (chunk->map_alloc < 364 + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && 365 + pcpu_async_enabled) 366 + schedule_work(&chunk->map_extend_work); 367 + } else { 368 + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; 369 + } 370 + 371 + if (chunk->map_alloc >= chunk->map_used + margin) 401 372 return 0; 402 373 403 374 new_alloc = PCPU_DFL_MAP_ALLOC; 404 - while (new_alloc < chunk->map_used + 3) 375 + while (new_alloc < chunk->map_used + margin) 405 376 new_alloc *= 2; 406 377 407 378 return new_alloc; ··· 469 418 return 0; 470 419 } 471 420 421 + static void pcpu_map_extend_workfn(struct work_struct *work) 422 + { 423 + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, 424 + map_extend_work); 425 + int new_alloc; 426 + 427 + spin_lock_irq(&pcpu_lock); 428 + new_alloc = pcpu_need_to_extend(chunk, false); 429 + spin_unlock_irq(&pcpu_lock); 430 + 431 + if (new_alloc) 432 + pcpu_extend_area_map(chunk, new_alloc); 433 + } 434 + 435 + /** 436 + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area 
437 + * @chunk: chunk the candidate area belongs to 438 + * @off: the offset to the start of the candidate area 439 + * @this_size: the size of the candidate area 440 + * @size: the size of the target allocation 441 + * @align: the alignment of the target allocation 442 + * @pop_only: only allocate from already populated region 443 + * 444 + * We're trying to allocate @size bytes aligned at @align. @chunk's area 445 + * at @off sized @this_size is a candidate. This function determines 446 + * whether the target allocation fits in the candidate area and returns the 447 + * number of bytes to pad after @off. If the target area doesn't fit, -1 448 + * is returned. 449 + * 450 + * If @pop_only is %true, this function only considers the already 451 + * populated part of the candidate area. 452 + */ 453 + static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, 454 + int size, int align, bool pop_only) 455 + { 456 + int cand_off = off; 457 + 458 + while (true) { 459 + int head = ALIGN(cand_off, align) - off; 460 + int page_start, page_end, rs, re; 461 + 462 + if (this_size < head + size) 463 + return -1; 464 + 465 + if (!pop_only) 466 + return head; 467 + 468 + /* 469 + * If the first unpopulated page is beyond the end of the 470 + * allocation, the whole allocation is populated; 471 + * otherwise, retry from the end of the unpopulated area. 
472 + */ 473 + page_start = PFN_DOWN(head + off); 474 + page_end = PFN_UP(head + off + size); 475 + 476 + rs = page_start; 477 + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); 478 + if (rs >= page_end) 479 + return head; 480 + cand_off = re * PAGE_SIZE; 481 + } 482 + } 483 + 472 484 /** 473 485 * pcpu_alloc_area - allocate area from a pcpu_chunk 474 486 * @chunk: chunk of interest 475 487 * @size: wanted size in bytes 476 488 * @align: wanted align 489 + * @pop_only: allocate only from the populated area 490 + * @occ_pages_p: out param for the number of pages the area occupies 477 491 * 478 492 * Try to allocate @size bytes area aligned at @align from @chunk. 479 493 * Note that this function only allocates the offset. It doesn't ··· 553 437 * Allocated offset in @chunk on success, -1 if no matching area is 554 438 * found. 555 439 */ 556 - static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 440 + static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, 441 + bool pop_only, int *occ_pages_p) 557 442 { 558 443 int oslot = pcpu_chunk_slot(chunk); 559 444 int max_contig = 0; ··· 570 453 if (off & 1) 571 454 continue; 572 455 573 - /* extra for alignment requirement */ 574 - head = ALIGN(off, align) - off; 575 - 576 456 this_size = (p[1] & ~1) - off; 577 - if (this_size < head + size) { 457 + 458 + head = pcpu_fit_in_area(chunk, off, this_size, size, align, 459 + pop_only); 460 + if (head < 0) { 578 461 if (!seen_free) { 579 462 chunk->first_free = i; 580 463 seen_free = true; ··· 643 526 chunk->free_size -= size; 644 527 *p |= 1; 645 528 529 + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 646 530 pcpu_chunk_relocate(chunk, oslot); 647 531 return off; 648 532 } ··· 659 541 * pcpu_free_area - free area to a pcpu_chunk 660 542 * @chunk: chunk of interest 661 543 * @freeme: offset of area to free 544 + * @occ_pages_p: out param for the number of pages the area occupies 662 545 * 663 546 * Free area starting from 
@freeme to @chunk. Note that this function 664 547 * only modifies the allocation map. It doesn't depopulate or unmap ··· 668 549 * CONTEXT: 669 550 * pcpu_lock. 670 551 */ 671 - static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 552 + static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, 553 + int *occ_pages_p) 672 554 { 673 555 int oslot = pcpu_chunk_slot(chunk); 674 556 int off = 0; ··· 699 579 p = chunk->map + i; 700 580 *p = off &= ~1; 701 581 chunk->free_size += (p[1] & ~1) - off; 582 + 583 + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 702 584 703 585 /* merge with next? */ 704 586 if (!(p[1] & 1)) ··· 742 620 chunk->map_used = 1; 743 621 744 622 INIT_LIST_HEAD(&chunk->list); 623 + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); 745 624 chunk->free_size = pcpu_unit_size; 746 625 chunk->contig_hint = pcpu_unit_size; 747 626 ··· 755 632 return; 756 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 757 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 635 + } 636 + 637 + /** 638 + * pcpu_chunk_populated - post-population bookkeeping 639 + * @chunk: pcpu_chunk which got populated 640 + * @page_start: the start page 641 + * @page_end: the end page 642 + * 643 + * Pages in [@page_start,@page_end) have been populated to @chunk. Update 644 + * the bookkeeping information accordingly. Must be called after each 645 + * successful population. 
646 + */ 647 + static void pcpu_chunk_populated(struct pcpu_chunk *chunk, 648 + int page_start, int page_end) 649 + { 650 + int nr = page_end - page_start; 651 + 652 + lockdep_assert_held(&pcpu_lock); 653 + 654 + bitmap_set(chunk->populated, page_start, nr); 655 + chunk->nr_populated += nr; 656 + pcpu_nr_empty_pop_pages += nr; 657 + } 658 + 659 + /** 660 + * pcpu_chunk_depopulated - post-depopulation bookkeeping 661 + * @chunk: pcpu_chunk which got depopulated 662 + * @page_start: the start page 663 + * @page_end: the end page 664 + * 665 + * Pages in [@page_start,@page_end) have been depopulated from @chunk. 666 + * Update the bookkeeping information accordingly. Must be called after 667 + * each successful depopulation. 668 + */ 669 + static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, 670 + int page_start, int page_end) 671 + { 672 + int nr = page_end - page_start; 673 + 674 + lockdep_assert_held(&pcpu_lock); 675 + 676 + bitmap_clear(chunk->populated, page_start, nr); 677 + chunk->nr_populated -= nr; 678 + pcpu_nr_empty_pop_pages -= nr; 758 679 } 759 680 760 681 /* ··· 862 695 * @size: size of area to allocate in bytes 863 696 * @align: alignment of area (max PAGE_SIZE) 864 697 * @reserved: allocate from the reserved chunk if available 698 + * @gfp: allocation flags 865 699 * 866 - * Allocate percpu area of @size bytes aligned at @align. 867 - * 868 - * CONTEXT: 869 - * Does GFP_KERNEL allocation. 700 + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't 701 + * contain %GFP_KERNEL, the allocation is atomic. 870 702 * 871 703 * RETURNS: 872 704 * Percpu pointer to the allocated area on success, NULL on failure. 
873 705 */ 874 - static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 706 + static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, 707 + gfp_t gfp) 875 708 { 876 709 static int warn_limit = 10; 877 710 struct pcpu_chunk *chunk; 878 711 const char *err; 879 - int slot, off, new_alloc; 712 + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; 713 + int occ_pages = 0; 714 + int slot, off, new_alloc, cpu, ret; 880 715 unsigned long flags; 881 716 void __percpu *ptr; 882 717 ··· 897 728 return NULL; 898 729 } 899 730 900 - mutex_lock(&pcpu_alloc_mutex); 901 731 spin_lock_irqsave(&pcpu_lock, flags); 902 732 903 733 /* serve reserved allocations from the reserved chunk if available */ ··· 908 740 goto fail_unlock; 909 741 } 910 742 911 - while ((new_alloc = pcpu_need_to_extend(chunk))) { 743 + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { 912 744 spin_unlock_irqrestore(&pcpu_lock, flags); 913 - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 745 + if (is_atomic || 746 + pcpu_extend_area_map(chunk, new_alloc) < 0) { 914 747 err = "failed to extend area map of reserved chunk"; 915 - goto fail_unlock_mutex; 748 + goto fail; 916 749 } 917 750 spin_lock_irqsave(&pcpu_lock, flags); 918 751 } 919 752 920 - off = pcpu_alloc_area(chunk, size, align); 753 + off = pcpu_alloc_area(chunk, size, align, is_atomic, 754 + &occ_pages); 921 755 if (off >= 0) 922 756 goto area_found; 923 757 ··· 934 764 if (size > chunk->contig_hint) 935 765 continue; 936 766 937 - new_alloc = pcpu_need_to_extend(chunk); 767 + new_alloc = pcpu_need_to_extend(chunk, is_atomic); 938 768 if (new_alloc) { 769 + if (is_atomic) 770 + continue; 939 771 spin_unlock_irqrestore(&pcpu_lock, flags); 940 772 if (pcpu_extend_area_map(chunk, 941 773 new_alloc) < 0) { 942 774 err = "failed to extend area map"; 943 - goto fail_unlock_mutex; 775 + goto fail; 944 776 } 945 777 spin_lock_irqsave(&pcpu_lock, flags); 946 778 /* ··· 952 780 goto restart; 953 781 } 954 782 
955 - off = pcpu_alloc_area(chunk, size, align); 783 + off = pcpu_alloc_area(chunk, size, align, is_atomic, 784 + &occ_pages); 956 785 if (off >= 0) 957 786 goto area_found; 958 787 } 959 788 } 960 789 961 - /* hmmm... no space left, create a new chunk */ 962 790 spin_unlock_irqrestore(&pcpu_lock, flags); 963 791 964 - chunk = pcpu_create_chunk(); 965 - if (!chunk) { 966 - err = "failed to allocate new chunk"; 967 - goto fail_unlock_mutex; 792 + /* 793 + * No space left. Create a new chunk. We don't want multiple 794 + * tasks to create chunks simultaneously. Serialize and create iff 795 + * there's still no empty chunk after grabbing the mutex. 796 + */ 797 + if (is_atomic) 798 + goto fail; 799 + 800 + mutex_lock(&pcpu_alloc_mutex); 801 + 802 + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { 803 + chunk = pcpu_create_chunk(); 804 + if (!chunk) { 805 + mutex_unlock(&pcpu_alloc_mutex); 806 + err = "failed to allocate new chunk"; 807 + goto fail; 808 + } 809 + 810 + spin_lock_irqsave(&pcpu_lock, flags); 811 + pcpu_chunk_relocate(chunk, -1); 812 + } else { 813 + spin_lock_irqsave(&pcpu_lock, flags); 968 814 } 969 815 970 - spin_lock_irqsave(&pcpu_lock, flags); 971 - pcpu_chunk_relocate(chunk, -1); 816 + mutex_unlock(&pcpu_alloc_mutex); 972 817 goto restart; 973 818 974 819 area_found: 975 820 spin_unlock_irqrestore(&pcpu_lock, flags); 976 821 977 - /* populate, map and clear the area */ 978 - if (pcpu_populate_chunk(chunk, off, size)) { 979 - spin_lock_irqsave(&pcpu_lock, flags); 980 - pcpu_free_area(chunk, off); 981 - err = "failed to populate"; 982 - goto fail_unlock; 822 + /* populate if not all pages are already there */ 823 + if (!is_atomic) { 824 + int page_start, page_end, rs, re; 825 + 826 + mutex_lock(&pcpu_alloc_mutex); 827 + 828 + page_start = PFN_DOWN(off); 829 + page_end = PFN_UP(off + size); 830 + 831 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 832 + WARN_ON(chunk->immutable); 833 + 834 + ret = pcpu_populate_chunk(chunk, rs, re); 
835 + 836 + spin_lock_irqsave(&pcpu_lock, flags); 837 + if (ret) { 838 + mutex_unlock(&pcpu_alloc_mutex); 839 + pcpu_free_area(chunk, off, &occ_pages); 840 + err = "failed to populate"; 841 + goto fail_unlock; 842 + } 843 + pcpu_chunk_populated(chunk, rs, re); 844 + spin_unlock_irqrestore(&pcpu_lock, flags); 845 + } 846 + 847 + mutex_unlock(&pcpu_alloc_mutex); 983 848 } 984 849 985 - mutex_unlock(&pcpu_alloc_mutex); 850 + if (chunk != pcpu_reserved_chunk) 851 + pcpu_nr_empty_pop_pages -= occ_pages; 986 852 987 - /* return address relative to base address */ 853 + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) 854 + pcpu_schedule_balance_work(); 855 + 856 + /* clear the areas and return address relative to base address */ 857 + for_each_possible_cpu(cpu) 858 + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 859 + 988 860 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 989 861 kmemleak_alloc_percpu(ptr, size); 990 862 return ptr; 991 863 992 864 fail_unlock: 993 865 spin_unlock_irqrestore(&pcpu_lock, flags); 994 - fail_unlock_mutex: 995 - mutex_unlock(&pcpu_alloc_mutex); 996 - if (warn_limit) { 997 - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 998 - "%s\n", size, align, err); 866 + fail: 867 + if (!is_atomic && warn_limit) { 868 + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", 869 + size, align, is_atomic, err); 999 870 dump_stack(); 1000 871 if (!--warn_limit) 1001 872 pr_info("PERCPU: limit reached, disable warning\n"); 1002 873 } 874 + if (is_atomic) { 875 + /* see the flag handling in pcpu_balance_workfn() */ 876 + pcpu_atomic_alloc_failed = true; 877 + pcpu_schedule_balance_work(); 878 + } 1003 879 return NULL; 1004 880 } 881 + 882 + /** 883 + * __alloc_percpu_gfp - allocate dynamic percpu area 884 + * @size: size of area to allocate in bytes 885 + * @align: alignment of area (max PAGE_SIZE) 886 + * @gfp: allocation flags 887 + * 888 + * Allocate zero-filled percpu area of @size bytes aligned 
at @align. If 889 + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can 890 + * be called from any context but is a lot more likely to fail. 891 + * 892 + * RETURNS: 893 + * Percpu pointer to the allocated area on success, NULL on failure. 894 + */ 895 + void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) 896 + { 897 + return pcpu_alloc(size, align, false, gfp); 898 + } 899 + EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); 1005 900 1006 901 /** 1007 902 * __alloc_percpu - allocate dynamic percpu area 1008 903 * @size: size of area to allocate in bytes 1009 904 * @align: alignment of area (max PAGE_SIZE) 1010 905 * 1011 - * Allocate zero-filled percpu area of @size bytes aligned at @align. 1012 - * Might sleep. Might trigger writeouts. 1013 - * 1014 - * CONTEXT: 1015 - * Does GFP_KERNEL allocation. 1016 - * 1017 - * RETURNS: 1018 - * Percpu pointer to the allocated area on success, NULL on failure. 906 + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). 1019 907 */ 1020 908 void __percpu *__alloc_percpu(size_t size, size_t align) 1021 909 { 1022 - return pcpu_alloc(size, align, false); 910 + return pcpu_alloc(size, align, false, GFP_KERNEL); 1023 911 } 1024 912 EXPORT_SYMBOL_GPL(__alloc_percpu); 1025 913 ··· 1101 869 */ 1102 870 void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1103 871 { 1104 - return pcpu_alloc(size, align, true); 872 + return pcpu_alloc(size, align, true, GFP_KERNEL); 1105 873 } 1106 874 1107 875 /** 1108 - * pcpu_reclaim - reclaim fully free chunks, workqueue function 876 + * pcpu_balance_workfn - manage the amount of free chunks and populated pages 1109 877 * @work: unused 1110 878 * 1111 879 * Reclaim all fully free chunks except for the first one. 1112 - * 1113 - * CONTEXT: 1114 - * workqueue context. 
1115 880 */ 1116 - static void pcpu_reclaim(struct work_struct *work) 881 + static void pcpu_balance_workfn(struct work_struct *work) 1117 882 { 1118 - LIST_HEAD(todo); 1119 - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 883 + LIST_HEAD(to_free); 884 + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; 1120 885 struct pcpu_chunk *chunk, *next; 886 + int slot, nr_to_pop, ret; 1121 887 888 + /* 889 + * There's no reason to keep around multiple unused chunks and VM 890 + * areas can be scarce. Destroy all free chunks except for one. 891 + */ 1122 892 mutex_lock(&pcpu_alloc_mutex); 1123 893 spin_lock_irq(&pcpu_lock); 1124 894 1125 - list_for_each_entry_safe(chunk, next, head, list) { 895 + list_for_each_entry_safe(chunk, next, free_head, list) { 1126 896 WARN_ON(chunk->immutable); 1127 897 1128 898 /* spare the first one */ 1129 - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 899 + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) 1130 900 continue; 1131 901 1132 - list_move(&chunk->list, &todo); 902 + list_move(&chunk->list, &to_free); 1133 903 } 1134 904 1135 905 spin_unlock_irq(&pcpu_lock); 1136 906 1137 - list_for_each_entry_safe(chunk, next, &todo, list) { 1138 - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 907 + list_for_each_entry_safe(chunk, next, &to_free, list) { 908 + int rs, re; 909 + 910 + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { 911 + pcpu_depopulate_chunk(chunk, rs, re); 912 + spin_lock_irq(&pcpu_lock); 913 + pcpu_chunk_depopulated(chunk, rs, re); 914 + spin_unlock_irq(&pcpu_lock); 915 + } 1139 916 pcpu_destroy_chunk(chunk); 917 + } 918 + 919 + /* 920 + * Ensure there are certain number of free populated pages for 921 + * atomic allocs. Fill up from the most packed so that atomic 922 + * allocs don't increase fragmentation. If atomic allocation 923 + * failed previously, always populate the maximum amount. 
This 924 + * should prevent atomic allocs larger than PAGE_SIZE from keeping 925 + * failing indefinitely; however, large atomic allocs are not 926 + * something we support properly and can be highly unreliable and 927 + * inefficient. 928 + */ 929 + retry_pop: 930 + if (pcpu_atomic_alloc_failed) { 931 + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; 932 + /* best effort anyway, don't worry about synchronization */ 933 + pcpu_atomic_alloc_failed = false; 934 + } else { 935 + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - 936 + pcpu_nr_empty_pop_pages, 937 + 0, PCPU_EMPTY_POP_PAGES_HIGH); 938 + } 939 + 940 + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { 941 + int nr_unpop = 0, rs, re; 942 + 943 + if (!nr_to_pop) 944 + break; 945 + 946 + spin_lock_irq(&pcpu_lock); 947 + list_for_each_entry(chunk, &pcpu_slot[slot], list) { 948 + nr_unpop = pcpu_unit_pages - chunk->nr_populated; 949 + if (nr_unpop) 950 + break; 951 + } 952 + spin_unlock_irq(&pcpu_lock); 953 + 954 + if (!nr_unpop) 955 + continue; 956 + 957 + /* @chunk can't go away while pcpu_alloc_mutex is held */ 958 + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { 959 + int nr = min(re - rs, nr_to_pop); 960 + 961 + ret = pcpu_populate_chunk(chunk, rs, rs + nr); 962 + if (!ret) { 963 + nr_to_pop -= nr; 964 + spin_lock_irq(&pcpu_lock); 965 + pcpu_chunk_populated(chunk, rs, rs + nr); 966 + spin_unlock_irq(&pcpu_lock); 967 + } else { 968 + nr_to_pop = 0; 969 + } 970 + 971 + if (!nr_to_pop) 972 + break; 973 + } 974 + } 975 + 976 + if (nr_to_pop) { 977 + /* ran out of chunks to populate, create a new one and retry */ 978 + chunk = pcpu_create_chunk(); 979 + if (chunk) { 980 + spin_lock_irq(&pcpu_lock); 981 + pcpu_chunk_relocate(chunk, -1); 982 + spin_unlock_irq(&pcpu_lock); 983 + goto retry_pop; 984 + } 1140 985 } 1141 986 1142 987 mutex_unlock(&pcpu_alloc_mutex); ··· 1233 924 void *addr; 1234 925 struct pcpu_chunk *chunk; 1235 926 unsigned long flags; 1236 - int off; 927 + int off, 
occ_pages; 1237 928 1238 929 if (!ptr) 1239 930 return; ··· 1247 938 chunk = pcpu_chunk_addr_search(addr); 1248 939 off = addr - chunk->base_addr; 1249 940 1250 - pcpu_free_area(chunk, off); 941 + pcpu_free_area(chunk, off, &occ_pages); 942 + 943 + if (chunk != pcpu_reserved_chunk) 944 + pcpu_nr_empty_pop_pages += occ_pages; 1251 945 1252 946 /* if there are more than one fully free chunks, wake up grim reaper */ 1253 947 if (chunk->free_size == pcpu_unit_size) { ··· 1258 946 1259 947 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1260 948 if (pos != chunk) { 1261 - schedule_work(&pcpu_reclaim_work); 949 + pcpu_schedule_balance_work(); 1262 950 break; 1263 951 } 1264 952 } ··· 1648 1336 */ 1649 1337 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1650 1338 INIT_LIST_HEAD(&schunk->list); 1339 + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); 1651 1340 schunk->base_addr = base_addr; 1652 1341 schunk->map = smap; 1653 1342 schunk->map_alloc = ARRAY_SIZE(smap); 1654 1343 schunk->immutable = true; 1655 1344 bitmap_fill(schunk->populated, pcpu_unit_pages); 1345 + schunk->nr_populated = pcpu_unit_pages; 1656 1346 1657 1347 if (ai->reserved_size) { 1658 1348 schunk->free_size = ai->reserved_size; ··· 1678 1364 if (dyn_size) { 1679 1365 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1680 1366 INIT_LIST_HEAD(&dchunk->list); 1367 + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); 1681 1368 dchunk->base_addr = base_addr; 1682 1369 dchunk->map = dmap; 1683 1370 dchunk->map_alloc = ARRAY_SIZE(dmap); 1684 1371 dchunk->immutable = true; 1685 1372 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1373 + dchunk->nr_populated = pcpu_unit_pages; 1686 1374 1687 1375 dchunk->contig_hint = dchunk->free_size = dyn_size; 1688 1376 dchunk->map[0] = 1; ··· 1695 1379 1696 1380 /* link the first chunk in */ 1697 1381 pcpu_first_chunk = dchunk ?: schunk; 1382 + pcpu_nr_empty_pop_pages += 1383 + pcpu_count_occupied_pages(pcpu_first_chunk, 
1); 1698 1384 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1699 1385 1700 1386 /* we're done */ ··· 2250 1932 2251 1933 if (pcpu_setup_first_chunk(ai, fc) < 0) 2252 1934 panic("Failed to initialize percpu areas."); 2253 - 2254 - pcpu_free_alloc_info(ai); 2255 1935 } 2256 1936 2257 1937 #endif /* CONFIG_SMP */ ··· 2283 1967 spin_unlock_irqrestore(&pcpu_lock, flags); 2284 1968 } 2285 1969 } 1970 + 1971 + /* 1972 + * Percpu allocator is initialized early during boot when neither slab or 1973 + * workqueue is available. Plug async management until everything is up 1974 + * and running. 1975 + */ 1976 + static int __init percpu_enable_async(void) 1977 + { 1978 + pcpu_async_enabled = true; 1979 + return 0; 1980 + } 1981 + subsys_initcall(percpu_enable_async);
+1 -1
mm/shmem.c
··· 2995 2995 #endif 2996 2996 2997 2997 spin_lock_init(&sbinfo->stat_lock); 2998 - if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2998 + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 2999 2999 goto failed; 3000 3000 sbinfo->free_inodes = sbinfo->max_inodes; 3001 3001
+1 -1
net/dccp/proto.c
··· 1115 1115 1116 1116 BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > 1117 1117 FIELD_SIZEOF(struct sk_buff, cb)); 1118 - rc = percpu_counter_init(&dccp_orphan_count, 0); 1118 + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); 1119 1119 if (rc) 1120 1120 goto out_fail; 1121 1121 rc = -ENOBUFS;
+2 -2
net/ipv4/tcp.c
··· 3071 3071 3072 3072 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3073 3073 3074 - percpu_counter_init(&tcp_sockets_allocated, 0); 3075 - percpu_counter_init(&tcp_orphan_count, 0); 3074 + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 3075 + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); 3076 3076 tcp_hashinfo.bind_bucket_cachep = 3077 3077 kmem_cache_create("tcp_bind_bucket", 3078 3078 sizeof(struct inet_bind_bucket), 0,
+1 -1
net/ipv4/tcp_memcontrol.c
··· 32 32 res_parent = &parent_cg->memory_allocated; 33 33 34 34 res_counter_init(&cg_proto->memory_allocated, res_parent); 35 - percpu_counter_init(&cg_proto->sockets_allocated, 0); 35 + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); 36 36 37 37 return 0; 38 38 }
+1 -1
net/sctp/protocol.c
··· 1341 1341 if (!sctp_chunk_cachep) 1342 1342 goto err_chunk_cachep; 1343 1343 1344 - status = percpu_counter_init(&sctp_sockets_allocated, 0); 1344 + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); 1345 1345 if (status) 1346 1346 goto err_percpu_counter_init; 1347 1347