Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

Pull percpu updates from Tejun Heo:
"A lot of activity on the percpu front. Notable changes are...

- percpu allocator now can take @gfp. If @gfp doesn't contain
GFP_KERNEL, it tries to allocate from what's already available to
the allocator and a work item tries to keep the reserve around
certain level so that these atomic allocations usually succeed.

This will replace the ad-hoc percpu memory pool used by
blk-throttle and also be used by the planned blkcg support for
writeback IOs.

Please note that I noticed a bug in how @gfp is interpreted while
preparing this pull request and applied the fix 6ae833c7fe0c
("percpu: fix how @gfp is interpreted by the percpu allocator")
just now.

- percpu_ref now uses longs for percpu and global counters instead of
ints. It leads to more sparse packing of the percpu counters on
64bit machines but the overhead should be negligible and this
allows using percpu_ref for refcounting pages and in-memory objects
directly.

- The switching between percpu and single counter modes of a
percpu_ref is made independent of putting the base ref and a
percpu_ref can now optionally be initialized in single or killed
mode. This allows avoiding percpu shutdown latency for cases where
the refcounted objects may be synchronously created and destroyed
in rapid succession with only a fraction of them reaching fully
operational status (SCSI probing does this when combined with
blk-mq support). It's also planned to be used to implement forced
single mode to detect underflow more timely for debugging.

There's a separate branch percpu/for-3.18-consistent-ops which cleans
up the duplicate percpu accessors. That branch causes a number of
conflicts with s390 and other trees. I'll send a separate pull
request w/ resolutions once other branches are merged"

* 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (33 commits)
percpu: fix how @gfp is interpreted by the percpu allocator
blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode
percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky
percpu_ref: add PERCPU_REF_INIT_* flags
percpu_ref: decouple switching to percpu mode and reinit
percpu_ref: decouple switching to atomic mode and killing
percpu_ref: add PCPU_REF_DEAD
percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch
percpu_ref: replace pcpu_ prefix with percpu_
percpu_ref: minor code and comment updates
percpu_ref: relocate percpu_ref_reinit()
Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"
Revert "percpu: free percpu allocation info for uniprocessor system"
percpu-refcount: make percpu_ref based on longs instead of ints
percpu-refcount: improve WARN messages
percpu: fix locking regression in the failure path of pcpu_alloc()
percpu-refcount: add @gfp to percpu_ref_init()
proportions: add @gfp to init functions
percpu_counter: add @gfp to percpu_counter_init()
percpu_counter: make percpu_counters_lock irq-safe
...

+883 -448
+1 -1
arch/x86/kvm/mmu.c
··· 4549 4549 if (!mmu_page_header_cache) 4550 4550 goto nomem; 4551 4551 4552 - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) 4552 + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) 4553 4553 goto nomem; 4554 4554 4555 4555 register_shrinker(&mmu_shrinker);
+6
block/blk-mq-sysfs.c
··· 402 402 } 403 403 } 404 404 405 + /* see blk_register_queue() */ 406 + void blk_mq_finish_init(struct request_queue *q) 407 + { 408 + percpu_ref_switch_to_percpu(&q->mq_usage_counter); 409 + } 410 + 405 411 int blk_mq_register_disk(struct gendisk *disk) 406 412 { 407 413 struct device *dev = disk_to_dev(disk);
+7 -11
block/blk-mq.c
··· 119 119 spin_unlock_irq(q->queue_lock); 120 120 121 121 if (freeze) { 122 - /* 123 - * XXX: Temporary kludge to work around SCSI blk-mq stall. 124 - * SCSI synchronously creates and destroys many queues 125 - * back-to-back during probe leading to lengthy stalls. 126 - * This will be fixed by keeping ->mq_usage_counter in 127 - * atomic mode until genhd registration, but, for now, 128 - * let's work around using expedited synchronization. 129 - */ 130 - __percpu_ref_kill_expedited(&q->mq_usage_counter); 131 - 122 + percpu_ref_kill(&q->mq_usage_counter); 132 123 blk_mq_run_queues(q, false); 133 124 } 134 125 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); ··· 1795 1804 if (!q) 1796 1805 goto err_hctxs; 1797 1806 1798 - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) 1807 + /* 1808 + * Init percpu_ref in atomic mode so that it's faster to shutdown. 1809 + * See blk_register_queue() for details. 1810 + */ 1811 + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, 1812 + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 1799 1813 goto err_map; 1800 1814 1801 1815 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+9 -2
block/blk-sysfs.c
··· 551 551 return -ENXIO; 552 552 553 553 /* 554 - * Initialization must be complete by now. Finish the initial 555 - * bypass from queue allocation. 554 + * SCSI probing may synchronously create and destroy a lot of 555 + * request_queues for non-existent devices. Shutting down a fully 556 + * functional queue takes measureable wallclock time as RCU grace 557 + * periods are involved. To avoid excessive latency in these 558 + * cases, a request_queue starts out in a degraded mode which is 559 + * faster to shut down and is made fully functional here as 560 + * request_queues for non-existent devices never get registered. 556 561 */ 557 562 if (!blk_queue_init_done(q)) { 558 563 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 559 564 blk_queue_bypass_end(q); 565 + if (q->mq_ops) 566 + blk_mq_finish_init(q); 560 567 } 561 568 562 569 ret = blk_trace_init_sysfs(dev);
+2 -1
drivers/target/target_core_tpg.c
··· 819 819 { 820 820 int ret; 821 821 822 - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); 822 + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, 823 + GFP_KERNEL); 823 824 if (ret < 0) 824 825 return ret; 825 826
+2 -2
fs/aio.c
··· 661 661 662 662 INIT_LIST_HEAD(&ctx->active_reqs); 663 663 664 - if (percpu_ref_init(&ctx->users, free_ioctx_users)) 664 + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) 665 665 goto err; 666 666 667 - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) 667 + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) 668 668 goto err; 669 669 670 670 ctx->cpu = alloc_percpu(struct kioctx_cpu);
+4 -4
fs/btrfs/disk-io.c
··· 1183 1183 if (!writers) 1184 1184 return ERR_PTR(-ENOMEM); 1185 1185 1186 - ret = percpu_counter_init(&writers->counter, 0); 1186 + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); 1187 1187 if (ret < 0) { 1188 1188 kfree(writers); 1189 1189 return ERR_PTR(ret); ··· 2188 2188 goto fail_srcu; 2189 2189 } 2190 2190 2191 - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); 2191 + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); 2192 2192 if (ret) { 2193 2193 err = ret; 2194 2194 goto fail_bdi; ··· 2196 2196 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * 2197 2197 (1 + ilog2(nr_cpu_ids)); 2198 2198 2199 - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); 2199 + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); 2200 2200 if (ret) { 2201 2201 err = ret; 2202 2202 goto fail_dirty_metadata_bytes; 2203 2203 } 2204 2204 2205 - ret = percpu_counter_init(&fs_info->bio_counter, 0); 2205 + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); 2206 2206 if (ret) { 2207 2207 err = ret; 2208 2208 goto fail_delalloc_bytes;
+1 -1
fs/btrfs/extent-tree.c
··· 3494 3494 if (!found) 3495 3495 return -ENOMEM; 3496 3496 3497 - ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3497 + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3498 3498 if (ret) { 3499 3499 kfree(found); 3500 3500 return ret;
+3 -3
fs/ext2/super.c
··· 1067 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1068 1068 1069 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1070 - ext2_count_free_blocks(sb)); 1070 + ext2_count_free_blocks(sb), GFP_KERNEL); 1071 1071 if (!err) { 1072 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1073 - ext2_count_free_inodes(sb)); 1073 + ext2_count_free_inodes(sb), GFP_KERNEL); 1074 1074 } 1075 1075 if (!err) { 1076 1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1077 - ext2_count_dirs(sb)); 1077 + ext2_count_dirs(sb), GFP_KERNEL); 1078 1078 } 1079 1079 if (err) { 1080 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
+3 -3
fs/ext3/super.c
··· 2039 2039 goto failed_mount2; 2040 2040 } 2041 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2042 - ext3_count_free_blocks(sb)); 2042 + ext3_count_free_blocks(sb), GFP_KERNEL); 2043 2043 if (!err) { 2044 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2045 - ext3_count_free_inodes(sb)); 2045 + ext3_count_free_inodes(sb), GFP_KERNEL); 2046 2046 } 2047 2047 if (!err) { 2048 2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2049 - ext3_count_dirs(sb)); 2049 + ext3_count_dirs(sb), GFP_KERNEL); 2050 2050 } 2051 2051 if (err) { 2052 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
+9 -5
fs/ext4/super.c
··· 3892 3892 /* Register extent status tree shrinker */ 3893 3893 ext4_es_register_shrinker(sbi); 3894 3894 3895 - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3895 + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); 3896 + if (err) { 3896 3897 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 3898 goto failed_mount3; 3898 3899 } ··· 4107 4106 block = ext4_count_free_clusters(sb); 4108 4107 ext4_free_blocks_count_set(sbi->s_es, 4109 4108 EXT4_C2B(sbi, block)); 4110 - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); 4109 + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 4110 + GFP_KERNEL); 4111 4111 if (!err) { 4112 4112 unsigned long freei = ext4_count_free_inodes(sb); 4113 4113 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4114 - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); 4114 + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 4115 + GFP_KERNEL); 4115 4116 } 4116 4117 if (!err) 4117 4118 err = percpu_counter_init(&sbi->s_dirs_counter, 4118 - ext4_count_dirs(sb)); 4119 + ext4_count_dirs(sb), GFP_KERNEL); 4119 4120 if (!err) 4120 - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); 4121 + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 4122 + GFP_KERNEL); 4121 4123 if (err) { 4122 4124 ext4_msg(sb, KERN_ERR, "insufficient memory"); 4123 4125 goto failed_mount6;
+1 -1
fs/file_table.c
··· 331 331 332 332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 333 333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 334 - percpu_counter_init(&nr_files, 0); 334 + percpu_counter_init(&nr_files, 0, GFP_KERNEL); 335 335 }
+1 -1
fs/quota/dquot.c
··· 2725 2725 panic("Cannot create dquot hash table"); 2726 2726 2727 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2728 - ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); 2729 2729 if (ret) 2730 2730 panic("Cannot create dquot stat counters"); 2731 2731 }
+2 -1
fs/super.c
··· 175 175 goto fail; 176 176 177 177 for (i = 0; i < SB_FREEZE_LEVELS; i++) { 178 - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) 178 + if (percpu_counter_init(&s->s_writers.counter[i], 0, 179 + GFP_KERNEL) < 0) 179 180 goto fail; 180 181 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], 181 182 &type->s_writers_key[i], 0);
+1
include/linux/blk-mq.h
··· 140 140 }; 141 141 142 142 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 143 + void blk_mq_finish_init(struct request_queue *q); 143 144 int blk_mq_register_disk(struct gendisk *); 144 145 void blk_mq_unregister_disk(struct gendisk *); 145 146
+3 -2
include/linux/flex_proportions.h
··· 10 10 #include <linux/percpu_counter.h> 11 11 #include <linux/spinlock.h> 12 12 #include <linux/seqlock.h> 13 + #include <linux/gfp.h> 13 14 14 15 /* 15 16 * When maximum proportion of some event type is specified, this is the ··· 33 32 seqcount_t sequence; 34 33 }; 35 34 36 - int fprop_global_init(struct fprop_global *p); 35 + int fprop_global_init(struct fprop_global *p, gfp_t gfp); 37 36 void fprop_global_destroy(struct fprop_global *p); 38 37 bool fprop_new_period(struct fprop_global *p, int periods); 39 38 ··· 80 79 raw_spinlock_t lock; /* Protect period and numerator */ 81 80 }; 82 81 83 - int fprop_local_init_percpu(struct fprop_local_percpu *pl); 82 + int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); 84 83 void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); 85 84 void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); 86 85 void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+80 -42
include/linux/percpu-refcount.h
··· 13 13 * 14 14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less 15 15 * than an atomic_t - this is because of the way shutdown works, see 16 - * percpu_ref_kill()/PCPU_COUNT_BIAS. 16 + * percpu_ref_kill()/PERCPU_COUNT_BIAS. 17 17 * 18 18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the 19 19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() ··· 49 49 #include <linux/kernel.h> 50 50 #include <linux/percpu.h> 51 51 #include <linux/rcupdate.h> 52 + #include <linux/gfp.h> 52 53 53 54 struct percpu_ref; 54 55 typedef void (percpu_ref_func_t)(struct percpu_ref *); 55 56 57 + /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ 58 + enum { 59 + __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ 60 + __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ 61 + __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, 62 + 63 + __PERCPU_REF_FLAG_BITS = 2, 64 + }; 65 + 66 + /* @flags for percpu_ref_init() */ 67 + enum { 68 + /* 69 + * Start w/ ref == 1 in atomic mode. Can be switched to percpu 70 + * operation using percpu_ref_switch_to_percpu(). If initialized 71 + * with this flag, the ref will stay in atomic mode until 72 + * percpu_ref_switch_to_percpu() is invoked on it. 73 + */ 74 + PERCPU_REF_INIT_ATOMIC = 1 << 0, 75 + 76 + /* 77 + * Start dead w/ ref == 0 in atomic mode. Must be revived with 78 + * percpu_ref_reinit() before used. Implies INIT_ATOMIC. 79 + */ 80 + PERCPU_REF_INIT_DEAD = 1 << 1, 81 + }; 82 + 56 83 struct percpu_ref { 57 - atomic_t count; 84 + atomic_long_t count; 58 85 /* 59 86 * The low bit of the pointer indicates whether the ref is in percpu 60 87 * mode; if set, then get/put will manipulate the atomic_t. 
61 88 */ 62 - unsigned long pcpu_count_ptr; 89 + unsigned long percpu_count_ptr; 63 90 percpu_ref_func_t *release; 64 - percpu_ref_func_t *confirm_kill; 91 + percpu_ref_func_t *confirm_switch; 92 + bool force_atomic:1; 65 93 struct rcu_head rcu; 66 94 }; 67 95 68 96 int __must_check percpu_ref_init(struct percpu_ref *ref, 69 - percpu_ref_func_t *release); 70 - void percpu_ref_reinit(struct percpu_ref *ref); 97 + percpu_ref_func_t *release, unsigned int flags, 98 + gfp_t gfp); 71 99 void percpu_ref_exit(struct percpu_ref *ref); 100 + void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 101 + percpu_ref_func_t *confirm_switch); 102 + void percpu_ref_switch_to_percpu(struct percpu_ref *ref); 72 103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 73 104 percpu_ref_func_t *confirm_kill); 74 - void __percpu_ref_kill_expedited(struct percpu_ref *ref); 105 + void percpu_ref_reinit(struct percpu_ref *ref); 75 106 76 107 /** 77 108 * percpu_ref_kill - drop the initial ref ··· 119 88 return percpu_ref_kill_and_confirm(ref, NULL); 120 89 } 121 90 122 - #define PCPU_REF_DEAD 1 123 - 124 91 /* 125 92 * Internal helper. Don't use outside percpu-refcount proper. The 126 93 * function doesn't return the pointer and let the caller test it for NULL 127 94 * because doing so forces the compiler to generate two conditional 128 - * branches as it can't assume that @ref->pcpu_count is not NULL. 95 + * branches as it can't assume that @ref->percpu_count is not NULL. 
129 96 */ 130 - static inline bool __pcpu_ref_alive(struct percpu_ref *ref, 131 - unsigned __percpu **pcpu_countp) 97 + static inline bool __ref_is_percpu(struct percpu_ref *ref, 98 + unsigned long __percpu **percpu_countp) 132 99 { 133 - unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); 100 + unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); 134 101 135 102 /* paired with smp_store_release() in percpu_ref_reinit() */ 136 103 smp_read_barrier_depends(); 137 104 138 - if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) 105 + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) 139 106 return false; 140 107 141 - *pcpu_countp = (unsigned __percpu *)pcpu_ptr; 108 + *percpu_countp = (unsigned long __percpu *)percpu_ptr; 142 109 return true; 143 110 } 144 111 ··· 144 115 * percpu_ref_get - increment a percpu refcount 145 116 * @ref: percpu_ref to get 146 117 * 147 - * Analagous to atomic_inc(). 148 - */ 118 + * Analagous to atomic_long_inc(). 119 + * 120 + * This function is safe to call as long as @ref is between init and exit. 121 + */ 149 122 static inline void percpu_ref_get(struct percpu_ref *ref) 150 123 { 151 - unsigned __percpu *pcpu_count; 124 + unsigned long __percpu *percpu_count; 152 125 153 126 rcu_read_lock_sched(); 154 127 155 - if (__pcpu_ref_alive(ref, &pcpu_count)) 156 - this_cpu_inc(*pcpu_count); 128 + if (__ref_is_percpu(ref, &percpu_count)) 129 + this_cpu_inc(*percpu_count); 157 130 else 158 - atomic_inc(&ref->count); 131 + atomic_long_inc(&ref->count); 159 132 160 133 rcu_read_unlock_sched(); 161 134 } ··· 169 138 * Increment a percpu refcount unless its count already reached zero. 170 139 * Returns %true on success; %false on failure. 171 140 * 172 - * The caller is responsible for ensuring that @ref stays accessible. 141 + * This function is safe to call as long as @ref is between init and exit. 
173 142 */ 174 143 static inline bool percpu_ref_tryget(struct percpu_ref *ref) 175 144 { 176 - unsigned __percpu *pcpu_count; 177 - int ret = false; 145 + unsigned long __percpu *percpu_count; 146 + int ret; 178 147 179 148 rcu_read_lock_sched(); 180 149 181 - if (__pcpu_ref_alive(ref, &pcpu_count)) { 182 - this_cpu_inc(*pcpu_count); 150 + if (__ref_is_percpu(ref, &percpu_count)) { 151 + this_cpu_inc(*percpu_count); 183 152 ret = true; 184 153 } else { 185 - ret = atomic_inc_not_zero(&ref->count); 154 + ret = atomic_long_inc_not_zero(&ref->count); 186 155 } 187 156 188 157 rcu_read_unlock_sched(); ··· 197 166 * Increment a percpu refcount unless it has already been killed. Returns 198 167 * %true on success; %false on failure. 199 168 * 200 - * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget 201 - * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be 202 - * used. After the confirm_kill callback is invoked, it's guaranteed that 203 - * no new reference will be given out by percpu_ref_tryget(). 169 + * Completion of percpu_ref_kill() in itself doesn't guarantee that this 170 + * function will fail. For such guarantee, percpu_ref_kill_and_confirm() 171 + * should be used. After the confirm_kill callback is invoked, it's 172 + * guaranteed that no new reference will be given out by 173 + * percpu_ref_tryget_live(). 204 174 * 205 - * The caller is responsible for ensuring that @ref stays accessible. 175 + * This function is safe to call as long as @ref is between init and exit. 
206 176 */ 207 177 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) 208 178 { 209 - unsigned __percpu *pcpu_count; 179 + unsigned long __percpu *percpu_count; 210 180 int ret = false; 211 181 212 182 rcu_read_lock_sched(); 213 183 214 - if (__pcpu_ref_alive(ref, &pcpu_count)) { 215 - this_cpu_inc(*pcpu_count); 184 + if (__ref_is_percpu(ref, &percpu_count)) { 185 + this_cpu_inc(*percpu_count); 216 186 ret = true; 187 + } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { 188 + ret = atomic_long_inc_not_zero(&ref->count); 217 189 } 218 190 219 191 rcu_read_unlock_sched(); ··· 230 196 * 231 197 * Decrement the refcount, and if 0, call the release function (which was passed 232 198 * to percpu_ref_init()) 199 + * 200 + * This function is safe to call as long as @ref is between init and exit. 233 201 */ 234 202 static inline void percpu_ref_put(struct percpu_ref *ref) 235 203 { 236 - unsigned __percpu *pcpu_count; 204 + unsigned long __percpu *percpu_count; 237 205 238 206 rcu_read_lock_sched(); 239 207 240 - if (__pcpu_ref_alive(ref, &pcpu_count)) 241 - this_cpu_dec(*pcpu_count); 242 - else if (unlikely(atomic_dec_and_test(&ref->count))) 208 + if (__ref_is_percpu(ref, &percpu_count)) 209 + this_cpu_dec(*percpu_count); 210 + else if (unlikely(atomic_long_dec_and_test(&ref->count))) 243 211 ref->release(ref); 244 212 245 213 rcu_read_unlock_sched(); ··· 252 216 * @ref: percpu_ref to test 253 217 * 254 218 * Returns %true if @ref reached zero. 219 + * 220 + * This function is safe to call as long as @ref is between init and exit. 255 221 */ 256 222 static inline bool percpu_ref_is_zero(struct percpu_ref *ref) 257 223 { 258 - unsigned __percpu *pcpu_count; 224 + unsigned long __percpu *percpu_count; 259 225 260 - if (__pcpu_ref_alive(ref, &pcpu_count)) 226 + if (__ref_is_percpu(ref, &percpu_count)) 261 227 return false; 262 - return !atomic_read(&ref->count); 228 + return !atomic_long_read(&ref->count); 263 229 } 264 230 265 231 #endif
+9 -4
include/linux/percpu.h
··· 48 48 * intelligent way to determine this would be nice. 49 49 */ 50 50 #if BITS_PER_LONG > 32 51 - #define PERCPU_DYNAMIC_RESERVE (20 << 10) 51 + #define PERCPU_DYNAMIC_RESERVE (28 << 10) 52 52 #else 53 - #define PERCPU_DYNAMIC_RESERVE (12 << 10) 53 + #define PERCPU_DYNAMIC_RESERVE (20 << 10) 54 54 #endif 55 55 56 56 extern void *pcpu_base_addr; ··· 122 122 #endif 123 123 extern void __init percpu_init_late(void); 124 124 125 + extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); 125 126 extern void __percpu *__alloc_percpu(size_t size, size_t align); 126 127 extern void free_percpu(void __percpu *__pdata); 127 128 extern phys_addr_t per_cpu_ptr_to_phys(void *addr); 128 129 129 - #define alloc_percpu(type) \ 130 - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) 130 + #define alloc_percpu_gfp(type, gfp) \ 131 + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ 132 + __alignof__(type), gfp) 133 + #define alloc_percpu(type) \ 134 + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ 135 + __alignof__(type)) 131 136 132 137 #endif /* __LINUX_PERCPU_H */
+6 -4
include/linux/percpu_counter.h
··· 12 12 #include <linux/threads.h> 13 13 #include <linux/percpu.h> 14 14 #include <linux/types.h> 15 + #include <linux/gfp.h> 15 16 16 17 #ifdef CONFIG_SMP 17 18 ··· 27 26 28 27 extern int percpu_counter_batch; 29 28 30 - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 29 + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, 31 30 struct lock_class_key *key); 32 31 33 - #define percpu_counter_init(fbc, value) \ 32 + #define percpu_counter_init(fbc, value, gfp) \ 34 33 ({ \ 35 34 static struct lock_class_key __key; \ 36 35 \ 37 - __percpu_counter_init(fbc, value, &__key); \ 36 + __percpu_counter_init(fbc, value, gfp, &__key); \ 38 37 }) 39 38 40 39 void percpu_counter_destroy(struct percpu_counter *fbc); ··· 90 89 s64 count; 91 90 }; 92 91 93 - static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) 92 + static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, 93 + gfp_t gfp) 94 94 { 95 95 fbc->count = amount; 96 96 return 0;
+3 -2
include/linux/proportions.h
··· 12 12 #include <linux/percpu_counter.h> 13 13 #include <linux/spinlock.h> 14 14 #include <linux/mutex.h> 15 + #include <linux/gfp.h> 15 16 16 17 struct prop_global { 17 18 /* ··· 41 40 struct mutex mutex; /* serialize the prop_global switch */ 42 41 }; 43 42 44 - int prop_descriptor_init(struct prop_descriptor *pd, int shift); 43 + int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); 45 44 void prop_change_shift(struct prop_descriptor *pd, int new_shift); 46 45 47 46 /* ··· 62 61 raw_spinlock_t lock; /* protect the snapshot state */ 63 62 }; 64 63 65 - int prop_local_init_percpu(struct prop_local_percpu *pl); 64 + int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); 66 65 void prop_local_destroy_percpu(struct prop_local_percpu *pl); 67 66 void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); 68 67 void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
+1 -1
include/net/dst_ops.h
··· 63 63 64 64 static inline int dst_entries_init(struct dst_ops *dst) 65 65 { 66 - return percpu_counter_init(&dst->pcpuc_entries, 0); 66 + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); 67 67 } 68 68 69 69 static inline void dst_entries_destroy(struct dst_ops *dst)
+1 -1
include/net/inet_frag.h
··· 151 151 152 152 static inline void init_frag_mem_limit(struct netns_frags *nf) 153 153 { 154 - percpu_counter_init(&nf->mem, 0); 154 + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); 155 155 } 156 156 157 157 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
+4 -3
kernel/cgroup.c
··· 1607 1607 goto out; 1608 1608 root_cgrp->id = ret; 1609 1609 1610 - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1610 + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, 1611 + GFP_KERNEL); 1611 1612 if (ret) 1612 1613 goto out; 1613 1614 ··· 4483 4482 4484 4483 init_and_link_css(css, ss, cgrp); 4485 4484 4486 - err = percpu_ref_init(&css->refcnt, css_release); 4485 + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); 4487 4486 if (err) 4488 4487 goto err_free_css; 4489 4488 ··· 4556 4555 goto out_unlock; 4557 4556 } 4558 4557 4559 - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4558 + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); 4560 4559 if (ret) 4561 4560 goto out_free_cgrp; 4562 4561
+4 -4
lib/flex_proportions.c
··· 34 34 */ 35 35 #include <linux/flex_proportions.h> 36 36 37 - int fprop_global_init(struct fprop_global *p) 37 + int fprop_global_init(struct fprop_global *p, gfp_t gfp) 38 38 { 39 39 int err; 40 40 41 41 p->period = 0; 42 42 /* Use 1 to avoid dealing with periods with 0 events... */ 43 - err = percpu_counter_init(&p->events, 1); 43 + err = percpu_counter_init(&p->events, 1, gfp); 44 44 if (err) 45 45 return err; 46 46 seqcount_init(&p->sequence); ··· 168 168 */ 169 169 #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 170 170 171 - int fprop_local_init_percpu(struct fprop_local_percpu *pl) 171 + int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) 172 172 { 173 173 int err; 174 174 175 - err = percpu_counter_init(&pl->events, 0); 175 + err = percpu_counter_init(&pl->events, 0, gfp); 176 176 if (err) 177 177 return err; 178 178 pl->period = 0;
+218 -89
lib/percpu-refcount.c
··· 1 1 #define pr_fmt(fmt) "%s: " fmt "\n", __func__ 2 2 3 3 #include <linux/kernel.h> 4 + #include <linux/sched.h> 5 + #include <linux/wait.h> 4 6 #include <linux/percpu-refcount.h> 5 7 6 8 /* ··· 13 11 * percpu counters will all sum to the correct value 14 12 * 15 13 * (More precisely: because moduler arithmatic is commutative the sum of all the 16 - * pcpu_count vars will be equal to what it would have been if all the gets and 17 - * puts were done to a single integer, even if some of the percpu integers 14 + * percpu_count vars will be equal to what it would have been if all the gets 15 + * and puts were done to a single integer, even if some of the percpu integers 18 16 * overflow or underflow). 19 17 * 20 18 * The real trick to implementing percpu refcounts is shutdown. We can't detect ··· 27 25 * works. 28 26 * 29 27 * Converting to non percpu mode is done with some RCUish stuff in 30 - * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t 31 - * can't hit 0 before we've added up all the percpu refs. 28 + * percpu_ref_kill. Additionally, we need a bias value so that the 29 + * atomic_long_t can't hit 0 before we've added up all the percpu refs. 
32 30 */ 33 31 34 - #define PCPU_COUNT_BIAS (1U << 31) 32 + #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) 35 33 36 - static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) 34 + static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); 35 + 36 + static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) 37 37 { 38 - return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 38 + return (unsigned long __percpu *) 39 + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); 39 40 } 40 41 41 42 /** 42 43 * percpu_ref_init - initialize a percpu refcount 43 44 * @ref: percpu_ref to initialize 44 45 * @release: function which will be called when refcount hits 0 46 + * @flags: PERCPU_REF_INIT_* flags 47 + * @gfp: allocation mask to use 45 48 * 46 - * Initializes the refcount in single atomic counter mode with a refcount of 1; 47 - * analagous to atomic_set(ref, 1). 49 + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a 50 + * refcount of 1; analagous to atomic_long_set(ref, 1). See the 51 + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. 48 52 * 49 53 * Note that @release must not sleep - it may potentially be called from RCU 50 54 * callback context by percpu_ref_kill(). 
51 55 */ 52 - int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) 56 + int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, 57 + unsigned int flags, gfp_t gfp) 53 58 { 54 - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 59 + size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, 60 + __alignof__(unsigned long)); 61 + unsigned long start_count = 0; 55 62 56 - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); 57 - if (!ref->pcpu_count_ptr) 63 + ref->percpu_count_ptr = (unsigned long) 64 + __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); 65 + if (!ref->percpu_count_ptr) 58 66 return -ENOMEM; 67 + 68 + ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; 69 + 70 + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) 71 + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; 72 + else 73 + start_count += PERCPU_COUNT_BIAS; 74 + 75 + if (flags & PERCPU_REF_INIT_DEAD) 76 + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; 77 + else 78 + start_count++; 79 + 80 + atomic_long_set(&ref->count, start_count); 59 81 60 82 ref->release = release; 61 83 return 0; 62 84 } 63 85 EXPORT_SYMBOL_GPL(percpu_ref_init); 64 - 65 - /** 66 - * percpu_ref_reinit - re-initialize a percpu refcount 67 - * @ref: perpcu_ref to re-initialize 68 - * 69 - * Re-initialize @ref so that it's in the same state as when it finished 70 - * percpu_ref_init(). @ref must have been initialized successfully, killed 71 - * and reached 0 but not exited. 72 - * 73 - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while 74 - * this function is in progress. 75 - */ 76 - void percpu_ref_reinit(struct percpu_ref *ref) 77 - { 78 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 79 - int cpu; 80 - 81 - BUG_ON(!pcpu_count); 82 - WARN_ON(!percpu_ref_is_zero(ref)); 83 - 84 - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 85 - 86 - /* 87 - * Restore per-cpu operation. 
smp_store_release() is paired with 88 - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees 89 - * that the zeroing is visible to all percpu accesses which can see 90 - * the following PCPU_REF_DEAD clearing. 91 - */ 92 - for_each_possible_cpu(cpu) 93 - *per_cpu_ptr(pcpu_count, cpu) = 0; 94 - 95 - smp_store_release(&ref->pcpu_count_ptr, 96 - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 97 - } 98 - EXPORT_SYMBOL_GPL(percpu_ref_reinit); 99 86 100 87 /** 101 88 * percpu_ref_exit - undo percpu_ref_init() ··· 98 107 */ 99 108 void percpu_ref_exit(struct percpu_ref *ref) 100 109 { 101 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 110 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 102 111 103 - if (pcpu_count) { 104 - free_percpu(pcpu_count); 105 - ref->pcpu_count_ptr = PCPU_REF_DEAD; 112 + if (percpu_count) { 113 + free_percpu(percpu_count); 114 + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; 106 115 } 107 116 } 108 117 EXPORT_SYMBOL_GPL(percpu_ref_exit); 109 118 110 - static void percpu_ref_kill_rcu(struct rcu_head *rcu) 119 + static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) 111 120 { 112 121 struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); 113 - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 114 - unsigned count = 0; 122 + 123 + ref->confirm_switch(ref); 124 + ref->confirm_switch = NULL; 125 + wake_up_all(&percpu_ref_switch_waitq); 126 + 127 + /* drop ref from percpu_ref_switch_to_atomic() */ 128 + percpu_ref_put(ref); 129 + } 130 + 131 + static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) 132 + { 133 + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); 134 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 135 + unsigned long count = 0; 115 136 int cpu; 116 137 117 138 for_each_possible_cpu(cpu) 118 - count += *per_cpu_ptr(pcpu_count, cpu); 139 + count += *per_cpu_ptr(percpu_count, cpu); 119 140 120 - pr_debug("global %i pcpu %i", 
atomic_read(&ref->count), (int) count); 141 + pr_debug("global %ld percpu %ld", 142 + atomic_long_read(&ref->count), (long)count); 121 143 122 144 /* 123 145 * It's crucial that we sum the percpu counters _before_ adding the sum ··· 144 140 * reaching 0 before we add the percpu counts. But doing it at the same 145 141 * time is equivalent and saves us atomic operations: 146 142 */ 143 + atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); 147 144 148 - atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); 145 + WARN_ONCE(atomic_long_read(&ref->count) <= 0, 146 + "percpu ref (%pf) <= 0 (%ld) after switching to atomic", 147 + ref->release, atomic_long_read(&ref->count)); 149 148 150 - WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", 151 - atomic_read(&ref->count)); 149 + /* @ref is viewed as dead on all CPUs, send out switch confirmation */ 150 + percpu_ref_call_confirm_rcu(rcu); 151 + } 152 152 153 - /* @ref is viewed as dead on all CPUs, send out kill confirmation */ 154 - if (ref->confirm_kill) 155 - ref->confirm_kill(ref); 153 + static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) 154 + { 155 + } 156 + 157 + static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, 158 + percpu_ref_func_t *confirm_switch) 159 + { 160 + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { 161 + /* switching from percpu to atomic */ 162 + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; 163 + 164 + /* 165 + * Non-NULL ->confirm_switch is used to indicate that 166 + * switching is in progress. Use noop one if unspecified. 167 + */ 168 + WARN_ON_ONCE(ref->confirm_switch); 169 + ref->confirm_switch = 170 + confirm_switch ?: percpu_ref_noop_confirm_switch; 171 + 172 + percpu_ref_get(ref); /* put after confirmation */ 173 + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); 174 + } else if (confirm_switch) { 175 + /* 176 + * Somebody already set ATOMIC. Switching may still be in 177 + * progress. 
@confirm_switch must be invoked after the 178 + * switching is complete and a full sched RCU grace period 179 + * has passed. Wait synchronously for the previous 180 + * switching and schedule @confirm_switch invocation. 181 + */ 182 + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); 183 + ref->confirm_switch = confirm_switch; 184 + 185 + percpu_ref_get(ref); /* put after confirmation */ 186 + call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); 187 + } 188 + } 189 + 190 + /** 191 + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode 192 + * @ref: percpu_ref to switch to atomic mode 193 + * @confirm_switch: optional confirmation callback 194 + * 195 + * There's no reason to use this function for the usual reference counting. 196 + * Use percpu_ref_kill[_and_confirm](). 197 + * 198 + * Schedule switching of @ref to atomic mode. All its percpu counts will 199 + * be collected to the main atomic counter. On completion, when all CPUs 200 + * are guaraneed to be in atomic mode, @confirm_switch, which may not 201 + * block, is invoked. This function may be invoked concurrently with all 202 + * the get/put operations and can safely be mixed with kill and reinit 203 + * operations. Note that @ref will stay in atomic mode across kill/reinit 204 + * cycles until percpu_ref_switch_to_percpu() is called. 205 + * 206 + * This function normally doesn't block and can be called from any context 207 + * but it may block if @confirm_kill is specified and @ref is already in 208 + * the process of switching to atomic mode. In such cases, @confirm_switch 209 + * will be invoked after the switching is complete. 210 + * 211 + * Due to the way percpu_ref is implemented, @confirm_switch will be called 212 + * after at least one full sched RCU grace period has passed but this is an 213 + * implementation detail and must not be depended upon. 
214 + */ 215 + void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 216 + percpu_ref_func_t *confirm_switch) 217 + { 218 + ref->force_atomic = true; 219 + __percpu_ref_switch_to_atomic(ref, confirm_switch); 220 + } 221 + 222 + static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) 223 + { 224 + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); 225 + int cpu; 226 + 227 + BUG_ON(!percpu_count); 228 + 229 + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) 230 + return; 231 + 232 + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); 233 + 234 + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); 156 235 157 236 /* 158 - * Now we're in single atomic_t mode with a consistent refcount, so it's 159 - * safe to drop our initial ref: 237 + * Restore per-cpu operation. smp_store_release() is paired with 238 + * smp_read_barrier_depends() in __ref_is_percpu() and guarantees 239 + * that the zeroing is visible to all percpu accesses which can see 240 + * the following __PERCPU_REF_ATOMIC clearing. 160 241 */ 161 - percpu_ref_put(ref); 242 + for_each_possible_cpu(cpu) 243 + *per_cpu_ptr(percpu_count, cpu) = 0; 244 + 245 + smp_store_release(&ref->percpu_count_ptr, 246 + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); 247 + } 248 + 249 + /** 250 + * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode 251 + * @ref: percpu_ref to switch to percpu mode 252 + * 253 + * There's no reason to use this function for the usual reference counting. 254 + * To re-use an expired ref, use percpu_ref_reinit(). 255 + * 256 + * Switch @ref to percpu mode. This function may be invoked concurrently 257 + * with all the get/put operations and can safely be mixed with kill and 258 + * reinit operations. This function reverses the sticky atomic state set 259 + * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is 260 + * dying or dead, the actual switching takes place on the following 261 + * percpu_ref_reinit(). 
262 + * 263 + * This function normally doesn't block and can be called from any context 264 + * but it may block if @ref is in the process of switching to atomic mode 265 + * by percpu_ref_switch_atomic(). 266 + */ 267 + void percpu_ref_switch_to_percpu(struct percpu_ref *ref) 268 + { 269 + ref->force_atomic = false; 270 + 271 + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ 272 + if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) 273 + __percpu_ref_switch_to_percpu(ref); 162 274 } 163 275 164 276 /** ··· 284 164 * 285 165 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if 286 166 * @confirm_kill is not NULL. @confirm_kill, which may not block, will be 287 - * called after @ref is seen as dead from all CPUs - all further 288 - * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() 289 - * for more details. 167 + * called after @ref is seen as dead from all CPUs at which point all 168 + * further invocations of percpu_ref_tryget_live() will fail. See 169 + * percpu_ref_tryget_live() for details. 290 170 * 291 - * Due to the way percpu_ref is implemented, @confirm_kill will be called 292 - * after at least one full RCU grace period has passed but this is an 293 - * implementation detail and callers must not depend on it. 171 + * This function normally doesn't block and can be called from any context 172 + * but it may block if @confirm_kill is specified and @ref is in the 173 + * process of switching to atomic mode by percpu_ref_switch_atomic(). 174 + * 175 + * Due to the way percpu_ref is implemented, @confirm_switch will be called 176 + * after at least one full sched RCU grace period has passed but this is an 177 + * implementation detail and must not be depended upon. 
294 178 */ 295 179 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 296 180 percpu_ref_func_t *confirm_kill) 297 181 { 298 - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 299 - "percpu_ref_kill() called more than once!\n"); 182 + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, 183 + "%s called more than once on %pf!", __func__, ref->release); 300 184 301 - ref->pcpu_count_ptr |= PCPU_REF_DEAD; 302 - ref->confirm_kill = confirm_kill; 303 - 304 - call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); 185 + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; 186 + __percpu_ref_switch_to_atomic(ref, confirm_kill); 187 + percpu_ref_put(ref); 305 188 } 306 189 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); 307 190 308 - /* 309 - * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by 310 - * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 311 - * devel cycle. Do not use anywhere else. 191 + /** 192 + * percpu_ref_reinit - re-initialize a percpu refcount 193 + * @ref: perpcu_ref to re-initialize 194 + * 195 + * Re-initialize @ref so that it's in the same state as when it finished 196 + * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been 197 + * initialized successfully and reached 0 but not exited. 198 + * 199 + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while 200 + * this function is in progress. 
312 201 */ 313 - void __percpu_ref_kill_expedited(struct percpu_ref *ref) 202 + void percpu_ref_reinit(struct percpu_ref *ref) 314 203 { 315 - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 316 - "percpu_ref_kill() called more than once on %pf!", 317 - ref->release); 204 + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); 318 205 319 - ref->pcpu_count_ptr |= PCPU_REF_DEAD; 320 - synchronize_sched_expedited(); 321 - percpu_ref_kill_rcu(&ref->rcu); 206 + ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; 207 + percpu_ref_get(ref); 208 + if (!ref->force_atomic) 209 + __percpu_ref_switch_to_percpu(ref); 322 210 } 211 + EXPORT_SYMBOL_GPL(percpu_ref_reinit);
+12 -8
lib/percpu_counter.c
··· 112 112 } 113 113 EXPORT_SYMBOL(__percpu_counter_sum); 114 114 115 - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 115 + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, 116 116 struct lock_class_key *key) 117 117 { 118 + unsigned long flags __maybe_unused; 119 + 118 120 raw_spin_lock_init(&fbc->lock); 119 121 lockdep_set_class(&fbc->lock, key); 120 122 fbc->count = amount; 121 - fbc->counters = alloc_percpu(s32); 123 + fbc->counters = alloc_percpu_gfp(s32, gfp); 122 124 if (!fbc->counters) 123 125 return -ENOMEM; 124 126 ··· 128 126 129 127 #ifdef CONFIG_HOTPLUG_CPU 130 128 INIT_LIST_HEAD(&fbc->list); 131 - spin_lock(&percpu_counters_lock); 129 + spin_lock_irqsave(&percpu_counters_lock, flags); 132 130 list_add(&fbc->list, &percpu_counters); 133 - spin_unlock(&percpu_counters_lock); 131 + spin_unlock_irqrestore(&percpu_counters_lock, flags); 134 132 #endif 135 133 return 0; 136 134 } ··· 138 136 139 137 void percpu_counter_destroy(struct percpu_counter *fbc) 140 138 { 139 + unsigned long flags __maybe_unused; 140 + 141 141 if (!fbc->counters) 142 142 return; 143 143 144 144 debug_percpu_counter_deactivate(fbc); 145 145 146 146 #ifdef CONFIG_HOTPLUG_CPU 147 - spin_lock(&percpu_counters_lock); 147 + spin_lock_irqsave(&percpu_counters_lock, flags); 148 148 list_del(&fbc->list); 149 - spin_unlock(&percpu_counters_lock); 149 + spin_unlock_irqrestore(&percpu_counters_lock, flags); 150 150 #endif 151 151 free_percpu(fbc->counters); 152 152 fbc->counters = NULL; ··· 177 173 return NOTIFY_OK; 178 174 179 175 cpu = (unsigned long)hcpu; 180 - spin_lock(&percpu_counters_lock); 176 + spin_lock_irq(&percpu_counters_lock); 181 177 list_for_each_entry(fbc, &percpu_counters, list) { 182 178 s32 *pcount; 183 179 unsigned long flags; ··· 188 184 *pcount = 0; 189 185 raw_spin_unlock_irqrestore(&fbc->lock, flags); 190 186 } 191 - spin_unlock(&percpu_counters_lock); 187 + spin_unlock_irq(&percpu_counters_lock); 192 188 #endif 193 189 
return NOTIFY_OK; 194 190 }
+5 -5
lib/proportions.c
··· 73 73 #include <linux/proportions.h> 74 74 #include <linux/rcupdate.h> 75 75 76 - int prop_descriptor_init(struct prop_descriptor *pd, int shift) 76 + int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) 77 77 { 78 78 int err; 79 79 ··· 83 83 pd->index = 0; 84 84 pd->pg[0].shift = shift; 85 85 mutex_init(&pd->mutex); 86 - err = percpu_counter_init(&pd->pg[0].events, 0); 86 + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); 87 87 if (err) 88 88 goto out; 89 89 90 - err = percpu_counter_init(&pd->pg[1].events, 0); 90 + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); 91 91 if (err) 92 92 percpu_counter_destroy(&pd->pg[0].events); 93 93 ··· 188 188 189 189 #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 190 190 191 - int prop_local_init_percpu(struct prop_local_percpu *pl) 191 + int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) 192 192 { 193 193 raw_spin_lock_init(&pl->lock); 194 194 pl->shift = 0; 195 195 pl->period = 0; 196 - return percpu_counter_init(&pl->events, 0); 196 + return percpu_counter_init(&pl->events, 0, gfp); 197 197 } 198 198 199 199 void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+2 -2
mm/backing-dev.c
··· 455 455 bdi_wb_init(&bdi->wb, bdi); 456 456 457 457 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 458 - err = percpu_counter_init(&bdi->bdi_stat[i], 0); 458 + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); 459 459 if (err) 460 460 goto err; 461 461 } ··· 470 470 bdi->write_bandwidth = INIT_BW; 471 471 bdi->avg_write_bandwidth = INIT_BW; 472 472 473 - err = fprop_local_init_percpu(&bdi->completions); 473 + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); 474 474 475 475 if (err) { 476 476 err:
+1 -1
mm/mmap.c
··· 3202 3202 { 3203 3203 int ret; 3204 3204 3205 - ret = percpu_counter_init(&vm_committed_as, 0); 3205 + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 3206 3206 VM_BUG_ON(ret); 3207 3207 } 3208 3208
+1 -1
mm/nommu.c
··· 539 539 { 540 540 int ret; 541 541 542 - ret = percpu_counter_init(&vm_committed_as, 0); 542 + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); 543 543 VM_BUG_ON(ret); 544 544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 545 545 }
+1 -1
mm/page-writeback.c
··· 1777 1777 writeback_set_ratelimit(); 1778 1778 register_cpu_notifier(&ratelimit_nb); 1779 1779 1780 - fprop_global_init(&writeout_completions); 1780 + fprop_global_init(&writeout_completions, GFP_KERNEL); 1781 1781 } 1782 1782 1783 1783 /**
+9 -7
mm/percpu-km.c
··· 33 33 34 34 #include <linux/log2.h> 35 35 36 - static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36 + static int pcpu_populate_chunk(struct pcpu_chunk *chunk, 37 + int page_start, int page_end) 37 38 { 38 - unsigned int cpu; 39 - 40 - for_each_possible_cpu(cpu) 41 - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 42 - 43 39 return 0; 44 40 } 45 41 46 - static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 42 + static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, 43 + int page_start, int page_end) 47 44 { 48 45 /* nada */ 49 46 } ··· 67 70 68 71 chunk->data = pages; 69 72 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 73 + 74 + spin_lock_irq(&pcpu_lock); 75 + pcpu_chunk_populated(chunk, 0, nr_pages); 76 + spin_unlock_irq(&pcpu_lock); 77 + 70 78 return chunk; 71 79 } 72 80
+35 -127
mm/percpu-vm.c
··· 20 20 } 21 21 22 22 /** 23 - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap 23 + * pcpu_get_pages - get temp pages array 24 24 * @chunk: chunk of interest 25 - * @bitmapp: output parameter for bitmap 26 - * @may_alloc: may allocate the array 27 25 * 28 - * Returns pointer to array of pointers to struct page and bitmap, 29 - * both of which can be indexed with pcpu_page_idx(). The returned 30 - * array is cleared to zero and *@bitmapp is copied from 31 - * @chunk->populated. Note that there is only one array and bitmap 32 - * and access exclusion is the caller's responsibility. 33 - * 34 - * CONTEXT: 35 - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. 36 - * Otherwise, don't care. 26 + * Returns pointer to array of pointers to struct page which can be indexed 27 + * with pcpu_page_idx(). Note that there is only one array and accesses 28 + * should be serialized by pcpu_alloc_mutex. 37 29 * 38 30 * RETURNS: 39 - * Pointer to temp pages array on success, NULL on failure. 31 + * Pointer to temp pages array on success. 
40 32 */ 41 - static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, 42 - unsigned long **bitmapp, 43 - bool may_alloc) 33 + static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) 44 34 { 45 35 static struct page **pages; 46 - static unsigned long *bitmap; 47 36 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 48 - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * 49 - sizeof(unsigned long); 50 37 51 - if (!pages || !bitmap) { 52 - if (may_alloc && !pages) 53 - pages = pcpu_mem_zalloc(pages_size); 54 - if (may_alloc && !bitmap) 55 - bitmap = pcpu_mem_zalloc(bitmap_size); 56 - if (!pages || !bitmap) 57 - return NULL; 58 - } 38 + lockdep_assert_held(&pcpu_alloc_mutex); 59 39 60 - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); 61 - 62 - *bitmapp = bitmap; 40 + if (!pages) 41 + pages = pcpu_mem_zalloc(pages_size); 63 42 return pages; 64 43 } 65 44 ··· 46 67 * pcpu_free_pages - free pages which were allocated for @chunk 47 68 * @chunk: chunk pages were allocated for 48 69 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 49 - * @populated: populated bitmap 50 70 * @page_start: page index of the first page to be freed 51 71 * @page_end: page index of the last page to be freed + 1 52 72 * ··· 53 75 * The pages were allocated for @chunk. 
54 76 */ 55 77 static void pcpu_free_pages(struct pcpu_chunk *chunk, 56 - struct page **pages, unsigned long *populated, 57 - int page_start, int page_end) 78 + struct page **pages, int page_start, int page_end) 58 79 { 59 80 unsigned int cpu; 60 81 int i; ··· 72 95 * pcpu_alloc_pages - allocates pages for @chunk 73 96 * @chunk: target chunk 74 97 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 75 - * @populated: populated bitmap 76 98 * @page_start: page index of the first page to be allocated 77 99 * @page_end: page index of the last page to be allocated + 1 78 100 * ··· 80 104 * content of @pages and will pass it verbatim to pcpu_map_pages(). 81 105 */ 82 106 static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 83 - struct page **pages, unsigned long *populated, 84 - int page_start, int page_end) 107 + struct page **pages, int page_start, int page_end) 85 108 { 86 109 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 87 110 unsigned int cpu, tcpu; ··· 139 164 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 140 165 * @chunk: chunk of interest 141 166 * @pages: pages array which can be used to pass information to free 142 - * @populated: populated bitmap 143 167 * @page_start: page index of the first page to unmap 144 168 * @page_end: page index of the last page to unmap + 1 145 169 * ··· 149 175 * proper pre/post flush functions. 
150 176 */ 151 177 static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 152 - struct page **pages, unsigned long *populated, 153 - int page_start, int page_end) 178 + struct page **pages, int page_start, int page_end) 154 179 { 155 180 unsigned int cpu; 156 181 int i; ··· 165 192 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 166 193 page_end - page_start); 167 194 } 168 - 169 - bitmap_clear(populated, page_start, page_end - page_start); 170 195 } 171 196 172 197 /** ··· 199 228 * pcpu_map_pages - map pages into a pcpu_chunk 200 229 * @chunk: chunk of interest 201 230 * @pages: pages array containing pages to be mapped 202 - * @populated: populated bitmap 203 231 * @page_start: page index of the first page to map 204 232 * @page_end: page index of the last page to map + 1 205 233 * ··· 206 236 * caller is responsible for calling pcpu_post_map_flush() after all 207 237 * mappings are complete. 208 238 * 209 - * This function is responsible for setting corresponding bits in 210 - * @chunk->populated bitmap and whatever is necessary for reverse 211 - * lookup (addr -> chunk). 239 + * This function is responsible for setting up whatever is necessary for 240 + * reverse lookup (addr -> chunk). 
212 241 */ 213 242 static int pcpu_map_pages(struct pcpu_chunk *chunk, 214 - struct page **pages, unsigned long *populated, 215 - int page_start, int page_end) 243 + struct page **pages, int page_start, int page_end) 216 244 { 217 245 unsigned int cpu, tcpu; 218 246 int i, err; ··· 221 253 page_end - page_start); 222 254 if (err < 0) 223 255 goto err; 224 - } 225 256 226 - /* mapping successful, link chunk and mark populated */ 227 - for (i = page_start; i < page_end; i++) { 228 - for_each_possible_cpu(cpu) 257 + for (i = page_start; i < page_end; i++) 229 258 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 230 259 chunk); 231 - __set_bit(i, populated); 232 260 } 233 - 234 261 return 0; 235 - 236 262 err: 237 263 for_each_possible_cpu(tcpu) { 238 264 if (tcpu == cpu) ··· 261 299 /** 262 300 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 263 301 * @chunk: chunk of interest 264 - * @off: offset to the area to populate 265 - * @size: size of the area to populate in bytes 302 + * @page_start: the start page 303 + * @page_end: the end page 266 304 * 267 305 * For each cpu, populate and map pages [@page_start,@page_end) into 268 - * @chunk. The area is cleared on return. 306 + * @chunk. 269 307 * 270 308 * CONTEXT: 271 309 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
272 310 */ 273 - static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 311 + static int pcpu_populate_chunk(struct pcpu_chunk *chunk, 312 + int page_start, int page_end) 274 313 { 275 - int page_start = PFN_DOWN(off); 276 - int page_end = PFN_UP(off + size); 277 - int free_end = page_start, unmap_end = page_start; 278 314 struct page **pages; 279 - unsigned long *populated; 280 - unsigned int cpu; 281 - int rs, re, rc; 282 315 283 - /* quick path, check whether all pages are already there */ 284 - rs = page_start; 285 - pcpu_next_pop(chunk, &rs, &re, page_end); 286 - if (rs == page_start && re == page_end) 287 - goto clear; 288 - 289 - /* need to allocate and map pages, this chunk can't be immutable */ 290 - WARN_ON(chunk->immutable); 291 - 292 - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); 316 + pages = pcpu_get_pages(chunk); 293 317 if (!pages) 294 318 return -ENOMEM; 295 319 296 - /* alloc and map */ 297 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 298 - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); 299 - if (rc) 300 - goto err_free; 301 - free_end = re; 302 - } 320 + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) 321 + return -ENOMEM; 303 322 304 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 305 - rc = pcpu_map_pages(chunk, pages, populated, rs, re); 306 - if (rc) 307 - goto err_unmap; 308 - unmap_end = re; 323 + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { 324 + pcpu_free_pages(chunk, pages, page_start, page_end); 325 + return -ENOMEM; 309 326 } 310 327 pcpu_post_map_flush(chunk, page_start, page_end); 311 328 312 - /* commit new bitmap */ 313 - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); 314 - clear: 315 - for_each_possible_cpu(cpu) 316 - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 317 329 return 0; 318 - 319 - err_unmap: 320 - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); 321 - 
pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) 322 - pcpu_unmap_pages(chunk, pages, populated, rs, re); 323 - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); 324 - err_free: 325 - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) 326 - pcpu_free_pages(chunk, pages, populated, rs, re); 327 - return rc; 328 330 } 329 331 330 332 /** 331 333 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 332 334 * @chunk: chunk to depopulate 333 - * @off: offset to the area to depopulate 334 - * @size: size of the area to depopulate in bytes 335 + * @page_start: the start page 336 + * @page_end: the end page 335 337 * 336 338 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 337 - * from @chunk. If @flush is true, vcache is flushed before unmapping 338 - * and tlb after. 339 + * from @chunk. 339 340 * 340 341 * CONTEXT: 341 342 * pcpu_alloc_mutex. 342 343 */ 343 - static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 344 + static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, 345 + int page_start, int page_end) 344 346 { 345 - int page_start = PFN_DOWN(off); 346 - int page_end = PFN_UP(off + size); 347 347 struct page **pages; 348 - unsigned long *populated; 349 - int rs, re; 350 - 351 - /* quick path, check whether it's empty already */ 352 - rs = page_start; 353 - pcpu_next_unpop(chunk, &rs, &re, page_end); 354 - if (rs == page_start && re == page_end) 355 - return; 356 - 357 - /* immutable chunks can't be depopulated */ 358 - WARN_ON(chunk->immutable); 359 348 360 349 /* 361 350 * If control reaches here, there must have been at least one 362 351 * successful population attempt so the temp pages array must 363 352 * be available now. 
364 353 */ 365 - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 354 + pages = pcpu_get_pages(chunk); 366 355 BUG_ON(!pages); 367 356 368 357 /* unmap and free */ 369 358 pcpu_pre_unmap_flush(chunk, page_start, page_end); 370 359 371 - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 372 - pcpu_unmap_pages(chunk, pages, populated, rs, re); 360 + pcpu_unmap_pages(chunk, pages, page_start, page_end); 373 361 374 362 /* no need to flush tlb, vmalloc will handle it lazily */ 375 363 376 - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 377 - pcpu_free_pages(chunk, pages, populated, rs, re); 378 - 379 - /* commit new bitmap */ 380 - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); 364 + pcpu_free_pages(chunk, pages, page_start, page_end); 381 365 } 382 366 383 367 static struct pcpu_chunk *pcpu_create_chunk(void)
+430 -102
mm/percpu.c
··· 76 76 77 77 #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 78 78 #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 79 + #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 80 + #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 81 + #define PCPU_EMPTY_POP_PAGES_LOW 2 82 + #define PCPU_EMPTY_POP_PAGES_HIGH 4 79 83 80 84 #ifdef CONFIG_SMP 81 85 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ ··· 106 102 int free_size; /* free bytes in the chunk */ 107 103 int contig_hint; /* max contiguous size hint */ 108 104 void *base_addr; /* base address of this chunk */ 105 + 109 106 int map_used; /* # of map entries used before the sentry */ 110 107 int map_alloc; /* # of map entries allocated */ 111 108 int *map; /* allocation map */ 109 + struct work_struct map_extend_work;/* async ->map[] extension */ 110 + 112 111 void *data; /* chunk data */ 113 112 int first_free; /* no free below this */ 114 113 bool immutable; /* no [de]population allowed */ 114 + int nr_populated; /* # of populated pages */ 115 115 unsigned long populated[]; /* populated bitmap */ 116 116 }; 117 117 ··· 159 151 static struct pcpu_chunk *pcpu_reserved_chunk; 160 152 static int pcpu_reserved_chunk_limit; 161 153 162 - /* 163 - * Synchronization rules. 164 - * 165 - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 166 - * protects allocation/reclaim paths, chunks, populated bitmap and 167 - * vmalloc mapping. The latter is a spinlock and protects the index 168 - * data structures - chunk slots, chunks and area maps in chunks. 169 - * 170 - * During allocation, pcpu_alloc_mutex is kept locked all the time and 171 - * pcpu_lock is grabbed and released as necessary. All actual memory 172 - * allocations are done using GFP_KERNEL with pcpu_lock released. In 173 - * general, percpu memory can't be allocated with irq off but 174 - * irqsave/restore are still used in alloc path so that it can be used 175 - * from early init path - sched_init() specifically. 
176 - * 177 - * Free path accesses and alters only the index data structures, so it 178 - * can be safely called from atomic context. When memory needs to be 179 - * returned to the system, free path schedules reclaim_work which 180 - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be 181 - * reclaimed, release both locks and frees the chunks. Note that it's 182 - * necessary to grab both locks to remove a chunk from circulation as 183 - * allocation path might be referencing the chunk with only 184 - * pcpu_alloc_mutex locked. 185 - */ 186 - static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 187 - static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 154 + static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ 155 + static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ 188 156 189 157 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 190 158 191 - /* reclaim work to release fully free chunks, scheduled from free path */ 192 - static void pcpu_reclaim(struct work_struct *work); 193 - static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 159 + /* 160 + * The number of empty populated pages, protected by pcpu_lock. The 161 + * reserved chunk doesn't contribute to the count. 162 + */ 163 + static int pcpu_nr_empty_pop_pages; 164 + 165 + /* 166 + * Balance work is used to populate or destroy chunks asynchronously. We 167 + * try to keep the number of populated free pages between 168 + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one 169 + * empty chunk. 
170 + */ 171 + static void pcpu_balance_workfn(struct work_struct *work); 172 + static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); 173 + static bool pcpu_async_enabled __read_mostly; 174 + static bool pcpu_atomic_alloc_failed; 175 + 176 + static void pcpu_schedule_balance_work(void) 177 + { 178 + if (pcpu_async_enabled) 179 + schedule_work(&pcpu_balance_work); 180 + } 194 181 195 182 static bool pcpu_addr_in_first_chunk(void *addr) 196 183 { ··· 318 315 } 319 316 320 317 /** 318 + * pcpu_count_occupied_pages - count the number of pages an area occupies 319 + * @chunk: chunk of interest 320 + * @i: index of the area in question 321 + * 322 + * Count the number of pages chunk's @i'th area occupies. When the area's 323 + * start and/or end address isn't aligned to page boundary, the straddled 324 + * page is included in the count iff the rest of the page is free. 325 + */ 326 + static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) 327 + { 328 + int off = chunk->map[i] & ~1; 329 + int end = chunk->map[i + 1] & ~1; 330 + 331 + if (!PAGE_ALIGNED(off) && i > 0) { 332 + int prev = chunk->map[i - 1]; 333 + 334 + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) 335 + off = round_down(off, PAGE_SIZE); 336 + } 337 + 338 + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { 339 + int next = chunk->map[i + 1]; 340 + int nend = chunk->map[i + 2] & ~1; 341 + 342 + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) 343 + end = round_up(end, PAGE_SIZE); 344 + } 345 + 346 + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); 347 + } 348 + 349 + /** 321 350 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 322 351 * @chunk: chunk of interest 323 352 * @oslot: the previous slot it was on ··· 377 342 /** 378 343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 379 344 * @chunk: chunk of interest 345 + * @is_atomic: the allocation context 380 346 * 381 - * Determine whether area map of @chunk needs to be 
extended to 382 - * accommodate a new allocation. 347 + * Determine whether area map of @chunk needs to be extended. If 348 + * @is_atomic, only the amount necessary for a new allocation is 349 + * considered; however, async extension is scheduled if the left amount is 350 + * low. If !@is_atomic, it aims for more empty space. Combined, this 351 + * ensures that the map is likely to have enough available space to 352 + * accomodate atomic allocations which can't extend maps directly. 383 353 * 384 354 * CONTEXT: 385 355 * pcpu_lock. ··· 393 353 * New target map allocation length if extension is necessary, 0 394 354 * otherwise. 395 355 */ 396 - static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 356 + static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) 397 357 { 398 - int new_alloc; 358 + int margin, new_alloc; 399 359 400 - if (chunk->map_alloc >= chunk->map_used + 3) 360 + if (is_atomic) { 361 + margin = 3; 362 + 363 + if (chunk->map_alloc < 364 + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && 365 + pcpu_async_enabled) 366 + schedule_work(&chunk->map_extend_work); 367 + } else { 368 + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; 369 + } 370 + 371 + if (chunk->map_alloc >= chunk->map_used + margin) 401 372 return 0; 402 373 403 374 new_alloc = PCPU_DFL_MAP_ALLOC; 404 - while (new_alloc < chunk->map_used + 3) 375 + while (new_alloc < chunk->map_used + margin) 405 376 new_alloc *= 2; 406 377 407 378 return new_alloc; ··· 469 418 return 0; 470 419 } 471 420 421 + static void pcpu_map_extend_workfn(struct work_struct *work) 422 + { 423 + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, 424 + map_extend_work); 425 + int new_alloc; 426 + 427 + spin_lock_irq(&pcpu_lock); 428 + new_alloc = pcpu_need_to_extend(chunk, false); 429 + spin_unlock_irq(&pcpu_lock); 430 + 431 + if (new_alloc) 432 + pcpu_extend_area_map(chunk, new_alloc); 433 + } 434 + 435 + /** 436 + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area 
437 + * @chunk: chunk the candidate area belongs to 438 + * @off: the offset to the start of the candidate area 439 + * @this_size: the size of the candidate area 440 + * @size: the size of the target allocation 441 + * @align: the alignment of the target allocation 442 + * @pop_only: only allocate from already populated region 443 + * 444 + * We're trying to allocate @size bytes aligned at @align. @chunk's area 445 + * at @off sized @this_size is a candidate. This function determines 446 + * whether the target allocation fits in the candidate area and returns the 447 + * number of bytes to pad after @off. If the target area doesn't fit, -1 448 + * is returned. 449 + * 450 + * If @pop_only is %true, this function only considers the already 451 + * populated part of the candidate area. 452 + */ 453 + static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, 454 + int size, int align, bool pop_only) 455 + { 456 + int cand_off = off; 457 + 458 + while (true) { 459 + int head = ALIGN(cand_off, align) - off; 460 + int page_start, page_end, rs, re; 461 + 462 + if (this_size < head + size) 463 + return -1; 464 + 465 + if (!pop_only) 466 + return head; 467 + 468 + /* 469 + * If the first unpopulated page is beyond the end of the 470 + * allocation, the whole allocation is populated; 471 + * otherwise, retry from the end of the unpopulated area. 
472 + */ 473 + page_start = PFN_DOWN(head + off); 474 + page_end = PFN_UP(head + off + size); 475 + 476 + rs = page_start; 477 + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); 478 + if (rs >= page_end) 479 + return head; 480 + cand_off = re * PAGE_SIZE; 481 + } 482 + } 483 + 472 484 /** 473 485 * pcpu_alloc_area - allocate area from a pcpu_chunk 474 486 * @chunk: chunk of interest 475 487 * @size: wanted size in bytes 476 488 * @align: wanted align 489 + * @pop_only: allocate only from the populated area 490 + * @occ_pages_p: out param for the number of pages the area occupies 477 491 * 478 492 * Try to allocate @size bytes area aligned at @align from @chunk. 479 493 * Note that this function only allocates the offset. It doesn't ··· 553 437 * Allocated offset in @chunk on success, -1 if no matching area is 554 438 * found. 555 439 */ 556 - static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 440 + static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, 441 + bool pop_only, int *occ_pages_p) 557 442 { 558 443 int oslot = pcpu_chunk_slot(chunk); 559 444 int max_contig = 0; ··· 570 453 if (off & 1) 571 454 continue; 572 455 573 - /* extra for alignment requirement */ 574 - head = ALIGN(off, align) - off; 575 - 576 456 this_size = (p[1] & ~1) - off; 577 - if (this_size < head + size) { 457 + 458 + head = pcpu_fit_in_area(chunk, off, this_size, size, align, 459 + pop_only); 460 + if (head < 0) { 578 461 if (!seen_free) { 579 462 chunk->first_free = i; 580 463 seen_free = true; ··· 643 526 chunk->free_size -= size; 644 527 *p |= 1; 645 528 529 + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 646 530 pcpu_chunk_relocate(chunk, oslot); 647 531 return off; 648 532 } ··· 659 541 * pcpu_free_area - free area to a pcpu_chunk 660 542 * @chunk: chunk of interest 661 543 * @freeme: offset of area to free 544 + * @occ_pages_p: out param for the number of pages the area occupies 662 545 * 663 546 * Free area starting from 
@freeme to @chunk. Note that this function 664 547 * only modifies the allocation map. It doesn't depopulate or unmap ··· 668 549 * CONTEXT: 669 550 * pcpu_lock. 670 551 */ 671 - static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 552 + static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, 553 + int *occ_pages_p) 672 554 { 673 555 int oslot = pcpu_chunk_slot(chunk); 674 556 int off = 0; ··· 699 579 p = chunk->map + i; 700 580 *p = off &= ~1; 701 581 chunk->free_size += (p[1] & ~1) - off; 582 + 583 + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); 702 584 703 585 /* merge with next? */ 704 586 if (!(p[1] & 1)) ··· 742 620 chunk->map_used = 1; 743 621 744 622 INIT_LIST_HEAD(&chunk->list); 623 + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); 745 624 chunk->free_size = pcpu_unit_size; 746 625 chunk->contig_hint = pcpu_unit_size; 747 626 ··· 755 632 return; 756 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 757 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 635 + } 636 + 637 + /** 638 + * pcpu_chunk_populated - post-population bookkeeping 639 + * @chunk: pcpu_chunk which got populated 640 + * @page_start: the start page 641 + * @page_end: the end page 642 + * 643 + * Pages in [@page_start,@page_end) have been populated to @chunk. Update 644 + * the bookkeeping information accordingly. Must be called after each 645 + * successful population. 
646 + */ 647 + static void pcpu_chunk_populated(struct pcpu_chunk *chunk, 648 + int page_start, int page_end) 649 + { 650 + int nr = page_end - page_start; 651 + 652 + lockdep_assert_held(&pcpu_lock); 653 + 654 + bitmap_set(chunk->populated, page_start, nr); 655 + chunk->nr_populated += nr; 656 + pcpu_nr_empty_pop_pages += nr; 657 + } 658 + 659 + /** 660 + * pcpu_chunk_depopulated - post-depopulation bookkeeping 661 + * @chunk: pcpu_chunk which got depopulated 662 + * @page_start: the start page 663 + * @page_end: the end page 664 + * 665 + * Pages in [@page_start,@page_end) have been depopulated from @chunk. 666 + * Update the bookkeeping information accordingly. Must be called after 667 + * each successful depopulation. 668 + */ 669 + static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, 670 + int page_start, int page_end) 671 + { 672 + int nr = page_end - page_start; 673 + 674 + lockdep_assert_held(&pcpu_lock); 675 + 676 + bitmap_clear(chunk->populated, page_start, nr); 677 + chunk->nr_populated -= nr; 678 + pcpu_nr_empty_pop_pages -= nr; 758 679 } 759 680 760 681 /* ··· 862 695 * @size: size of area to allocate in bytes 863 696 * @align: alignment of area (max PAGE_SIZE) 864 697 * @reserved: allocate from the reserved chunk if available 698 + * @gfp: allocation flags 865 699 * 866 - * Allocate percpu area of @size bytes aligned at @align. 867 - * 868 - * CONTEXT: 869 - * Does GFP_KERNEL allocation. 700 + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't 701 + * contain %GFP_KERNEL, the allocation is atomic. 870 702 * 871 703 * RETURNS: 872 704 * Percpu pointer to the allocated area on success, NULL on failure. 
873 705 */ 874 - static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 706 + static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, 707 + gfp_t gfp) 875 708 { 876 709 static int warn_limit = 10; 877 710 struct pcpu_chunk *chunk; 878 711 const char *err; 879 - int slot, off, new_alloc; 712 + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; 713 + int occ_pages = 0; 714 + int slot, off, new_alloc, cpu, ret; 880 715 unsigned long flags; 881 716 void __percpu *ptr; 882 717 ··· 897 728 return NULL; 898 729 } 899 730 900 - mutex_lock(&pcpu_alloc_mutex); 901 731 spin_lock_irqsave(&pcpu_lock, flags); 902 732 903 733 /* serve reserved allocations from the reserved chunk if available */ ··· 908 740 goto fail_unlock; 909 741 } 910 742 911 - while ((new_alloc = pcpu_need_to_extend(chunk))) { 743 + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { 912 744 spin_unlock_irqrestore(&pcpu_lock, flags); 913 - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 745 + if (is_atomic || 746 + pcpu_extend_area_map(chunk, new_alloc) < 0) { 914 747 err = "failed to extend area map of reserved chunk"; 915 - goto fail_unlock_mutex; 748 + goto fail; 916 749 } 917 750 spin_lock_irqsave(&pcpu_lock, flags); 918 751 } 919 752 920 - off = pcpu_alloc_area(chunk, size, align); 753 + off = pcpu_alloc_area(chunk, size, align, is_atomic, 754 + &occ_pages); 921 755 if (off >= 0) 922 756 goto area_found; 923 757 ··· 934 764 if (size > chunk->contig_hint) 935 765 continue; 936 766 937 - new_alloc = pcpu_need_to_extend(chunk); 767 + new_alloc = pcpu_need_to_extend(chunk, is_atomic); 938 768 if (new_alloc) { 769 + if (is_atomic) 770 + continue; 939 771 spin_unlock_irqrestore(&pcpu_lock, flags); 940 772 if (pcpu_extend_area_map(chunk, 941 773 new_alloc) < 0) { 942 774 err = "failed to extend area map"; 943 - goto fail_unlock_mutex; 775 + goto fail; 944 776 } 945 777 spin_lock_irqsave(&pcpu_lock, flags); 946 778 /* ··· 952 780 goto restart; 953 781 } 954 782 
955 - off = pcpu_alloc_area(chunk, size, align); 783 + off = pcpu_alloc_area(chunk, size, align, is_atomic, 784 + &occ_pages); 956 785 if (off >= 0) 957 786 goto area_found; 958 787 } 959 788 } 960 789 961 - /* hmmm... no space left, create a new chunk */ 962 790 spin_unlock_irqrestore(&pcpu_lock, flags); 963 791 964 - chunk = pcpu_create_chunk(); 965 - if (!chunk) { 966 - err = "failed to allocate new chunk"; 967 - goto fail_unlock_mutex; 792 + /* 793 + * No space left. Create a new chunk. We don't want multiple 794 + * tasks to create chunks simultaneously. Serialize and create iff 795 + * there's still no empty chunk after grabbing the mutex. 796 + */ 797 + if (is_atomic) 798 + goto fail; 799 + 800 + mutex_lock(&pcpu_alloc_mutex); 801 + 802 + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { 803 + chunk = pcpu_create_chunk(); 804 + if (!chunk) { 805 + mutex_unlock(&pcpu_alloc_mutex); 806 + err = "failed to allocate new chunk"; 807 + goto fail; 808 + } 809 + 810 + spin_lock_irqsave(&pcpu_lock, flags); 811 + pcpu_chunk_relocate(chunk, -1); 812 + } else { 813 + spin_lock_irqsave(&pcpu_lock, flags); 968 814 } 969 815 970 - spin_lock_irqsave(&pcpu_lock, flags); 971 - pcpu_chunk_relocate(chunk, -1); 816 + mutex_unlock(&pcpu_alloc_mutex); 972 817 goto restart; 973 818 974 819 area_found: 975 820 spin_unlock_irqrestore(&pcpu_lock, flags); 976 821 977 - /* populate, map and clear the area */ 978 - if (pcpu_populate_chunk(chunk, off, size)) { 979 - spin_lock_irqsave(&pcpu_lock, flags); 980 - pcpu_free_area(chunk, off); 981 - err = "failed to populate"; 982 - goto fail_unlock; 822 + /* populate if not all pages are already there */ 823 + if (!is_atomic) { 824 + int page_start, page_end, rs, re; 825 + 826 + mutex_lock(&pcpu_alloc_mutex); 827 + 828 + page_start = PFN_DOWN(off); 829 + page_end = PFN_UP(off + size); 830 + 831 + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 832 + WARN_ON(chunk->immutable); 833 + 834 + ret = pcpu_populate_chunk(chunk, rs, re); 
835 + 836 + spin_lock_irqsave(&pcpu_lock, flags); 837 + if (ret) { 838 + mutex_unlock(&pcpu_alloc_mutex); 839 + pcpu_free_area(chunk, off, &occ_pages); 840 + err = "failed to populate"; 841 + goto fail_unlock; 842 + } 843 + pcpu_chunk_populated(chunk, rs, re); 844 + spin_unlock_irqrestore(&pcpu_lock, flags); 845 + } 846 + 847 + mutex_unlock(&pcpu_alloc_mutex); 983 848 } 984 849 985 - mutex_unlock(&pcpu_alloc_mutex); 850 + if (chunk != pcpu_reserved_chunk) 851 + pcpu_nr_empty_pop_pages -= occ_pages; 986 852 987 - /* return address relative to base address */ 853 + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) 854 + pcpu_schedule_balance_work(); 855 + 856 + /* clear the areas and return address relative to base address */ 857 + for_each_possible_cpu(cpu) 858 + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); 859 + 988 860 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 989 861 kmemleak_alloc_percpu(ptr, size); 990 862 return ptr; 991 863 992 864 fail_unlock: 993 865 spin_unlock_irqrestore(&pcpu_lock, flags); 994 - fail_unlock_mutex: 995 - mutex_unlock(&pcpu_alloc_mutex); 996 - if (warn_limit) { 997 - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 998 - "%s\n", size, align, err); 866 + fail: 867 + if (!is_atomic && warn_limit) { 868 + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", 869 + size, align, is_atomic, err); 999 870 dump_stack(); 1000 871 if (!--warn_limit) 1001 872 pr_info("PERCPU: limit reached, disable warning\n"); 1002 873 } 874 + if (is_atomic) { 875 + /* see the flag handling in pcpu_balance_workfn() */ 876 + pcpu_atomic_alloc_failed = true; 877 + pcpu_schedule_balance_work(); 878 + } 1003 879 return NULL; 1004 880 } 881 + 882 + /** 883 + * __alloc_percpu_gfp - allocate dynamic percpu area 884 + * @size: size of area to allocate in bytes 885 + * @align: alignment of area (max PAGE_SIZE) 886 + * @gfp: allocation flags 887 + * 888 + * Allocate zero-filled percpu area of @size bytes aligned 
at @align. If 889 + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can 890 + * be called from any context but is a lot more likely to fail. 891 + * 892 + * RETURNS: 893 + * Percpu pointer to the allocated area on success, NULL on failure. 894 + */ 895 + void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) 896 + { 897 + return pcpu_alloc(size, align, false, gfp); 898 + } 899 + EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); 1005 900 1006 901 /** 1007 902 * __alloc_percpu - allocate dynamic percpu area 1008 903 * @size: size of area to allocate in bytes 1009 904 * @align: alignment of area (max PAGE_SIZE) 1010 905 * 1011 - * Allocate zero-filled percpu area of @size bytes aligned at @align. 1012 - * Might sleep. Might trigger writeouts. 1013 - * 1014 - * CONTEXT: 1015 - * Does GFP_KERNEL allocation. 1016 - * 1017 - * RETURNS: 1018 - * Percpu pointer to the allocated area on success, NULL on failure. 906 + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). 1019 907 */ 1020 908 void __percpu *__alloc_percpu(size_t size, size_t align) 1021 909 { 1022 - return pcpu_alloc(size, align, false); 910 + return pcpu_alloc(size, align, false, GFP_KERNEL); 1023 911 } 1024 912 EXPORT_SYMBOL_GPL(__alloc_percpu); 1025 913 ··· 1101 869 */ 1102 870 void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1103 871 { 1104 - return pcpu_alloc(size, align, true); 872 + return pcpu_alloc(size, align, true, GFP_KERNEL); 1105 873 } 1106 874 1107 875 /** 1108 - * pcpu_reclaim - reclaim fully free chunks, workqueue function 876 + * pcpu_balance_workfn - manage the amount of free chunks and populated pages 1109 877 * @work: unused 1110 878 * 1111 879 * Reclaim all fully free chunks except for the first one. 1112 - * 1113 - * CONTEXT: 1114 - * workqueue context. 
1115 880 */ 1116 - static void pcpu_reclaim(struct work_struct *work) 881 + static void pcpu_balance_workfn(struct work_struct *work) 1117 882 { 1118 - LIST_HEAD(todo); 1119 - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 883 + LIST_HEAD(to_free); 884 + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; 1120 885 struct pcpu_chunk *chunk, *next; 886 + int slot, nr_to_pop, ret; 1121 887 888 + /* 889 + * There's no reason to keep around multiple unused chunks and VM 890 + * areas can be scarce. Destroy all free chunks except for one. 891 + */ 1122 892 mutex_lock(&pcpu_alloc_mutex); 1123 893 spin_lock_irq(&pcpu_lock); 1124 894 1125 - list_for_each_entry_safe(chunk, next, head, list) { 895 + list_for_each_entry_safe(chunk, next, free_head, list) { 1126 896 WARN_ON(chunk->immutable); 1127 897 1128 898 /* spare the first one */ 1129 - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 899 + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) 1130 900 continue; 1131 901 1132 - list_move(&chunk->list, &todo); 902 + list_move(&chunk->list, &to_free); 1133 903 } 1134 904 1135 905 spin_unlock_irq(&pcpu_lock); 1136 906 1137 - list_for_each_entry_safe(chunk, next, &todo, list) { 1138 - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 907 + list_for_each_entry_safe(chunk, next, &to_free, list) { 908 + int rs, re; 909 + 910 + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { 911 + pcpu_depopulate_chunk(chunk, rs, re); 912 + spin_lock_irq(&pcpu_lock); 913 + pcpu_chunk_depopulated(chunk, rs, re); 914 + spin_unlock_irq(&pcpu_lock); 915 + } 1139 916 pcpu_destroy_chunk(chunk); 917 + } 918 + 919 + /* 920 + * Ensure there are certain number of free populated pages for 921 + * atomic allocs. Fill up from the most packed so that atomic 922 + * allocs don't increase fragmentation. If atomic allocation 923 + * failed previously, always populate the maximum amount. 
This 924 + * should prevent atomic allocs larger than PAGE_SIZE from keeping 925 + * failing indefinitely; however, large atomic allocs are not 926 + * something we support properly and can be highly unreliable and 927 + * inefficient. 928 + */ 929 + retry_pop: 930 + if (pcpu_atomic_alloc_failed) { 931 + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; 932 + /* best effort anyway, don't worry about synchronization */ 933 + pcpu_atomic_alloc_failed = false; 934 + } else { 935 + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - 936 + pcpu_nr_empty_pop_pages, 937 + 0, PCPU_EMPTY_POP_PAGES_HIGH); 938 + } 939 + 940 + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { 941 + int nr_unpop = 0, rs, re; 942 + 943 + if (!nr_to_pop) 944 + break; 945 + 946 + spin_lock_irq(&pcpu_lock); 947 + list_for_each_entry(chunk, &pcpu_slot[slot], list) { 948 + nr_unpop = pcpu_unit_pages - chunk->nr_populated; 949 + if (nr_unpop) 950 + break; 951 + } 952 + spin_unlock_irq(&pcpu_lock); 953 + 954 + if (!nr_unpop) 955 + continue; 956 + 957 + /* @chunk can't go away while pcpu_alloc_mutex is held */ 958 + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { 959 + int nr = min(re - rs, nr_to_pop); 960 + 961 + ret = pcpu_populate_chunk(chunk, rs, rs + nr); 962 + if (!ret) { 963 + nr_to_pop -= nr; 964 + spin_lock_irq(&pcpu_lock); 965 + pcpu_chunk_populated(chunk, rs, rs + nr); 966 + spin_unlock_irq(&pcpu_lock); 967 + } else { 968 + nr_to_pop = 0; 969 + } 970 + 971 + if (!nr_to_pop) 972 + break; 973 + } 974 + } 975 + 976 + if (nr_to_pop) { 977 + /* ran out of chunks to populate, create a new one and retry */ 978 + chunk = pcpu_create_chunk(); 979 + if (chunk) { 980 + spin_lock_irq(&pcpu_lock); 981 + pcpu_chunk_relocate(chunk, -1); 982 + spin_unlock_irq(&pcpu_lock); 983 + goto retry_pop; 984 + } 1140 985 } 1141 986 1142 987 mutex_unlock(&pcpu_alloc_mutex); ··· 1233 924 void *addr; 1234 925 struct pcpu_chunk *chunk; 1235 926 unsigned long flags; 1236 - int off; 927 + int off, 
occ_pages; 1237 928 1238 929 if (!ptr) 1239 930 return; ··· 1247 938 chunk = pcpu_chunk_addr_search(addr); 1248 939 off = addr - chunk->base_addr; 1249 940 1250 - pcpu_free_area(chunk, off); 941 + pcpu_free_area(chunk, off, &occ_pages); 942 + 943 + if (chunk != pcpu_reserved_chunk) 944 + pcpu_nr_empty_pop_pages += occ_pages; 1251 945 1252 946 /* if there are more than one fully free chunks, wake up grim reaper */ 1253 947 if (chunk->free_size == pcpu_unit_size) { ··· 1258 946 1259 947 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1260 948 if (pos != chunk) { 1261 - schedule_work(&pcpu_reclaim_work); 949 + pcpu_schedule_balance_work(); 1262 950 break; 1263 951 } 1264 952 } ··· 1648 1336 */ 1649 1337 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1650 1338 INIT_LIST_HEAD(&schunk->list); 1339 + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); 1651 1340 schunk->base_addr = base_addr; 1652 1341 schunk->map = smap; 1653 1342 schunk->map_alloc = ARRAY_SIZE(smap); 1654 1343 schunk->immutable = true; 1655 1344 bitmap_fill(schunk->populated, pcpu_unit_pages); 1345 + schunk->nr_populated = pcpu_unit_pages; 1656 1346 1657 1347 if (ai->reserved_size) { 1658 1348 schunk->free_size = ai->reserved_size; ··· 1678 1364 if (dyn_size) { 1679 1365 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1680 1366 INIT_LIST_HEAD(&dchunk->list); 1367 + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); 1681 1368 dchunk->base_addr = base_addr; 1682 1369 dchunk->map = dmap; 1683 1370 dchunk->map_alloc = ARRAY_SIZE(dmap); 1684 1371 dchunk->immutable = true; 1685 1372 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1373 + dchunk->nr_populated = pcpu_unit_pages; 1686 1374 1687 1375 dchunk->contig_hint = dchunk->free_size = dyn_size; 1688 1376 dchunk->map[0] = 1; ··· 1695 1379 1696 1380 /* link the first chunk in */ 1697 1381 pcpu_first_chunk = dchunk ?: schunk; 1382 + pcpu_nr_empty_pop_pages += 1383 + pcpu_count_occupied_pages(pcpu_first_chunk, 
1); 1698 1384 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1699 1385 1700 1386 /* we're done */ ··· 2250 1932 2251 1933 if (pcpu_setup_first_chunk(ai, fc) < 0) 2252 1934 panic("Failed to initialize percpu areas."); 2253 - 2254 - pcpu_free_alloc_info(ai); 2255 1935 } 2256 1936 2257 1937 #endif /* CONFIG_SMP */ ··· 2283 1967 spin_unlock_irqrestore(&pcpu_lock, flags); 2284 1968 } 2285 1969 } 1970 + 1971 + /* 1972 + * Percpu allocator is initialized early during boot when neither slab or 1973 + * workqueue is available. Plug async management until everything is up 1974 + * and running. 1975 + */ 1976 + static int __init percpu_enable_async(void) 1977 + { 1978 + pcpu_async_enabled = true; 1979 + return 0; 1980 + } 1981 + subsys_initcall(percpu_enable_async);
+1 -1
mm/shmem.c
··· 2995 2995 #endif 2996 2996 2997 2997 spin_lock_init(&sbinfo->stat_lock); 2998 - if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2998 + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 2999 2999 goto failed; 3000 3000 sbinfo->free_inodes = sbinfo->max_inodes; 3001 3001
+1 -1
net/dccp/proto.c
··· 1115 1115 1116 1116 BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > 1117 1117 FIELD_SIZEOF(struct sk_buff, cb)); 1118 - rc = percpu_counter_init(&dccp_orphan_count, 0); 1118 + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); 1119 1119 if (rc) 1120 1120 goto out_fail; 1121 1121 rc = -ENOBUFS;
+2 -2
net/ipv4/tcp.c
··· 3071 3071 3072 3072 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3073 3073 3074 - percpu_counter_init(&tcp_sockets_allocated, 0); 3075 - percpu_counter_init(&tcp_orphan_count, 0); 3074 + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); 3075 + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); 3076 3076 tcp_hashinfo.bind_bucket_cachep = 3077 3077 kmem_cache_create("tcp_bind_bucket", 3078 3078 sizeof(struct inet_bind_bucket), 0,
+1 -1
net/ipv4/tcp_memcontrol.c
··· 32 32 res_parent = &parent_cg->memory_allocated; 33 33 34 34 res_counter_init(&cg_proto->memory_allocated, res_parent); 35 - percpu_counter_init(&cg_proto->sockets_allocated, 0); 35 + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); 36 36 37 37 return 0; 38 38 }
+1 -1
net/sctp/protocol.c
··· 1341 1341 if (!sctp_chunk_cachep) 1342 1342 goto err_chunk_cachep; 1343 1343 1344 - status = percpu_counter_init(&sctp_sockets_allocated, 0); 1344 + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); 1345 1345 if (status) 1346 1346 goto err_percpu_counter_init; 1347 1347