Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

block: strict rq_affinity

Some systems benefit from completions always being steered to the strict
requester cpu rather than the looser "per-socket" steering that
blk_cpu_to_group() attempts by default. This is because the first
CPU in the group mask ends up being completely overloaded with work,
while the others (including the original submitter) have power left
to spare.

Allow the strict mode to be set by writing '2' to the sysfs control
file. This is identical to the scheme used for the nomerges file,
where '2' is a more aggressive setting than just being turned on.

echo 2 > /sys/block/<bdev>/queue/rq_affinity

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Roland Dreier <roland@purestorage.com>
Tested-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

authored by

Dan Williams and committed by
Jens Axboe
5757a6d7 ef323088

+27 -16
+7 -3
Documentation/block/queue-sysfs.txt
··· 45 45 46 46 rq_affinity (RW) 47 47 ---------------- 48 - If this option is enabled, the block layer will migrate request completions 49 - to the CPU that originally submitted the request. For some workloads 50 - this provides a significant reduction in CPU cycles due to caching effects. 48 + If this option is '1', the block layer will migrate request completions to the 49 + cpu "group" that originally submitted the request. For some workloads this 50 + provides a significant reduction in CPU cycles due to caching effects. 51 + 52 + For storage configurations that need to maximize distribution of completion 53 + processing setting this option to '2' forces the completion to run on the 54 + requesting cpu (bypassing the "group" aggregation logic). 51 55 52 56 scheduler (RW) 53 57 --------------
+2 -4
block/blk-core.c
··· 1279 1279 init_request_from_bio(req, bio); 1280 1280 1281 1281 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1282 - bio_flagged(bio, BIO_CPU_AFFINE)) { 1283 - req->cpu = blk_cpu_to_group(get_cpu()); 1284 - put_cpu(); 1285 - } 1282 + bio_flagged(bio, BIO_CPU_AFFINE)) 1283 + req->cpu = smp_processor_id(); 1286 1284 1287 1285 plug = current->plug; 1288 1286 if (plug) {
+7 -4
block/blk-softirq.c
··· 103 103 104 104 void __blk_complete_request(struct request *req) 105 105 { 106 + int ccpu, cpu, group_cpu = NR_CPUS; 106 107 struct request_queue *q = req->q; 107 108 unsigned long flags; 108 - int ccpu, cpu, group_cpu; 109 109 110 110 BUG_ON(!q->softirq_done_fn); 111 111 112 112 local_irq_save(flags); 113 113 cpu = smp_processor_id(); 114 - group_cpu = blk_cpu_to_group(cpu); 115 114 116 115 /* 117 116 * Select completion CPU 118 117 */ 119 - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) 118 + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { 120 119 ccpu = req->cpu; 121 - else 120 + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { 121 + ccpu = blk_cpu_to_group(ccpu); 122 + group_cpu = blk_cpu_to_group(cpu); 123 + } 124 + } else 122 125 ccpu = cpu; 123 126 124 127 if (ccpu == cpu || ccpu == group_cpu) {
+9 -4
block/blk-sysfs.c
··· 244 244 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) 245 245 { 246 246 bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); 247 + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); 247 248 248 - return queue_var_show(set, page); 249 + return queue_var_show(set << force, page); 249 250 } 250 251 251 252 static ssize_t ··· 258 257 259 258 ret = queue_var_store(&val, page, count); 260 259 spin_lock_irq(q->queue_lock); 261 - if (val) 260 + if (val) { 262 261 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 263 - else 264 - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 262 + if (val == 2) 263 + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); 264 + } else { 265 + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 266 + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 267 + } 265 268 spin_unlock_irq(q->queue_lock); 266 269 #endif 267 270 return ret;
+2 -1
include/linux/blkdev.h
··· 392 392 #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ 393 393 #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ 394 394 #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ 395 - #define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ 395 + #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ 396 396 #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ 397 397 #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ 398 398 #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ ··· 402 402 #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ 403 403 #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ 404 404 #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 405 + #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 405 406 406 407 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 407 408 (1 << QUEUE_FLAG_STACKABLE) | \