Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.17/core' of git://git.kernel.dk/linux-block

Pull block core bits from Jens Axboe:
"Small round this time, after the massive blk-mq dump for 3.16. This
pull request contains:

- Fixes for max_sectors overflow in ioctls from Akinobu Mita.

- Partition off-by-one bug fix in aix partitions from Dan Carpenter.

- Various small partition cleanups from Fabian Frederick.

- Fix for the block integrity code sometimes returning the wrong
vector count from Gu Zheng.

- Cleanup and re-org of the blk-mq queue enter/exit percpu counters
from Tejun. Dependent on the percpu pull for 3.17 (which was in
the block tree too), that you have already pulled in.

- A blkcg oops fix, also from Tejun"

* 'for-3.17/core' of git://git.kernel.dk/linux-block:
partitions: aix.c: off by one bug
blkcg: don't call into policy draining if root_blkg is already gone
Revert "bio: modify __bio_add_page() to accept pages that don't start a new segment"
bio: modify __bio_add_page() to accept pages that don't start a new segment
block: fix SG_[GS]ET_RESERVED_SIZE ioctl when max_sectors is huge
block: fix BLKSECTGET ioctl when max_sectors is greater than USHRT_MAX
block/partitions/efi.c: kerneldoc fixing
block/partitions/msdos.c: code clean-up
block/partitions/amiga.c: replace nolevel printk by pr_err
block/partitions/aix.c: replace count*size kzalloc by kcalloc
bio-integrity: add "bip_max_vcnt" into struct bio_integrity_payload
blk-mq: use percpu_ref for mq usage count
blk-mq: collapse __blk_mq_drain_queue() into blk_mq_freeze_queue()
blk-mq: decouble blk-mq freezing from generic bypassing
block, blk-mq: draining can't be skipped even if bypass_depth was non-zero
blk-mq: fix a memory ordering bug in blk_mq_queue_enter()

+106 -110
+3 -9
block/bio-integrity.c
··· 70 70 bs->bvec_integrity_pool); 71 71 if (!bip->bip_vec) 72 72 goto err; 73 + bip->bip_max_vcnt = bvec_nr_vecs(idx); 73 74 } else { 74 75 bip->bip_vec = bip->bip_inline_vecs; 76 + bip->bip_max_vcnt = inline_vecs; 75 77 } 76 78 77 79 bip->bip_slab = idx; ··· 116 114 } 117 115 EXPORT_SYMBOL(bio_integrity_free); 118 116 119 - static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) 120 - { 121 - if (bip->bip_slab == BIO_POOL_NONE) 122 - return BIP_INLINE_VECS; 123 - 124 - return bvec_nr_vecs(bip->bip_slab); 125 - } 126 - 127 117 /** 128 118 * bio_integrity_add_page - Attach integrity metadata 129 119 * @bio: bio to update ··· 131 137 struct bio_integrity_payload *bip = bio->bi_integrity; 132 138 struct bio_vec *iv; 133 139 134 - if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { 140 + if (bip->bip_vcnt >= bip->bip_max_vcnt) { 135 141 printk(KERN_ERR "%s: bip_vec full\n", __func__); 136 142 return 0; 137 143 }
+8 -5
block/blk-core.c
··· 438 438 */ 439 439 void blk_queue_bypass_start(struct request_queue *q) 440 440 { 441 - bool drain; 442 - 443 441 spin_lock_irq(q->queue_lock); 444 - drain = !q->bypass_depth++; 442 + q->bypass_depth++; 445 443 queue_flag_set(QUEUE_FLAG_BYPASS, q); 446 444 spin_unlock_irq(q->queue_lock); 447 445 448 - if (drain) { 446 + /* 447 + * Queues start drained. Skip actual draining till init is 448 + * complete. This avoids lenghty delays during queue init which 449 + * can happen many times during boot. 450 + */ 451 + if (blk_queue_init_done(q)) { 449 452 spin_lock_irq(q->queue_lock); 450 453 __blk_drain_queue(q, false); 451 454 spin_unlock_irq(q->queue_lock); ··· 514 511 * prevent that q->request_fn() gets invoked after draining finished. 515 512 */ 516 513 if (q->mq_ops) { 517 - blk_mq_drain_queue(q); 514 + blk_mq_freeze_queue(q); 518 515 spin_lock_irq(lock); 519 516 } else { 520 517 spin_lock_irq(lock);
+29 -52
block/blk-mq.c
··· 78 78 79 79 static int blk_mq_queue_enter(struct request_queue *q) 80 80 { 81 - int ret; 81 + while (true) { 82 + int ret; 82 83 83 - __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 84 - smp_wmb(); 84 + if (percpu_ref_tryget_live(&q->mq_usage_counter)) 85 + return 0; 85 86 86 - /* we have problems freezing the queue if it's initializing */ 87 - if (!blk_queue_dying(q) && 88 - (!blk_queue_bypass(q) || !blk_queue_init_done(q))) 89 - return 0; 90 - 91 - __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 92 - 93 - spin_lock_irq(q->queue_lock); 94 - ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, 95 - !blk_queue_bypass(q) || blk_queue_dying(q), 96 - *q->queue_lock); 97 - /* inc usage with lock hold to avoid freeze_queue runs here */ 98 - if (!ret && !blk_queue_dying(q)) 99 - __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 100 - else if (blk_queue_dying(q)) 101 - ret = -ENODEV; 102 - spin_unlock_irq(q->queue_lock); 103 - 104 - return ret; 87 + ret = wait_event_interruptible(q->mq_freeze_wq, 88 + !q->mq_freeze_depth || blk_queue_dying(q)); 89 + if (blk_queue_dying(q)) 90 + return -ENODEV; 91 + if (ret) 92 + return ret; 93 + } 105 94 } 106 95 107 96 static void blk_mq_queue_exit(struct request_queue *q) 108 97 { 109 - __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 98 + percpu_ref_put(&q->mq_usage_counter); 110 99 } 111 100 112 - void blk_mq_drain_queue(struct request_queue *q) 101 + static void blk_mq_usage_counter_release(struct percpu_ref *ref) 113 102 { 114 - while (true) { 115 - s64 count; 103 + struct request_queue *q = 104 + container_of(ref, struct request_queue, mq_usage_counter); 116 105 117 - spin_lock_irq(q->queue_lock); 118 - count = percpu_counter_sum(&q->mq_usage_counter); 119 - spin_unlock_irq(q->queue_lock); 120 - 121 - if (count == 0) 122 - break; 123 - blk_mq_start_hw_queues(q); 124 - msleep(10); 125 - } 106 + wake_up_all(&q->mq_freeze_wq); 126 107 } 127 108 128 109 /* 129 110 * Guarantee no request is in use, 
so we can change any data structure of 130 111 * the queue afterward. 131 112 */ 132 - static void blk_mq_freeze_queue(struct request_queue *q) 113 + void blk_mq_freeze_queue(struct request_queue *q) 133 114 { 134 - bool drain; 135 - 136 115 spin_lock_irq(q->queue_lock); 137 - drain = !q->bypass_depth++; 138 - queue_flag_set(QUEUE_FLAG_BYPASS, q); 116 + q->mq_freeze_depth++; 139 117 spin_unlock_irq(q->queue_lock); 140 118 141 - if (drain) 142 - blk_mq_drain_queue(q); 119 + percpu_ref_kill(&q->mq_usage_counter); 120 + blk_mq_run_queues(q, false); 121 + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); 143 122 } 144 123 145 124 static void blk_mq_unfreeze_queue(struct request_queue *q) ··· 126 147 bool wake = false; 127 148 128 149 spin_lock_irq(q->queue_lock); 129 - if (!--q->bypass_depth) { 130 - queue_flag_clear(QUEUE_FLAG_BYPASS, q); 131 - wake = true; 132 - } 133 - WARN_ON_ONCE(q->bypass_depth < 0); 150 + wake = !--q->mq_freeze_depth; 151 + WARN_ON_ONCE(q->mq_freeze_depth < 0); 134 152 spin_unlock_irq(q->queue_lock); 135 - if (wake) 153 + if (wake) { 154 + percpu_ref_reinit(&q->mq_usage_counter); 136 155 wake_up_all(&q->mq_freeze_wq); 156 + } 137 157 } 138 158 139 159 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) ··· 1776 1798 if (!q) 1777 1799 goto err_hctxs; 1778 1800 1779 - if (percpu_counter_init(&q->mq_usage_counter, 0)) 1801 + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) 1780 1802 goto err_map; 1781 1803 1782 1804 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); ··· 1869 1891 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 1870 1892 blk_mq_free_hw_queues(q, set); 1871 1893 1872 - percpu_counter_destroy(&q->mq_usage_counter); 1894 + percpu_ref_exit(&q->mq_usage_counter); 1873 1895 1874 1896 free_percpu(q->queue_ctx); 1875 1897 kfree(q->queue_hw_ctx); ··· 2028 2050 { 2029 2051 blk_mq_cpu_init(); 2030 2052 2031 - /* Must be called after percpu_counter_hotcpu_callback() */ 2032 - 
hotcpu_notifier(blk_mq_queue_reinit_notify, -10); 2053 + hotcpu_notifier(blk_mq_queue_reinit_notify, 0); 2033 2054 2034 2055 return 0; 2035 2056 }
+1 -1
block/blk-mq.h
··· 28 28 void __blk_mq_complete_request(struct request *rq); 29 29 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 30 30 void blk_mq_init_flush(struct request_queue *q); 31 - void blk_mq_drain_queue(struct request_queue *q); 31 + void blk_mq_freeze_queue(struct request_queue *q); 32 32 void blk_mq_free_queue(struct request_queue *q); 33 33 void blk_mq_clone_flush_request(struct request *flush_rq, 34 34 struct request *orig_rq);
+1 -1
block/blk-sysfs.c
··· 554 554 * Initialization must be complete by now. Finish the initial 555 555 * bypass from queue allocation. 556 556 */ 557 - blk_queue_bypass_end(q); 558 557 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 558 + blk_queue_bypass_end(q); 559 559 560 560 ret = blk_trace_init_sysfs(dev); 561 561 if (ret)
+4 -2
block/compat_ioctl.c
··· 663 663 fmode_t mode = file->f_mode; 664 664 struct backing_dev_info *bdi; 665 665 loff_t size; 666 + unsigned int max_sectors; 666 667 667 668 /* 668 669 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have ··· 720 719 case BLKSSZGET: /* get block device hardware sector size */ 721 720 return compat_put_int(arg, bdev_logical_block_size(bdev)); 722 721 case BLKSECTGET: 723 - return compat_put_ushort(arg, 724 - queue_max_sectors(bdev_get_queue(bdev))); 722 + max_sectors = min_t(unsigned int, USHRT_MAX, 723 + queue_max_sectors(bdev_get_queue(bdev))); 724 + return compat_put_ushort(arg, max_sectors); 725 725 case BLKROTATIONAL: 726 726 return compat_put_ushort(arg, 727 727 !blk_queue_nonrot(bdev_get_queue(bdev)));
+4 -1
block/ioctl.c
··· 278 278 struct backing_dev_info *bdi; 279 279 loff_t size; 280 280 int ret, n; 281 + unsigned int max_sectors; 281 282 282 283 switch(cmd) { 283 284 case BLKFLSBUF: ··· 376 375 case BLKDISCARDZEROES: 377 376 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 378 377 case BLKSECTGET: 379 - return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 378 + max_sectors = min_t(unsigned int, USHRT_MAX, 379 + queue_max_sectors(bdev_get_queue(bdev))); 380 + return put_ushort(arg, max_sectors); 380 381 case BLKROTATIONAL: 381 382 return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); 382 383 case BLKRASET:
+2 -2
block/partitions/aix.c
··· 215 215 numlvs = be16_to_cpu(p->numlvs); 216 216 put_dev_sector(sect); 217 217 } 218 - lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); 218 + lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL); 219 219 if (!lvip) 220 220 return 0; 221 221 if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) { ··· 253 253 continue; 254 254 } 255 255 lv_ix = be16_to_cpu(p->lv_ix) - 1; 256 - if (lv_ix > state->limit) { 256 + if (lv_ix >= state->limit) { 257 257 cur_lv_ix = -1; 258 258 continue; 259 259 }
+7 -5
block/partitions/amiga.c
··· 7 7 * Re-organised Feb 1998 Russell King 8 8 */ 9 9 10 + #define pr_fmt(fmt) fmt 11 + 10 12 #include <linux/types.h> 11 13 #include <linux/affs_hardblocks.h> 12 14 ··· 42 40 data = read_part_sector(state, blk, &sect); 43 41 if (!data) { 44 42 if (warn_no_part) 45 - printk("Dev %s: unable to read RDB block %d\n", 43 + pr_err("Dev %s: unable to read RDB block %d\n", 46 44 bdevname(state->bdev, b), blk); 47 45 res = -1; 48 46 goto rdb_done; ··· 59 57 *(__be32 *)(data+0xdc) = 0; 60 58 if (checksum_block((__be32 *)data, 61 59 be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { 62 - printk("Warning: Trashed word at 0xd0 in block %d " 63 - "ignored in checksum calculation\n",blk); 60 + pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n", 61 + blk); 64 62 break; 65 63 } 66 64 67 - printk("Dev %s: RDB in block %d has bad checksum\n", 65 + pr_err("Dev %s: RDB in block %d has bad checksum\n", 68 66 bdevname(state->bdev, b), blk); 69 67 } 70 68 ··· 85 83 data = read_part_sector(state, blk, &sect); 86 84 if (!data) { 87 85 if (warn_no_part) 88 - printk("Dev %s: unable to read partition block %d\n", 86 + pr_err("Dev %s: unable to read partition block %d\n", 89 87 bdevname(state->bdev, b), blk); 90 88 res = -1; 91 89 goto rdb_done;
+24 -22
block/partitions/efi.c
··· 121 121 /** 122 122 * efi_crc32() - EFI version of crc32 function 123 123 * @buf: buffer to calculate crc32 of 124 - * @len - length of buf 124 + * @len: length of buf 125 125 * 126 126 * Description: Returns EFI-style CRC32 value for @buf 127 127 * ··· 240 240 241 241 /** 242 242 * read_lba(): Read bytes from disk, starting at given LBA 243 - * @state 244 - * @lba 245 - * @buffer 246 - * @size_t 243 + * @state: disk parsed partitions 244 + * @lba: the Logical Block Address of the partition table 245 + * @buffer: destination buffer 246 + * @count: bytes to read 247 247 * 248 248 * Description: Reads @count bytes from @state->bdev into @buffer. 249 249 * Returns number of bytes read on success, 0 on error. ··· 277 277 278 278 /** 279 279 * alloc_read_gpt_entries(): reads partition entries from disk 280 - * @state 281 - * @gpt - GPT header 280 + * @state: disk parsed partitions 281 + * @gpt: GPT header 282 282 * 283 283 * Description: Returns ptes on success, NULL on error. 284 284 * Allocates space for PTEs based on information found in @gpt. ··· 312 312 313 313 /** 314 314 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 315 - * @state 316 - * @lba is the Logical Block Address of the partition table 315 + * @state: disk parsed partitions 316 + * @lba: the Logical Block Address of the partition table 317 317 * 318 318 * Description: returns GPT header on success, NULL on error. Allocates 319 319 * and fills a GPT header starting at @ from @state->bdev. ··· 340 340 341 341 /** 342 342 * is_gpt_valid() - tests one GPT header and PTEs for validity 343 - * @state 344 - * @lba is the logical block address of the GPT header to test 345 - * @gpt is a GPT header ptr, filled on return. 346 - * @ptes is a PTEs ptr, filled on return. 343 + * @state: disk parsed partitions 344 + * @lba: logical block address of the GPT header to test 345 + * @gpt: GPT header ptr, filled on return. 346 + * @ptes: PTEs ptr, filled on return. 
347 347 * 348 348 * Description: returns 1 if valid, 0 on error. 349 349 * If valid, returns pointers to newly allocated GPT header and PTEs. ··· 461 461 462 462 /** 463 463 * is_pte_valid() - tests one PTE for validity 464 - * @pte is the pte to check 465 - * @lastlba is last lba of the disk 464 + * @pte:pte to check 465 + * @lastlba: last lba of the disk 466 466 * 467 467 * Description: returns 1 if valid, 0 on error. 468 468 */ ··· 478 478 479 479 /** 480 480 * compare_gpts() - Search disk for valid GPT headers and PTEs 481 - * @pgpt is the primary GPT header 482 - * @agpt is the alternate GPT header 483 - * @lastlba is the last LBA number 481 + * @pgpt: primary GPT header 482 + * @agpt: alternate GPT header 483 + * @lastlba: last LBA number 484 + * 484 485 * Description: Returns nothing. Sanity checks pgpt and agpt fields 485 486 * and prints warnings on discrepancies. 486 487 * ··· 573 572 574 573 /** 575 574 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 576 - * @state 577 - * @gpt is a GPT header ptr, filled on return. 578 - * @ptes is a PTEs ptr, filled on return. 575 + * @state: disk parsed partitions 576 + * @gpt: GPT header ptr, filled on return. 577 + * @ptes: PTEs ptr, filled on return. 578 + * 579 579 * Description: Returns 1 if valid, 0 on error. 580 580 * If valid, returns pointers to newly allocated GPT header and PTEs. 581 581 * Validity depends on PMBR being valid (or being overridden by the ··· 665 663 666 664 /** 667 665 * efi_partition(struct parsed_partitions *state) 668 - * @state 666 + * @state: disk parsed partitions 669 667 * 670 668 * Description: called from check.c, if the disk contains GPT 671 669 * partitions, sets up partition entries in the kernel.
+8 -5
block/partitions/msdos.c
··· 159 159 /* 160 160 * First process the data partition(s) 161 161 */ 162 - for (i=0; i<4; i++, p++) { 162 + for (i = 0; i < 4; i++, p++) { 163 163 sector_t offs, size, next; 164 + 164 165 if (!nr_sects(p) || is_extended_partition(p)) 165 166 continue; 166 167 ··· 195 194 * It should be a link to the next logical partition. 196 195 */ 197 196 p -= 4; 198 - for (i=0; i<4; i++, p++) 197 + for (i = 0; i < 4; i++, p++) 199 198 if (nr_sects(p) && is_extended_partition(p)) 200 199 break; 201 200 if (i == 4) ··· 244 243 return; 245 244 } 246 245 /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ 247 - max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 248 - for (i=0; i<max_nparts && state->next<state->limit; i++) { 246 + max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 247 + for (i = 0; i < max_nparts && state->next < state->limit; i++) { 249 248 struct solaris_x86_slice *s = &v->v_slice[i]; 250 249 char tmp[3 + 10 + 1 + 1]; 251 250 ··· 410 409 /* The first sector of a Minix partition can have either 411 410 * a secondary MBR describing its subpartitions, or 412 411 * the normal boot sector. */ 413 - if (msdos_magic_present (data + 510) && 412 + if (msdos_magic_present(data + 510) && 414 413 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ 415 414 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; 416 415 ··· 528 527 for (slot = 1 ; slot <= 4 ; slot++, p++) { 529 528 sector_t start = start_sect(p)*sector_size; 530 529 sector_t size = nr_sects(p)*sector_size; 530 + 531 531 if (!size) 532 532 continue; 533 533 if (is_extended_partition(p)) { ··· 539 537 * sector, although it may not be enough/proper. 540 538 */ 541 539 sector_t n = 2; 540 + 542 541 n = min(size, max(sector_size, n)); 543 542 put_partition(state, slot, start, n); 544 543
+11 -4
block/scsi_ioctl.c
··· 82 82 return err; 83 83 } 84 84 85 + static int max_sectors_bytes(struct request_queue *q) 86 + { 87 + unsigned int max_sectors = queue_max_sectors(q); 88 + 89 + max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); 90 + 91 + return max_sectors << 9; 92 + } 93 + 85 94 static int sg_get_reserved_size(struct request_queue *q, int __user *p) 86 95 { 87 - unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9); 96 + int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); 88 97 89 98 return put_user(val, p); 90 99 } ··· 107 98 108 99 if (size < 0) 109 100 return -EINVAL; 110 - if (size > (queue_max_sectors(q) << 9)) 111 - size = queue_max_sectors(q) << 9; 112 101 113 - q->sg_reserved_size = size; 102 + q->sg_reserved_size = min(size, max_sectors_bytes(q)); 114 103 return 0; 115 104 } 116 105
+1
include/linux/bio.h
··· 308 308 309 309 unsigned short bip_slab; /* slab the bip came from */ 310 310 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 311 + unsigned short bip_max_vcnt; /* integrity bio_vec slots */ 311 312 unsigned bip_owns_buf:1; /* should free bip_buf */ 312 313 313 314 struct work_struct bip_work; /* I/O completion */
+3 -1
include/linux/blkdev.h
··· 21 21 #include <linux/bsg.h> 22 22 #include <linux/smp.h> 23 23 #include <linux/rcupdate.h> 24 + #include <linux/percpu-refcount.h> 24 25 25 26 #include <asm/scatterlist.h> 26 27 ··· 471 470 struct mutex sysfs_lock; 472 471 473 472 int bypass_depth; 473 + int mq_freeze_depth; 474 474 475 475 #if defined(CONFIG_BLK_DEV_BSG) 476 476 bsg_job_fn *bsg_job_fn; ··· 485 483 #endif 486 484 struct rcu_head rcu_head; 487 485 wait_queue_head_t mq_freeze_wq; 488 - struct percpu_counter mq_usage_counter; 486 + struct percpu_ref mq_usage_counter; 489 487 struct list_head all_q_node; 490 488 491 489 struct blk_mq_tag_set *tag_set;