Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nfs/blocklayout: Fix premature PR key unregistration

During generic/069 runs with pNFS SCSI layouts, the NFS client emits
the following in the system journal:

kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
systemd[1]: fstests-generic-069.scope: Deactivated successfully.
systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
systemd[1]: media-test.mount: Deactivated successfully.
systemd[1]: media-scratch.mount: Deactivated successfully.
kernel: sd 6:0:0:1: reservation conflict
kernel: failed to unregister PR key.

This appears to be due to a race. bl_alloc_lseg() calls this:

561 static struct nfs4_deviceid_node *
562 bl_find_get_deviceid(struct nfs_server *server,
563 const struct nfs4_deviceid *id, const struct cred *cred,
564 gfp_t gfp_mask)
565 {
566 struct nfs4_deviceid_node *node;
567 unsigned long start, end;
568
569 retry:
570 node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
571 if (!node)
572 return ERR_PTR(-ENODEV);

nfs4_find_get_deviceid() does a lookup without the spin lock first.
If it can't find a matching deviceid, it creates a new device_info
(which calls bl_alloc_deviceid_node, and that registers the device's
PR key).

Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
If it finds it this time, bl_find_get_deviceid() frees the spare
(new) device_info, which unregisters the PR key for the same device.

Any subsequent I/O from this client on that device gets EBADE.

The umount later unregisters the device's PR key again.

To prevent this problem, register the PR key after the deviceid_node
lookup.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

authored by

Chuck Lever and committed by
Anna Schumaker
d869da91 5468fc82

+94 -31
+16 -9
fs/nfs/blocklayout/blocklayout.c
··· 564 564 gfp_t gfp_mask) 565 565 { 566 566 struct nfs4_deviceid_node *node; 567 - unsigned long start, end; 567 + int err = -ENODEV; 568 568 569 569 retry: 570 570 node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); 571 571 if (!node) 572 572 return ERR_PTR(-ENODEV); 573 573 574 - if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) 575 - return node; 574 + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { 575 + unsigned long end = jiffies; 576 + unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT; 576 577 577 - end = jiffies; 578 - start = end - PNFS_DEVICE_RETRY_TIMEOUT; 579 - if (!time_in_range(node->timestamp_unavailable, start, end)) { 580 - nfs4_delete_deviceid(node->ld, node->nfs_client, id); 581 - goto retry; 578 + if (!time_in_range(node->timestamp_unavailable, start, end)) { 579 + nfs4_delete_deviceid(node->ld, node->nfs_client, id); 580 + goto retry; 581 + } 582 + goto out_put; 582 583 } 583 584 585 + if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) 586 + goto out_put; 587 + 588 + return node; 589 + 590 + out_put: 584 591 nfs4_put_deviceid_node(node); 585 - return ERR_PTR(-ENODEV); 592 + return ERR_PTR(err); 586 593 } 587 594 588 595 static int
+8 -1
fs/nfs/blocklayout/blocklayout.h
··· 104 104 u64 start; 105 105 u64 len; 106 106 107 + enum pnfs_block_volume_type type; 107 108 u32 nr_children; 108 109 struct pnfs_block_dev *children; 109 110 u64 chunk_size; 110 111 111 112 struct file *bdev_file; 112 113 u64 disk_offset; 114 + unsigned long flags; 113 115 114 116 u64 pr_key; 115 - bool pr_registered; 116 117 117 118 bool (*map)(struct pnfs_block_dev *dev, u64 offset, 118 119 struct pnfs_block_dev_map *map); 120 + }; 121 + 122 + /* pnfs_block_dev flag bits */ 123 + enum { 124 + PNFS_BDEV_REGISTERED = 0, 119 125 }; 120 126 121 127 /* sector_t fields are all in 512-byte sectors */ ··· 178 172 #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179 173 180 174 /* dev.c */ 175 + bool bl_register_dev(struct pnfs_block_dev *d); 181 176 struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, 182 177 struct pnfs_device *pdev, gfp_t gfp_mask); 183 178 void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+70 -21
fs/nfs/blocklayout/dev.c
··· 13 13 14 14 #define NFSDBG_FACILITY NFSDBG_PNFS_LD 15 15 16 + static void bl_unregister_scsi(struct pnfs_block_dev *dev) 17 + { 18 + struct block_device *bdev = file_bdev(dev->bdev_file); 19 + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; 20 + 21 + if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) 22 + return; 23 + 24 + if (ops->pr_register(bdev, dev->pr_key, 0, false)) 25 + pr_err("failed to unregister PR key.\n"); 26 + } 27 + 28 + static bool bl_register_scsi(struct pnfs_block_dev *dev) 29 + { 30 + struct block_device *bdev = file_bdev(dev->bdev_file); 31 + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; 32 + int status; 33 + 34 + if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags)) 35 + return true; 36 + 37 + status = ops->pr_register(bdev, 0, dev->pr_key, true); 38 + if (status) { 39 + pr_err("pNFS: failed to register key for block device %s.", 40 + bdev->bd_disk->disk_name); 41 + return false; 42 + } 43 + return true; 44 + } 45 + 46 + static void bl_unregister_dev(struct pnfs_block_dev *dev) 47 + { 48 + u32 i; 49 + 50 + if (dev->nr_children) { 51 + for (i = 0; i < dev->nr_children; i++) 52 + bl_unregister_dev(&dev->children[i]); 53 + return; 54 + } 55 + 56 + if (dev->type == PNFS_BLOCK_VOLUME_SCSI) 57 + bl_unregister_scsi(dev); 58 + } 59 + 60 + bool bl_register_dev(struct pnfs_block_dev *dev) 61 + { 62 + u32 i; 63 + 64 + if (dev->nr_children) { 65 + for (i = 0; i < dev->nr_children; i++) { 66 + if (!bl_register_dev(&dev->children[i])) { 67 + while (i > 0) 68 + bl_unregister_dev(&dev->children[--i]); 69 + return false; 70 + } 71 + } 72 + return true; 73 + } 74 + 75 + if (dev->type == PNFS_BLOCK_VOLUME_SCSI) 76 + return bl_register_scsi(dev); 77 + return true; 78 + } 79 + 16 80 static void 17 81 bl_free_device(struct pnfs_block_dev *dev) 18 82 { 83 + bl_unregister_dev(dev); 84 + 19 85 if (dev->nr_children) { 20 86 int i; 21 87 ··· 89 23 bl_free_device(&dev->children[i]); 90 24 kfree(dev->children); 91 25 } else { 92 - if (dev->pr_registered) { 93 - const struct pr_ops *ops = 94 - file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops; 95 - int error; 96 - 97 - error = ops->pr_register(file_bdev(dev->bdev_file), 98 - dev->pr_key, 0, false); 99 - if (error) 100 - pr_err("failed to unregister PR key.\n"); 101 - } 102 - 103 26 if (dev->bdev_file) 104 27 fput(dev->bdev_file); 105 28 } ··· 420 365 goto out_blkdev_put; 421 366 } 422 367 423 - error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true); 424 - if (error) { 425 - pr_err("pNFS: failed to register key for block device %s.", 426 - file_bdev(d->bdev_file)->bd_disk->disk_name); 427 - goto out_blkdev_put; 428 - } 429 - 430 - d->pr_registered = true; 431 368 return 0; 432 369 433 370 out_blkdev_put: ··· 505 458 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, 506 459 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 507 460 { 508 - switch (volumes[idx].type) { 461 + d->type = volumes[idx].type; 462 + 463 + switch (d->type) { 509 464 case PNFS_BLOCK_VOLUME_SIMPLE: 510 465 return bl_parse_simple(server, d, volumes, idx, gfp_mask); 511 466 case PNFS_BLOCK_VOLUME_SLICE: ··· 519 470 case PNFS_BLOCK_VOLUME_SCSI: 520 471 return bl_parse_scsi(server, d, volumes, idx, gfp_mask); 521 472 default: 522 - dprintk("unsupported volume type: %d\n", volumes[idx].type); 473 + dprintk("unsupported volume type: %d\n", d->type); 523 474 return -EIO; 524 475 } 525 476 }