Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.2/drivers' of git://git.kernel.dk/linux-block

* 'for-3.2/drivers' of git://git.kernel.dk/linux-block: (30 commits)
virtio-blk: use ida to allocate disk index
hpsa: add small delay when using PCI Power Management to reset for kdump
cciss: add small delay when using PCI Power Management to reset for kdump
xen/blkback: Fix two races in the handling of barrier requests.
xen/blkback: Check for proper operation.
xen/blkback: Fix the inhibition to map pages when discarding sector ranges.
xen/blkback: Report VBD_WSECT (wr_sect) properly.
xen/blkback: Support 'feature-barrier' aka old-style BARRIER requests.
xen-blkfront: plug device number leak in xlblk_init() error path
xen-blkfront: If no barrier or flush is supported, use invalid operation.
xen-blkback: use kzalloc() in favor of kmalloc()+memset()
xen-blkback: fixed indentation and comments
xen-blkfront: fix a deadlock while handling discard response
xen-blkfront: Handle discard requests.
xen-blkback: Implement discard requests ('feature-discard')
xen-blkfront: add BLKIF_OP_DISCARD and discard request struct
drivers/block/loop.c: remove unnecessary bdev argument from loop_clr_fd()
drivers/block/loop.c: emit uevent on auto release
drivers/block/cpqarray.c: use pci_dev->revision
loop: always allow userspace partitions and optionally support automatic scanning
...

Fix up trivial header file inclusion conflict in drivers/block/loop.c

+637 -122
+7
Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
··· 71 71 a dump device, as kdump requires resetting the device in order 72 72 to work reliably. 73 73 74 + Where: /sys/bus/pci/devices/<dev>/ccissX/transport_mode 75 + Date: July 2011 76 + Kernel Version: 3.0 77 + Contact: iss_storagedev@hp.com 78 + Description: Value of "simple" indicates that the controller has been placed 79 + in "simple mode". Value of "performant" indicates that the 80 + controller has been placed in "performant mode".
+10
Documentation/blockdev/cciss.txt
··· 78 78 /dev/cciss/c1d1p2 Controller 1, disk 1, partition 2 79 79 /dev/cciss/c1d1p3 Controller 1, disk 1, partition 3 80 80 81 + CCISS simple mode support 82 + ------------------------- 83 + 84 + The "cciss_simple_mode=1" boot parameter may be used to prevent the driver 85 + from putting the controller into "performant" mode. The difference is that 86 + with simple mode, each command completion requires an interrupt, while with 87 + "performant mode" (the default, and ordinarily better performing) it is 88 + possible to have multiple command completions indicated by a single 89 + interrupt. 90 + 81 91 SCSI tape drive and medium changer support 82 92 ------------------------------------------ 83 93
+2 -2
block/genhd.c
··· 537 537 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 538 538 539 539 /* No minors to use for partitions */ 540 - if (!disk_partitionable(disk)) 540 + if (!disk_part_scan_enabled(disk)) 541 541 goto exit; 542 542 543 543 /* No such device (e.g., media were just removed) */ ··· 848 848 char buf[BDEVNAME_SIZE]; 849 849 850 850 /* Don't show non-partitionable removeable devices or empty devices */ 851 - if (!get_capacity(sgp) || (!disk_partitionable(sgp) && 851 + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && 852 852 (sgp->flags & GENHD_FL_REMOVABLE))) 853 853 return 0; 854 854 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
+1 -1
block/ioctl.c
··· 101 101 struct gendisk *disk = bdev->bd_disk; 102 102 int res; 103 103 104 - if (!disk_partitionable(disk) || bdev != bdev->bd_contains) 104 + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) 105 105 return -EINVAL; 106 106 if (!capable(CAP_SYS_ADMIN)) 107 107 return -EACCES;
+65 -11
drivers/block/cciss.c
··· 68 68 module_param(cciss_tape_cmds, int, 0644); 69 69 MODULE_PARM_DESC(cciss_tape_cmds, 70 70 "number of commands to allocate for tape devices (default: 6)"); 71 + static int cciss_simple_mode; 72 + module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); 73 + MODULE_PARM_DESC(cciss_simple_mode, 74 + "Use 'simple mode' rather than 'performant mode'"); 71 75 72 76 static DEFINE_MUTEX(cciss_mutex); 73 77 static struct proc_dir_entry *proc_cciss; ··· 180 176 unsigned int block_size, InquiryData_struct *inq_buff, 181 177 drive_info_struct *drv); 182 178 static void __devinit cciss_interrupt_mode(ctlr_info_t *); 179 + static int __devinit cciss_enter_simple_mode(struct ctlr_info *h); 183 180 static void start_io(ctlr_info_t *h); 184 181 static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size, 185 182 __u8 page_code, unsigned char scsi3addr[], ··· 393 388 h->product_name, 394 389 (unsigned long)h->board_id, 395 390 h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], 396 - h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT], 391 + h->firm_ver[3], (unsigned int)h->intr[h->intr_mode], 397 392 h->num_luns, 398 393 h->Qdepth, h->commands_outstanding, 399 394 h->maxQsinceinit, h->max_outstanding, h->maxSG); ··· 641 636 } 642 637 static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); 643 638 639 + static ssize_t host_show_transport_mode(struct device *dev, 640 + struct device_attribute *attr, 641 + char *buf) 642 + { 643 + struct ctlr_info *h = to_hba(dev); 644 + 645 + return snprintf(buf, 20, "%s\n", 646 + h->transMethod & CFGTBL_Trans_Performant ? 
647 + "performant" : "simple"); 648 + } 649 + static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL); 650 + 644 651 static ssize_t dev_show_unique_id(struct device *dev, 645 652 struct device_attribute *attr, 646 653 char *buf) ··· 825 808 static struct attribute *cciss_host_attrs[] = { 826 809 &dev_attr_rescan.attr, 827 810 &dev_attr_resettable.attr, 811 + &dev_attr_transport_mode.attr, 828 812 NULL 829 813 }; 830 814 ··· 4002 3984 { 4003 3985 __u32 trans_support; 4004 3986 3987 + if (cciss_simple_mode) 3988 + return; 3989 + 4005 3990 dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n"); 4006 3991 /* Attempt to put controller into performant mode if supported */ 4007 3992 /* Does board support performant mode? */ ··· 4102 4081 default_int_mode: 4103 4082 #endif /* CONFIG_PCI_MSI */ 4104 4083 /* if we get here we're going to use the default interrupt mode */ 4105 - h->intr[PERF_MODE_INT] = h->pdev->irq; 4084 + h->intr[h->intr_mode] = h->pdev->irq; 4106 4085 return; 4107 4086 } 4108 4087 ··· 4362 4341 } 4363 4342 cciss_enable_scsi_prefetch(h); 4364 4343 cciss_p600_dma_prefetch_quirk(h); 4344 + err = cciss_enter_simple_mode(h); 4345 + if (err) 4346 + goto err_out_free_res; 4365 4347 cciss_put_controller_into_performant_mode(h); 4366 4348 return 0; 4367 4349 ··· 4557 4533 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 4558 4534 pmcsr |= PCI_D0; 4559 4535 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); 4536 + 4537 + /* 4538 + * The P600 requires a small delay when changing states. 4539 + * Otherwise we may think the board did not reset and we bail. 4540 + * This for kdump only and is particular to the P600. 
4541 + */ 4542 + msleep(500); 4560 4543 } 4561 4544 return 0; 4562 4545 } ··· 4874 4843 irqreturn_t (*intxhandler)(int, void *)) 4875 4844 { 4876 4845 if (h->msix_vector || h->msi_vector) { 4877 - if (!request_irq(h->intr[PERF_MODE_INT], msixhandler, 4846 + if (!request_irq(h->intr[h->intr_mode], msixhandler, 4878 4847 IRQF_DISABLED, h->devname, h)) 4879 4848 return 0; 4880 4849 dev_err(&h->pdev->dev, "Unable to get msi irq %d" 4881 - " for %s\n", h->intr[PERF_MODE_INT], 4850 + " for %s\n", h->intr[h->intr_mode], 4882 4851 h->devname); 4883 4852 return -1; 4884 4853 } 4885 4854 4886 - if (!request_irq(h->intr[PERF_MODE_INT], intxhandler, 4855 + if (!request_irq(h->intr[h->intr_mode], intxhandler, 4887 4856 IRQF_DISABLED, h->devname, h)) 4888 4857 return 0; 4889 4858 dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", 4890 - h->intr[PERF_MODE_INT], h->devname); 4859 + h->intr[h->intr_mode], h->devname); 4891 4860 return -1; 4892 4861 } 4893 4862 ··· 4918 4887 { 4919 4888 int ctlr = h->ctlr; 4920 4889 4921 - free_irq(h->intr[PERF_MODE_INT], h); 4890 + free_irq(h->intr[h->intr_mode], h); 4922 4891 #ifdef CONFIG_PCI_MSI 4923 4892 if (h->msix_vector) 4924 4893 pci_disable_msix(h->pdev); ··· 4984 4953 h = hba[i]; 4985 4954 h->pdev = pdev; 4986 4955 h->busy_initializing = 1; 4956 + h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT; 4987 4957 INIT_LIST_HEAD(&h->cmpQ); 4988 4958 INIT_LIST_HEAD(&h->reqQ); 4989 4959 mutex_init(&h->busy_shutting_down); ··· 5041 5009 5042 5010 dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", 5043 5011 h->devname, pdev->device, pci_name(pdev), 5044 - h->intr[PERF_MODE_INT], dac ? "" : " not"); 5012 + h->intr[h->intr_mode], dac ? 
"" : " not"); 5045 5013 5046 5014 if (cciss_allocate_cmd_pool(h)) 5047 5015 goto clean4; ··· 5088 5056 spin_lock_irqsave(&h->lock, flags); 5089 5057 h->access.set_intr_mask(h, CCISS_INTR_OFF); 5090 5058 spin_unlock_irqrestore(&h->lock, flags); 5091 - free_irq(h->intr[PERF_MODE_INT], h); 5059 + free_irq(h->intr[h->intr_mode], h); 5092 5060 rc = cciss_request_irq(h, cciss_msix_discard_completions, 5093 5061 cciss_intx_discard_completions); 5094 5062 if (rc) { ··· 5165 5133 cciss_free_cmd_pool(h); 5166 5134 cciss_free_scatterlists(h); 5167 5135 cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); 5168 - free_irq(h->intr[PERF_MODE_INT], h); 5136 + free_irq(h->intr[h->intr_mode], h); 5169 5137 clean2: 5170 5138 unregister_blkdev(h->major, h->devname); 5171 5139 clean1: ··· 5204 5172 if (return_code != IO_OK) 5205 5173 dev_warn(&h->pdev->dev, "Error flushing cache\n"); 5206 5174 h->access.set_intr_mask(h, CCISS_INTR_OFF); 5207 - free_irq(h->intr[PERF_MODE_INT], h); 5175 + free_irq(h->intr[h->intr_mode], h); 5208 5176 } 5177 + 5178 + static int __devinit cciss_enter_simple_mode(struct ctlr_info *h) 5179 + { 5180 + u32 trans_support; 5181 + 5182 + trans_support = readl(&(h->cfgtable->TransportSupport)); 5183 + if (!(trans_support & SIMPLE_MODE)) 5184 + return -ENOTSUPP; 5185 + 5186 + h->max_commands = readl(&(h->cfgtable->CmdsOutMax)); 5187 + writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest)); 5188 + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); 5189 + cciss_wait_for_mode_change_ack(h); 5190 + print_cfg_table(h); 5191 + if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) { 5192 + dev_warn(&h->pdev->dev, "unable to get board into simple mode\n"); 5193 + return -ENODEV; 5194 + } 5195 + h->transMethod = CFGTBL_Trans_Simple; 5196 + return 0; 5197 + } 5198 + 5209 5199 5210 5200 static void __devexit cciss_remove_one(struct pci_dev *pdev) 5211 5201 {
+1
drivers/block/cciss.h
··· 92 92 unsigned int intr[4]; 93 93 unsigned int msix_vector; 94 94 unsigned int msi_vector; 95 + int intr_mode; 95 96 int cciss_max_sectors; 96 97 BYTE cciss_read; 97 98 BYTE cciss_write;
+1 -1
drivers/block/cpqarray.c
··· 620 620 } 621 621 vendor_id = pdev->vendor; 622 622 device_id = pdev->device; 623 + revision = pdev->revision; 623 624 irq = pdev->irq; 624 625 625 626 for(i=0; i<6; i++) ··· 633 632 } 634 633 635 634 pci_read_config_word(pdev, PCI_COMMAND, &command); 636 - pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision); 637 635 pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size); 638 636 pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer); 639 637
+104 -7
drivers/block/loop.c
··· 76 76 #include <linux/splice.h> 77 77 #include <linux/sysfs.h> 78 78 #include <linux/miscdevice.h> 79 + #include <linux/falloc.h> 80 + 79 81 #include <asm/uaccess.h> 80 82 81 83 static DEFINE_IDR(loop_index_idr); ··· 409 407 } 410 408 } 411 409 410 + /* 411 + * We use punch hole to reclaim the free space used by the 412 + * image a.k.a. discard. However we do support discard if 413 + * encryption is enabled, because it may give an attacker 414 + * useful information. 415 + */ 416 + if (bio->bi_rw & REQ_DISCARD) { 417 + struct file *file = lo->lo_backing_file; 418 + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; 419 + 420 + if ((!file->f_op->fallocate) || 421 + lo->lo_encrypt_key_size) { 422 + ret = -EOPNOTSUPP; 423 + goto out; 424 + } 425 + ret = file->f_op->fallocate(file, mode, pos, 426 + bio->bi_size); 427 + if (unlikely(ret && ret != -EINVAL && 428 + ret != -EOPNOTSUPP)) 429 + ret = -EIO; 430 + goto out; 431 + } 432 + 412 433 ret = lo_send(lo, bio, pos); 413 434 414 435 if ((bio->bi_rw & REQ_FUA) && !ret) { ··· 647 622 goto out_putf; 648 623 649 624 fput(old_file); 650 - if (max_part > 0) 625 + if (lo->lo_flags & LO_FLAGS_PARTSCAN) 651 626 ioctl_by_bdev(bdev, BLKRRPART, 0); 652 627 return 0; 653 628 ··· 724 699 return sprintf(buf, "%s\n", autoclear ? "1" : "0"); 725 700 } 726 701 702 + static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) 703 + { 704 + int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); 705 + 706 + return sprintf(buf, "%s\n", partscan ? 
"1" : "0"); 707 + } 708 + 727 709 LOOP_ATTR_RO(backing_file); 728 710 LOOP_ATTR_RO(offset); 729 711 LOOP_ATTR_RO(sizelimit); 730 712 LOOP_ATTR_RO(autoclear); 713 + LOOP_ATTR_RO(partscan); 731 714 732 715 static struct attribute *loop_attrs[] = { 733 716 &loop_attr_backing_file.attr, 734 717 &loop_attr_offset.attr, 735 718 &loop_attr_sizelimit.attr, 736 719 &loop_attr_autoclear.attr, 720 + &loop_attr_partscan.attr, 737 721 NULL, 738 722 }; 739 723 ··· 761 727 { 762 728 sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, 763 729 &loop_attribute_group); 730 + } 731 + 732 + static void loop_config_discard(struct loop_device *lo) 733 + { 734 + struct file *file = lo->lo_backing_file; 735 + struct inode *inode = file->f_mapping->host; 736 + struct request_queue *q = lo->lo_queue; 737 + 738 + /* 739 + * We use punch hole to reclaim the free space used by the 740 + * image a.k.a. discard. However we do support discard if 741 + * encryption is enabled, because it may give an attacker 742 + * useful information. 
743 + */ 744 + if ((!file->f_op->fallocate) || 745 + lo->lo_encrypt_key_size) { 746 + q->limits.discard_granularity = 0; 747 + q->limits.discard_alignment = 0; 748 + q->limits.max_discard_sectors = 0; 749 + q->limits.discard_zeroes_data = 0; 750 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 751 + return; 752 + } 753 + 754 + q->limits.discard_granularity = inode->i_sb->s_blocksize; 755 + q->limits.discard_alignment = inode->i_sb->s_blocksize; 756 + q->limits.max_discard_sectors = UINT_MAX >> 9; 757 + q->limits.discard_zeroes_data = 1; 758 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 764 759 } 765 760 766 761 static int loop_set_fd(struct loop_device *lo, fmode_t mode, ··· 892 829 } 893 830 lo->lo_state = Lo_bound; 894 831 wake_up_process(lo->lo_thread); 895 - if (max_part > 0) 832 + if (part_shift) 833 + lo->lo_flags |= LO_FLAGS_PARTSCAN; 834 + if (lo->lo_flags & LO_FLAGS_PARTSCAN) 896 835 ioctl_by_bdev(bdev, BLKRRPART, 0); 897 836 return 0; 898 837 ··· 955 890 return err; 956 891 } 957 892 958 - static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) 893 + static int loop_clr_fd(struct loop_device *lo) 959 894 { 960 895 struct file *filp = lo->lo_backing_file; 961 896 gfp_t gfp = lo->old_gfp_mask; 897 + struct block_device *bdev = lo->lo_device; 962 898 963 899 if (lo->lo_state != Lo_bound) 964 900 return -ENXIO; ··· 988 922 lo->lo_offset = 0; 989 923 lo->lo_sizelimit = 0; 990 924 lo->lo_encrypt_key_size = 0; 991 - lo->lo_flags = 0; 992 925 lo->lo_thread = NULL; 993 926 memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); 994 927 memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); ··· 1005 940 lo->lo_state = Lo_unbound; 1006 941 /* This is safe: open() is still holding a reference. 
*/ 1007 942 module_put(THIS_MODULE); 1008 - if (max_part > 0 && bdev) 943 + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) 1009 944 ioctl_by_bdev(bdev, BLKRRPART, 0); 945 + lo->lo_flags = 0; 946 + if (!part_shift) 947 + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; 1010 948 mutex_unlock(&lo->lo_ctl_mutex); 1011 949 /* 1012 950 * Need not hold lo_ctl_mutex to fput backing file. ··· 1063 995 if (figure_loop_size(lo)) 1064 996 return -EFBIG; 1065 997 } 998 + loop_config_discard(lo); 1066 999 1067 1000 memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); 1068 1001 memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); ··· 1078 1009 if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != 1079 1010 (info->lo_flags & LO_FLAGS_AUTOCLEAR)) 1080 1011 lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; 1012 + 1013 + if ((info->lo_flags & LO_FLAGS_PARTSCAN) && 1014 + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { 1015 + lo->lo_flags |= LO_FLAGS_PARTSCAN; 1016 + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; 1017 + ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); 1018 + } 1081 1019 1082 1020 lo->lo_encrypt_key_size = info->lo_encrypt_key_size; 1083 1021 lo->lo_init[0] = info->lo_init[0]; ··· 1279 1203 break; 1280 1204 case LOOP_CLR_FD: 1281 1205 /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ 1282 - err = loop_clr_fd(lo, bdev); 1206 + err = loop_clr_fd(lo); 1283 1207 if (!err) 1284 1208 goto out_unlocked; 1285 1209 break; ··· 1499 1423 * In autoclear mode, stop the loop thread 1500 1424 * and remove configuration after last close. 1501 1425 */ 1502 - err = loop_clr_fd(lo, NULL); 1426 + err = loop_clr_fd(lo); 1503 1427 if (!err) 1504 1428 goto out_unlocked; 1505 1429 } else { ··· 1621 1545 if (!disk) 1622 1546 goto out_free_queue; 1623 1547 1548 + /* 1549 + * Disable partition scanning by default. The in-kernel partition 1550 + * scanning can be requested individually per-device during its 1551 + * setup. Userspace can always add and remove partitions from all 1552 + * devices. 
The needed partition minors are allocated from the 1553 + * extended minor space, the main loop device numbers will continue 1554 + * to match the loop minors, regardless of the number of partitions 1555 + * used. 1556 + * 1557 + * If max_part is given, partition scanning is globally enabled for 1558 + * all loop devices. The minors for the main loop devices will be 1559 + * multiples of max_part. 1560 + * 1561 + * Note: Global-for-all-devices, set-only-at-init, read-only module 1562 + * parameteters like 'max_loop' and 'max_part' make things needlessly 1563 + * complicated, are too static, inflexible and may surprise 1564 + * userspace tools. Parameters like this in general should be avoided. 1565 + */ 1566 + if (!part_shift) 1567 + disk->flags |= GENHD_FL_NO_PART_SCAN; 1568 + disk->flags |= GENHD_FL_EXT_DEVT; 1624 1569 mutex_init(&lo->lo_ctl_mutex); 1625 1570 lo->lo_number = i; 1626 1571 lo->lo_thread = NULL;
+34 -35
drivers/block/nbd.c
··· 127 127 if (lock) 128 128 mutex_lock(&lo->tx_lock); 129 129 if (lo->sock) { 130 - printk(KERN_WARNING "%s: shutting down socket\n", 131 - lo->disk->disk_name); 130 + dev_warn(disk_to_dev(lo->disk), "shutting down socket\n"); 132 131 kernel_sock_shutdown(lo->sock, SHUT_RDWR); 133 132 lo->sock = NULL; 134 133 } ··· 157 158 sigset_t blocked, oldset; 158 159 159 160 if (unlikely(!sock)) { 160 - printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n", 161 - lo->disk->disk_name, (send ? "send" : "recv")); 161 + dev_err(disk_to_dev(lo->disk), 162 + "Attempted %s on closed socket in sock_xmit\n", 163 + (send ? "send" : "recv")); 162 164 return -EINVAL; 163 165 } 164 166 ··· 250 250 result = sock_xmit(lo, 1, &request, sizeof(request), 251 251 (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); 252 252 if (result <= 0) { 253 - printk(KERN_ERR "%s: Send control failed (result %d)\n", 254 - lo->disk->disk_name, result); 253 + dev_err(disk_to_dev(lo->disk), 254 + "Send control failed (result %d)\n", result); 255 255 goto error_out; 256 256 } 257 257 ··· 270 270 lo->disk->disk_name, req, bvec->bv_len); 271 271 result = sock_send_bvec(lo, bvec, flags); 272 272 if (result <= 0) { 273 - printk(KERN_ERR "%s: Send data failed (result %d)\n", 274 - lo->disk->disk_name, result); 273 + dev_err(disk_to_dev(lo->disk), 274 + "Send data failed (result %d)\n", 275 + result); 275 276 goto error_out; 276 277 } 277 278 } ··· 329 328 reply.magic = 0; 330 329 result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); 331 330 if (result <= 0) { 332 - printk(KERN_ERR "%s: Receive control failed (result %d)\n", 333 - lo->disk->disk_name, result); 331 + dev_err(disk_to_dev(lo->disk), 332 + "Receive control failed (result %d)\n", result); 334 333 goto harderror; 335 334 } 336 335 337 336 if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { 338 - printk(KERN_ERR "%s: Wrong magic (0x%lx)\n", 339 - lo->disk->disk_name, 337 + dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n", 340 338 
(unsigned long)ntohl(reply.magic)); 341 339 result = -EPROTO; 342 340 goto harderror; ··· 347 347 if (result != -ENOENT) 348 348 goto harderror; 349 349 350 - printk(KERN_ERR "%s: Unexpected reply (%p)\n", 351 - lo->disk->disk_name, reply.handle); 350 + dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n", 351 + reply.handle); 352 352 result = -EBADR; 353 353 goto harderror; 354 354 } 355 355 356 356 if (ntohl(reply.error)) { 357 - printk(KERN_ERR "%s: Other side returned error (%d)\n", 358 - lo->disk->disk_name, ntohl(reply.error)); 357 + dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n", 358 + ntohl(reply.error)); 359 359 req->errors++; 360 360 return req; 361 361 } ··· 369 369 rq_for_each_segment(bvec, req, iter) { 370 370 result = sock_recv_bvec(lo, bvec); 371 371 if (result <= 0) { 372 - printk(KERN_ERR "%s: Receive data failed (result %d)\n", 373 - lo->disk->disk_name, result); 372 + dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n", 373 + result); 374 374 req->errors++; 375 375 return req; 376 376 } ··· 405 405 406 406 BUG_ON(lo->magic != LO_MAGIC); 407 407 408 - lo->pid = current->pid; 409 - ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); 408 + lo->pid = task_pid_nr(current); 409 + ret = device_create_file(disk_to_dev(lo->disk), &pid_attr); 410 410 if (ret) { 411 - printk(KERN_ERR "nbd: sysfs_create_file failed!"); 411 + dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n"); 412 412 lo->pid = 0; 413 413 return ret; 414 414 } ··· 416 416 while ((req = nbd_read_stat(lo)) != NULL) 417 417 nbd_end_request(req); 418 418 419 - sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); 419 + device_remove_file(disk_to_dev(lo->disk), &pid_attr); 420 420 lo->pid = 0; 421 421 return 0; 422 422 } ··· 457 457 if (rq_data_dir(req) == WRITE) { 458 458 nbd_cmd(req) = NBD_CMD_WRITE; 459 459 if (lo->flags & NBD_READ_ONLY) { 460 - printk(KERN_ERR "%s: Write on read-only\n", 461 - lo->disk->disk_name); 
460 + dev_err(disk_to_dev(lo->disk), 461 + "Write on read-only\n"); 462 462 goto error_out; 463 463 } 464 464 } ··· 468 468 mutex_lock(&lo->tx_lock); 469 469 if (unlikely(!lo->sock)) { 470 470 mutex_unlock(&lo->tx_lock); 471 - printk(KERN_ERR "%s: Attempted send on closed socket\n", 472 - lo->disk->disk_name); 471 + dev_err(disk_to_dev(lo->disk), 472 + "Attempted send on closed socket\n"); 473 473 goto error_out; 474 474 } 475 475 476 476 lo->active_req = req; 477 477 478 478 if (nbd_send_req(lo, req) != 0) { 479 - printk(KERN_ERR "%s: Request send failed\n", 480 - lo->disk->disk_name); 479 + dev_err(disk_to_dev(lo->disk), "Request send failed\n"); 481 480 req->errors++; 482 481 nbd_end_request(req); 483 482 } else { ··· 548 549 BUG_ON(lo->magic != LO_MAGIC); 549 550 550 551 if (unlikely(!lo->sock)) { 551 - printk(KERN_ERR "%s: Attempted send on closed socket\n", 552 - lo->disk->disk_name); 552 + dev_err(disk_to_dev(lo->disk), 553 + "Attempted send on closed socket\n"); 553 554 req->errors++; 554 555 nbd_end_request(req); 555 556 spin_lock_irq(q->queue_lock); ··· 575 576 case NBD_DISCONNECT: { 576 577 struct request sreq; 577 578 578 - printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name); 579 + dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n"); 579 580 580 581 blk_rq_init(NULL, &sreq); 581 582 sreq.cmd_type = REQ_TYPE_SPECIAL; ··· 673 674 file = lo->file; 674 675 lo->file = NULL; 675 676 nbd_clear_que(lo); 676 - printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); 677 + dev_warn(disk_to_dev(lo->disk), "queue cleared\n"); 677 678 if (file) 678 679 fput(file); 679 680 lo->bytesize = 0; ··· 693 694 return 0; 694 695 695 696 case NBD_PRINT_DEBUG: 696 - printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", 697 - bdev->bd_disk->disk_name, 697 + dev_info(disk_to_dev(lo->disk), 698 + "next = %p, prev = %p, head = %p\n", 698 699 lo->queue_head.next, lo->queue_head.prev, 699 700 &lo->queue_head); 700 701 return 0; ··· 744 745 
BUILD_BUG_ON(sizeof(struct nbd_request) != 28); 745 746 746 747 if (max_part < 0) { 747 - printk(KERN_CRIT "nbd: max_part must be >= 0\n"); 748 + printk(KERN_ERR "nbd: max_part must be >= 0\n"); 748 749 return -EINVAL; 749 750 } 750 751
+109 -21
drivers/block/xen-blkback/blkback.c
··· 39 39 #include <linux/list.h> 40 40 #include <linux/delay.h> 41 41 #include <linux/freezer.h> 42 + #include <linux/loop.h> 43 + #include <linux/falloc.h> 44 + #include <linux/fs.h> 42 45 43 46 #include <xen/events.h> 44 47 #include <xen/page.h> ··· 261 258 262 259 static void print_stats(struct xen_blkif *blkif) 263 260 { 264 - pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", 261 + pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d" 262 + " | ds %4d\n", 265 263 current->comm, blkif->st_oo_req, 266 - blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); 264 + blkif->st_rd_req, blkif->st_wr_req, 265 + blkif->st_f_req, blkif->st_ds_req); 267 266 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 268 267 blkif->st_rd_req = 0; 269 268 blkif->st_wr_req = 0; 270 269 blkif->st_oo_req = 0; 270 + blkif->st_ds_req = 0; 271 271 } 272 272 273 273 int xen_blkif_schedule(void *arg) ··· 416 410 return ret; 417 411 } 418 412 413 + static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) 414 + { 415 + int err = 0; 416 + int status = BLKIF_RSP_OKAY; 417 + struct block_device *bdev = blkif->vbd.bdev; 418 + 419 + if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) 420 + /* just forward the discard request */ 421 + err = blkdev_issue_discard(bdev, 422 + req->u.discard.sector_number, 423 + req->u.discard.nr_sectors, 424 + GFP_KERNEL, 0); 425 + else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { 426 + /* punch a hole in the backing file */ 427 + struct loop_device *lo = bdev->bd_disk->private_data; 428 + struct file *file = lo->lo_backing_file; 429 + 430 + if (file->f_op->fallocate) 431 + err = file->f_op->fallocate(file, 432 + FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 433 + req->u.discard.sector_number << 9, 434 + req->u.discard.nr_sectors << 9); 435 + else 436 + err = -EOPNOTSUPP; 437 + } else 438 + err = -EOPNOTSUPP; 439 + 440 + if (err == -EOPNOTSUPP) { 441 + pr_debug(DRV_PFX "discard op failed, not supported\n"); 442 + 
status = BLKIF_RSP_EOPNOTSUPP; 443 + } else if (err) 444 + status = BLKIF_RSP_ERROR; 445 + 446 + make_response(blkif, req->id, req->operation, status); 447 + } 448 + 449 + static void xen_blk_drain_io(struct xen_blkif *blkif) 450 + { 451 + atomic_set(&blkif->drain, 1); 452 + do { 453 + /* The initial value is one, and one refcnt taken at the 454 + * start of the xen_blkif_schedule thread. */ 455 + if (atomic_read(&blkif->refcnt) <= 2) 456 + break; 457 + wait_for_completion_interruptible_timeout( 458 + &blkif->drain_complete, HZ); 459 + 460 + if (!atomic_read(&blkif->drain)) 461 + break; 462 + } while (!kthread_should_stop()); 463 + atomic_set(&blkif->drain, 0); 464 + } 465 + 419 466 /* 420 467 * Completion callback on the bio's. Called as bh->b_end_io() 421 468 */ ··· 480 421 (error == -EOPNOTSUPP)) { 481 422 pr_debug(DRV_PFX "flush diskcache op failed, not supported\n"); 482 423 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); 424 + pending_req->status = BLKIF_RSP_EOPNOTSUPP; 425 + } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && 426 + (error == -EOPNOTSUPP)) { 427 + pr_debug(DRV_PFX "write barrier op failed, not supported\n"); 428 + xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); 483 429 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 484 430 } else if (error) { 485 431 pr_debug(DRV_PFX "Buffer not up-to-date at end of operation," ··· 502 438 make_response(pending_req->blkif, pending_req->id, 503 439 pending_req->operation, pending_req->status); 504 440 xen_blkif_put(pending_req->blkif); 441 + if (atomic_read(&pending_req->blkif->refcnt) <= 2) { 442 + if (atomic_read(&pending_req->blkif->drain)) 443 + complete(&pending_req->blkif->drain_complete); 444 + } 505 445 free_req(pending_req); 506 446 } 507 447 } ··· 600 532 601 533 return more_to_do; 602 534 } 603 - 604 535 /* 605 536 * Transmutation of the 'struct blkif_request' to a proper 'struct bio' 606 537 * and call the 'submit_bio' to pass it to the underlying storage. 
··· 616 549 int i, nbio = 0; 617 550 int operation; 618 551 struct blk_plug plug; 552 + bool drain = false; 619 553 620 554 switch (req->operation) { 621 555 case BLKIF_OP_READ: ··· 627 559 blkif->st_wr_req++; 628 560 operation = WRITE_ODIRECT; 629 561 break; 562 + case BLKIF_OP_WRITE_BARRIER: 563 + drain = true; 630 564 case BLKIF_OP_FLUSH_DISKCACHE: 631 565 blkif->st_f_req++; 632 566 operation = WRITE_FLUSH; 633 567 break; 634 - case BLKIF_OP_WRITE_BARRIER: 568 + case BLKIF_OP_DISCARD: 569 + blkif->st_ds_req++; 570 + operation = REQ_DISCARD; 571 + break; 635 572 default: 636 573 operation = 0; /* make gcc happy */ 637 574 goto fail_response; ··· 645 572 646 573 /* Check that the number of segments is sane. */ 647 574 nseg = req->nr_segments; 648 - if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || 575 + if (unlikely(nseg == 0 && operation != WRITE_FLUSH && 576 + operation != REQ_DISCARD) || 649 577 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 650 578 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", 651 579 nseg); ··· 695 621 } 696 622 } 697 623 624 + /* Wait on all outstanding I/O's and once that has been completed 625 + * issue the WRITE_FLUSH. 626 + */ 627 + if (drain) 628 + xen_blk_drain_io(pending_req->blkif); 629 + 698 630 /* 699 631 * If we have failed at this point, we need to undo the M2P override, 700 632 * set gnttab_set_unmap_op on all of the grant references and perform 701 633 * the hypercall to unmap the grants - that is all done in 702 634 * xen_blkbk_unmap. 703 635 */ 704 - if (xen_blkbk_map(req, pending_req, seg)) 636 + if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg)) 705 637 goto fail_flush; 706 638 707 - /* This corresponding xen_blkif_put is done in __end_block_io_op */ 639 + /* 640 + * This corresponding xen_blkif_put is done in __end_block_io_op, or 641 + * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. 
642 + */ 708 643 xen_blkif_get(blkif); 709 644 710 645 for (i = 0; i < nseg; i++) { ··· 737 654 preq.sector_number += seg[i].nsec; 738 655 } 739 656 740 - /* This will be hit if the operation was a flush. */ 657 + /* This will be hit if the operation was a flush or discard. */ 741 658 if (!bio) { 742 - BUG_ON(operation != WRITE_FLUSH); 659 + BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD); 743 660 744 - bio = bio_alloc(GFP_KERNEL, 0); 745 - if (unlikely(bio == NULL)) 746 - goto fail_put_bio; 661 + if (operation == WRITE_FLUSH) { 662 + bio = bio_alloc(GFP_KERNEL, 0); 663 + if (unlikely(bio == NULL)) 664 + goto fail_put_bio; 747 665 748 - biolist[nbio++] = bio; 749 - bio->bi_bdev = preq.bdev; 750 - bio->bi_private = pending_req; 751 - bio->bi_end_io = end_block_io_op; 666 + biolist[nbio++] = bio; 667 + bio->bi_bdev = preq.bdev; 668 + bio->bi_private = pending_req; 669 + bio->bi_end_io = end_block_io_op; 670 + } else if (operation == REQ_DISCARD) { 671 + xen_blk_discard(blkif, req); 672 + xen_blkif_put(blkif); 673 + free_req(pending_req); 674 + return 0; 675 + } 752 676 } 753 677 754 678 /* ··· 775 685 776 686 if (operation == READ) 777 687 blkif->st_rd_sect += preq.nr_sects; 778 - else if (operation == WRITE || operation == WRITE_FLUSH) 688 + else if (operation & WRITE) 779 689 blkif->st_wr_sect += preq.nr_sects; 780 690 781 691 return 0; ··· 855 765 856 766 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; 857 767 858 - blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * 768 + blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * 859 769 xen_blkif_reqs, GFP_KERNEL); 860 - blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * 770 + blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * 861 771 mmap_pages, GFP_KERNEL); 862 772 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * 863 773 mmap_pages, GFP_KERNEL); ··· 879 789 rc = xen_blkif_interface_init(); 880 
790 if (rc) 881 791 goto failed_init; 882 - 883 - memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs)); 884 792 885 793 INIT_LIST_HEAD(&blkbk->pending_free); 886 794 spin_lock_init(&blkbk->pending_free_lock);
+81 -17
drivers/block/xen-blkback/common.h
··· 62 62 63 63 /* i386 protocol version */ 64 64 #pragma pack(push, 4) 65 + 66 + struct blkif_x86_32_request_rw { 67 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 68 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 69 + }; 70 + 71 + struct blkif_x86_32_request_discard { 72 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 73 + uint64_t nr_sectors; 74 + }; 75 + 65 76 struct blkif_x86_32_request { 66 77 uint8_t operation; /* BLKIF_OP_??? */ 67 78 uint8_t nr_segments; /* number of segments */ 68 79 blkif_vdev_t handle; /* only for read/write requests */ 69 80 uint64_t id; /* private guest value, echoed in resp */ 70 - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 71 - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 81 + union { 82 + struct blkif_x86_32_request_rw rw; 83 + struct blkif_x86_32_request_discard discard; 84 + } u; 72 85 }; 73 86 struct blkif_x86_32_response { 74 87 uint64_t id; /* copied from request */ ··· 91 78 #pragma pack(pop) 92 79 93 80 /* x86_64 protocol version */ 81 + 82 + struct blkif_x86_64_request_rw { 83 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 84 + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 85 + }; 86 + 87 + struct blkif_x86_64_request_discard { 88 + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 89 + uint64_t nr_sectors; 90 + }; 91 + 94 92 struct blkif_x86_64_request { 95 93 uint8_t operation; /* BLKIF_OP_??? 
*/ 96 94 uint8_t nr_segments; /* number of segments */ 97 95 blkif_vdev_t handle; /* only for read/write requests */ 98 96 uint64_t __attribute__((__aligned__(8))) id; 99 - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ 100 - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 97 + union { 98 + struct blkif_x86_64_request_rw rw; 99 + struct blkif_x86_64_request_discard discard; 100 + } u; 101 101 }; 102 102 struct blkif_x86_64_response { 103 103 uint64_t __attribute__((__aligned__(8))) id; ··· 138 112 BLKIF_PROTOCOL_X86_64 = 3, 139 113 }; 140 114 115 + enum blkif_backend_type { 116 + BLKIF_BACKEND_PHY = 1, 117 + BLKIF_BACKEND_FILE = 2, 118 + }; 119 + 141 120 struct xen_vbd { 142 121 /* What the domain refers to this vbd as. */ 143 122 blkif_vdev_t handle; ··· 168 137 unsigned int irq; 169 138 /* Comms information. */ 170 139 enum blkif_protocol blk_protocol; 140 + enum blkif_backend_type blk_backend_type; 171 141 union blkif_back_rings blk_rings; 172 142 struct vm_struct *blk_ring_area; 173 143 /* The VBD attached to this interface. */ ··· 180 148 atomic_t refcnt; 181 149 182 150 wait_queue_head_t wq; 151 + /* for barrier (drain) requests */ 152 + struct completion drain_complete; 153 + atomic_t drain; 183 154 /* One thread per one blkif. 
*/ 184 155 struct task_struct *xenblkd; 185 156 unsigned int waiting_reqs; ··· 193 158 int st_wr_req; 194 159 int st_oo_req; 195 160 int st_f_req; 161 + int st_ds_req; 196 162 int st_rd_sect; 197 163 int st_wr_sect; 198 164 ··· 217 181 218 182 struct phys_req { 219 183 unsigned short dev; 220 - unsigned short nr_sects; 184 + blkif_sector_t nr_sects; 221 185 struct block_device *bdev; 222 186 blkif_sector_t sector_number; 223 187 }; ··· 231 195 int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 232 196 struct backend_info *be, int state); 233 197 198 + int xen_blkbk_barrier(struct xenbus_transaction xbt, 199 + struct backend_info *be, int state); 234 200 struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); 235 201 236 202 static inline void blkif_get_x86_32_req(struct blkif_request *dst, ··· 243 205 dst->nr_segments = src->nr_segments; 244 206 dst->handle = src->handle; 245 207 dst->id = src->id; 246 - dst->u.rw.sector_number = src->sector_number; 247 - barrier(); 248 - if (n > dst->nr_segments) 249 - n = dst->nr_segments; 250 - for (i = 0; i < n; i++) 251 - dst->u.rw.seg[i] = src->seg[i]; 208 + switch (src->operation) { 209 + case BLKIF_OP_READ: 210 + case BLKIF_OP_WRITE: 211 + case BLKIF_OP_WRITE_BARRIER: 212 + case BLKIF_OP_FLUSH_DISKCACHE: 213 + dst->u.rw.sector_number = src->u.rw.sector_number; 214 + barrier(); 215 + if (n > dst->nr_segments) 216 + n = dst->nr_segments; 217 + for (i = 0; i < n; i++) 218 + dst->u.rw.seg[i] = src->u.rw.seg[i]; 219 + break; 220 + case BLKIF_OP_DISCARD: 221 + dst->u.discard.sector_number = src->u.discard.sector_number; 222 + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 223 + break; 224 + default: 225 + break; 226 + } 252 227 } 253 228 254 229 static inline void blkif_get_x86_64_req(struct blkif_request *dst, ··· 272 221 dst->nr_segments = src->nr_segments; 273 222 dst->handle = src->handle; 274 223 dst->id = src->id; 275 - dst->u.rw.sector_number = src->sector_number; 276 - barrier(); 277 - if (n > 
dst->nr_segments) 278 - n = dst->nr_segments; 279 - for (i = 0; i < n; i++) 280 - dst->u.rw.seg[i] = src->seg[i]; 224 + switch (src->operation) { 225 + case BLKIF_OP_READ: 226 + case BLKIF_OP_WRITE: 227 + case BLKIF_OP_WRITE_BARRIER: 228 + case BLKIF_OP_FLUSH_DISKCACHE: 229 + dst->u.rw.sector_number = src->u.rw.sector_number; 230 + barrier(); 231 + if (n > dst->nr_segments) 232 + n = dst->nr_segments; 233 + for (i = 0; i < n; i++) 234 + dst->u.rw.seg[i] = src->u.rw.seg[i]; 235 + break; 236 + case BLKIF_OP_DISCARD: 237 + dst->u.discard.sector_number = src->u.discard.sector_number; 238 + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 239 + break; 240 + default: 241 + break; 242 + } 281 243 } 282 244 283 245 #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
+76
drivers/block/xen-blkback/xenbus.c
··· 114 114 spin_lock_init(&blkif->blk_ring_lock); 115 115 atomic_set(&blkif->refcnt, 1); 116 116 init_waitqueue_head(&blkif->wq); 117 + init_completion(&blkif->drain_complete); 118 + atomic_set(&blkif->drain, 0); 117 119 blkif->st_print = jiffies; 118 120 init_waitqueue_head(&blkif->waiting_to_free); 119 121 ··· 274 272 VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); 275 273 VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); 276 274 VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); 275 + VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req); 277 276 VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); 278 277 VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); 279 278 ··· 283 280 &dev_attr_rd_req.attr, 284 281 &dev_attr_wr_req.attr, 285 282 &dev_attr_f_req.attr, 283 + &dev_attr_ds_req.attr, 286 284 &dev_attr_rd_sect.attr, 287 285 &dev_attr_wr_sect.attr, 288 286 NULL ··· 419 415 "%d", state); 420 416 if (err) 421 417 xenbus_dev_fatal(dev, err, "writing feature-flush-cache"); 418 + 419 + return err; 420 + } 421 + 422 + int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) 423 + { 424 + struct xenbus_device *dev = be->dev; 425 + struct xen_blkif *blkif = be->blkif; 426 + char *type; 427 + int err; 428 + int state = 0; 429 + 430 + type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL); 431 + if (!IS_ERR(type)) { 432 + if (strncmp(type, "file", 4) == 0) { 433 + state = 1; 434 + blkif->blk_backend_type = BLKIF_BACKEND_FILE; 435 + } 436 + if (strncmp(type, "phy", 3) == 0) { 437 + struct block_device *bdev = be->blkif->vbd.bdev; 438 + struct request_queue *q = bdev_get_queue(bdev); 439 + if (blk_queue_discard(q)) { 440 + err = xenbus_printf(xbt, dev->nodename, 441 + "discard-granularity", "%u", 442 + q->limits.discard_granularity); 443 + if (err) { 444 + xenbus_dev_fatal(dev, err, 445 + "writing discard-granularity"); 446 + goto kfree; 447 + } 448 + err = xenbus_printf(xbt, dev->nodename, 449 + "discard-alignment", "%u", 450 + q->limits.discard_alignment); 451 + if 
(err) { 452 + xenbus_dev_fatal(dev, err, 453 + "writing discard-alignment"); 454 + goto kfree; 455 + } 456 + state = 1; 457 + blkif->blk_backend_type = BLKIF_BACKEND_PHY; 458 + } 459 + } 460 + } else { 461 + err = PTR_ERR(type); 462 + xenbus_dev_fatal(dev, err, "reading type"); 463 + goto out; 464 + } 465 + 466 + err = xenbus_printf(xbt, dev->nodename, "feature-discard", 467 + "%d", state); 468 + if (err) 469 + xenbus_dev_fatal(dev, err, "writing feature-discard"); 470 + kfree: 471 + kfree(type); 472 + out: 473 + return err; 474 + } 475 + int xen_blkbk_barrier(struct xenbus_transaction xbt, 476 + struct backend_info *be, int state) 477 + { 478 + struct xenbus_device *dev = be->dev; 479 + int err; 480 + 481 + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", 482 + "%d", state); 483 + if (err) 484 + xenbus_dev_fatal(dev, err, "writing feature-barrier"); 422 485 423 486 return err; 424 487 } ··· 720 649 err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); 721 650 if (err) 722 651 goto abort; 652 + 653 + err = xen_blkbk_discard(xbt, be); 654 + 655 + /* If we can't advertise it is OK. */ 656 + err = xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); 723 657 724 658 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", 725 659 (unsigned long long)vbd_sz(&be->blkif->vbd));
+97 -24
drivers/block/xen-blkfront.c
··· 98 98 unsigned long shadow_free; 99 99 unsigned int feature_flush; 100 100 unsigned int flush_op; 101 + unsigned int feature_discard; 102 + unsigned int discard_granularity; 103 + unsigned int discard_alignment; 101 104 int is_ready; 102 105 }; 103 106 ··· 305 302 ring_req->operation = info->flush_op; 306 303 } 307 304 308 - ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 309 - BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); 305 + if (unlikely(req->cmd_flags & REQ_DISCARD)) { 306 + /* id, sector_number and handle are set above. */ 307 + ring_req->operation = BLKIF_OP_DISCARD; 308 + ring_req->nr_segments = 0; 309 + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 310 + } else { 311 + ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 312 + BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); 310 313 311 - for_each_sg(info->sg, sg, ring_req->nr_segments, i) { 312 - buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); 313 - fsect = sg->offset >> 9; 314 - lsect = fsect + (sg->length >> 9) - 1; 315 - /* install a grant reference. */ 316 - ref = gnttab_claim_grant_reference(&gref_head); 317 - BUG_ON(ref == -ENOSPC); 314 + for_each_sg(info->sg, sg, ring_req->nr_segments, i) { 315 + buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); 316 + fsect = sg->offset >> 9; 317 + lsect = fsect + (sg->length >> 9) - 1; 318 + /* install a grant reference. 
*/ 319 + ref = gnttab_claim_grant_reference(&gref_head); 320 + BUG_ON(ref == -ENOSPC); 318 321 319 - gnttab_grant_foreign_access_ref( 320 - ref, 321 - info->xbdev->otherend_id, 322 - buffer_mfn, 323 - rq_data_dir(req) ); 322 + gnttab_grant_foreign_access_ref( 323 + ref, 324 + info->xbdev->otherend_id, 325 + buffer_mfn, 326 + rq_data_dir(req)); 324 327 325 - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); 326 - ring_req->u.rw.seg[i] = 327 - (struct blkif_request_segment) { 328 - .gref = ref, 329 - .first_sect = fsect, 330 - .last_sect = lsect }; 328 + info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); 329 + ring_req->u.rw.seg[i] = 330 + (struct blkif_request_segment) { 331 + .gref = ref, 332 + .first_sect = fsect, 333 + .last_sect = lsect }; 334 + } 331 335 } 332 336 333 337 info->ring.req_prod_pvt++; ··· 380 370 381 371 blk_start_request(req); 382 372 383 - if (req->cmd_type != REQ_TYPE_FS) { 373 + if ((req->cmd_type != REQ_TYPE_FS) || 374 + ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && 375 + !info->flush_op)) { 384 376 __blk_end_request_all(req, -EIO); 385 377 continue; 386 378 } ··· 411 399 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 412 400 { 413 401 struct request_queue *rq; 402 + struct blkfront_info *info = gd->private_data; 414 403 415 404 rq = blk_init_queue(do_blkif_request, &blkif_io_lock); 416 405 if (rq == NULL) 417 406 return -1; 418 407 419 408 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 409 + 410 + if (info->feature_discard) { 411 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq); 412 + blk_queue_max_discard_sectors(rq, get_capacity(gd)); 413 + rq->limits.discard_granularity = info->discard_granularity; 414 + rq->limits.discard_alignment = info->discard_alignment; 415 + } 420 416 421 417 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 422 418 blk_queue_logical_block_size(rq, sector_size); ··· 742 722 743 723 error = (bret->status == BLKIF_RSP_OKAY) ? 
0 : -EIO; 744 724 switch (bret->operation) { 725 + case BLKIF_OP_DISCARD: 726 + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 727 + struct request_queue *rq = info->rq; 728 + printk(KERN_WARNING "blkfront: %s: discard op failed\n", 729 + info->gd->disk_name); 730 + error = -EOPNOTSUPP; 731 + info->feature_discard = 0; 732 + queue_flag_clear(QUEUE_FLAG_DISCARD, rq); 733 + } 734 + __blk_end_request_all(req, error); 735 + break; 745 736 case BLKIF_OP_FLUSH_DISKCACHE: 746 737 case BLKIF_OP_WRITE_BARRIER: 747 738 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { ··· 1129 1098 bdput(bdev); 1130 1099 } 1131 1100 1101 + static void blkfront_setup_discard(struct blkfront_info *info) 1102 + { 1103 + int err; 1104 + char *type; 1105 + unsigned int discard_granularity; 1106 + unsigned int discard_alignment; 1107 + 1108 + type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); 1109 + if (IS_ERR(type)) 1110 + return; 1111 + 1112 + if (strncmp(type, "phy", 3) == 0) { 1113 + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1114 + "discard-granularity", "%u", &discard_granularity, 1115 + "discard-alignment", "%u", &discard_alignment, 1116 + NULL); 1117 + if (!err) { 1118 + info->feature_discard = 1; 1119 + info->discard_granularity = discard_granularity; 1120 + info->discard_alignment = discard_alignment; 1121 + } 1122 + } else if (strncmp(type, "file", 4) == 0) 1123 + info->feature_discard = 1; 1124 + 1125 + kfree(type); 1126 + } 1127 + 1132 1128 /* 1133 1129 * Invoked when the backend is finally 'ready' (and has told produced 1134 1130 * the details about the physical device - #sectors, size, etc). 
··· 1166 1108 unsigned long sector_size; 1167 1109 unsigned int binfo; 1168 1110 int err; 1169 - int barrier, flush; 1111 + int barrier, flush, discard; 1170 1112 1171 1113 switch (info->connected) { 1172 1114 case BLKIF_STATE_CONNECTED: ··· 1236 1178 info->feature_flush = REQ_FLUSH; 1237 1179 info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; 1238 1180 } 1239 - 1181 + 1182 + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1183 + "feature-discard", "%d", &discard, 1184 + NULL); 1185 + 1186 + if (!err && discard) 1187 + blkfront_setup_discard(info); 1188 + 1240 1189 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1241 1190 if (err) { 1242 1191 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", ··· 1450 1385 1451 1386 static int __init xlblk_init(void) 1452 1387 { 1388 + int ret; 1389 + 1453 1390 if (!xen_domain()) 1454 1391 return -ENODEV; 1455 1392 ··· 1461 1394 return -ENODEV; 1462 1395 } 1463 1396 1464 - return xenbus_register_frontend(&blkfront); 1397 + ret = xenbus_register_frontend(&blkfront); 1398 + if (ret) { 1399 + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); 1400 + return ret; 1401 + } 1402 + 1403 + return 0; 1465 1404 } 1466 1405 module_init(xlblk_init); 1467 1406
+7
drivers/scsi/hpsa.c
··· 3300 3300 pmcsr &= ~PCI_PM_CTRL_STATE_MASK; 3301 3301 pmcsr |= PCI_D0; 3302 3302 pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); 3303 + 3304 + /* 3305 + * The P600 requires a small delay when changing states. 3306 + * Otherwise we may think the board did not reset and we bail. 3307 + * This for kdump only and is particular to the P600. 3308 + */ 3309 + msleep(500); 3303 3310 } 3304 3311 return 0; 3305 3312 }
+1 -1
fs/block_dev.c
··· 971 971 972 972 if (!bdev->bd_disk) 973 973 return; 974 - if (disk_partitionable(bdev->bd_disk)) 974 + if (disk_part_scan_enabled(bdev->bd_disk)) 975 975 bdev->bd_invalidated = 1; 976 976 } 977 977
+4 -2
include/linux/genhd.h
··· 131 131 #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ 132 132 #define GENHD_FL_NATIVE_CAPACITY 128 133 133 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 134 + #define GENHD_FL_NO_PART_SCAN 512 134 135 135 136 enum { 136 137 DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ ··· 239 238 return disk->minors; 240 239 } 241 240 242 - static inline bool disk_partitionable(struct gendisk *disk) 241 + static inline bool disk_part_scan_enabled(struct gendisk *disk) 243 242 { 244 - return disk_max_parts(disk) > 1; 243 + return disk_max_parts(disk) > 1 && 244 + !(disk->flags & GENHD_FL_NO_PART_SCAN); 245 245 } 246 246 247 247 static inline dev_t disk_devt(struct gendisk *disk)
+1
include/linux/loop.h
··· 74 74 enum { 75 75 LO_FLAGS_READ_ONLY = 1, 76 76 LO_FLAGS_AUTOCLEAR = 4, 77 + LO_FLAGS_PARTSCAN = 8, 77 78 }; 78 79 79 80 #include <asm/posix_types.h> /* for __kernel_old_dev_t */
+36
include/xen/interface/io/blkif.h
··· 57 57 * "feature-flush-cache" node! 58 58 */ 59 59 #define BLKIF_OP_FLUSH_DISKCACHE 3 60 + 61 + /* 62 + * Recognised only if "feature-discard" is present in backend xenbus info. 63 + * The "feature-discard" node contains a boolean indicating whether trim 64 + * (ATA) or unmap (SCSI) - conveniently called discard requests are likely 65 + * to succeed or fail. Either way, a discard request 66 + * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by 67 + * the underlying block-device hardware. The boolean simply indicates whether 68 + * or not it is worthwhile for the frontend to attempt discard requests. 69 + * If a backend does not recognise BLKIF_OP_DISCARD, it should *not* 70 + * create the "feature-discard" node! 71 + * 72 + * Discard operation is a request for the underlying block device to mark 73 + * extents to be erased. However, discard does not guarantee that the blocks 74 + * will be erased from the device - it is just a hint to the device 75 + * controller that these blocks are no longer in use. What the device 76 + * controller does with that information is left to the controller. 77 + * Discard operations are passed with sector_number as the 78 + * sector index to begin discard operations at and nr_sectors as the number of 79 + * sectors to be discarded. The specified sectors should be discarded if the 80 + * underlying block device supports trim (ATA) or unmap (SCSI) operations, 81 + * or a BLKIF_RSP_EOPNOTSUPP should be returned. 82 + * More information about trim/unmap operations at: 83 + * http://t13.org/Documents/UploadedDocuments/docs2008/ 84 + * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc 85 + * http://www.seagate.com/staticfiles/support/disc/manuals/ 86 + * Interface%20manuals/100293068c.pdf 87 + */ 88 + #define BLKIF_OP_DISCARD 5 89 + 60 90 /* 61 91 * Maximum scatter/gather segments per request. 62 92 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. 
··· 104 74 } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 105 75 }; 106 76 77 + struct blkif_request_discard { 78 + blkif_sector_t sector_number; 79 + uint64_t nr_sectors; 80 + }; 81 + 107 82 struct blkif_request { 108 83 uint8_t operation; /* BLKIF_OP_??? */ 109 84 uint8_t nr_segments; /* number of segments */ ··· 116 81 uint64_t id; /* private guest value, echoed in resp */ 117 82 union { 118 83 struct blkif_request_rw rw; 84 + struct blkif_request_discard discard; 119 85 } u; 120 86 }; 121 87