Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.17/drivers' of git://git.kernel.dk/linux-block

Pull block driver changes from Jens Axboe:
"Nothing out of the ordinary here, this pull request contains:

- A big round of fixes for bcache from Kent Overstreet, Slava Pestov,
and Surbhi Palande. No new features, just a lot of fixes.

- The usual round of drbd updates from Andreas Gruenbacher, Lars
Ellenberg, and Philipp Reisner.

- virtio_blk was converted to blk-mq back in 3.13, but now Ming Lei
has taken it one step further and added support for actually using
more than one queue.

- Addition of an explicit SG_FLAG_Q_AT_HEAD for block/bsg, to
complement the default behavior of adding to the tail of the
queue. From Douglas Gilbert"

* 'for-3.17/drivers' of git://git.kernel.dk/linux-block: (86 commits)
bcache: Drop unneeded blk_sync_queue() calls
bcache: add mutex lock for bch_is_open
bcache: Correct printing of btree_gc_max_duration_ms
bcache: try to set b->parent properly
bcache: fix memory corruption in init error path
bcache: fix crash with incomplete cache set
bcache: Fix more early shutdown bugs
bcache: fix use-after-free in btree_gc_coalesce()
bcache: Fix an infinite loop in journal replay
bcache: fix crash in bcache_btree_node_alloc_fail tracepoint
bcache: bcache_write tracepoint was crashing
bcache: fix typo in bch_bkey_equal_header
bcache: Allocate bounce buffers with GFP_NOWAIT
bcache: Make sure to pass GFP_WAIT to mempool_alloc()
bcache: fix uninterruptible sleep in writeback thread
bcache: wait for buckets when allocating new btree root
bcache: fix crash on shutdown in passthrough mode
bcache: fix lockdep warnings on shutdown
bcache allocator: send discards with correct size
bcache: Fix to remove the rcu_sched stalls.
...

+2941 -1320
+4 -1
block/scsi_ioctl.c
··· 290 290 unsigned long start_time; 291 291 ssize_t ret = 0; 292 292 int writing = 0; 293 + int at_head = 0; 293 294 struct request *rq; 294 295 char sense[SCSI_SENSE_BUFFERSIZE]; 295 296 struct bio *bio; ··· 314 313 case SG_DXFER_FROM_DEV: 315 314 break; 316 315 } 316 + if (hdr->flags & SG_FLAG_Q_AT_HEAD) 317 + at_head = 1; 317 318 318 319 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); 319 320 if (!rq) ··· 372 369 * (if he doesn't check that is his problem). 373 370 * N.B. a non-zero SCSI status is _not_ necessarily an error. 374 371 */ 375 - blk_execute_rq(q, bd_disk, rq, 0); 372 + blk_execute_rq(q, bd_disk, rq, at_head); 376 373 377 374 hdr->duration = jiffies_to_msecs(jiffies - start_time); 378 375
+1
drivers/block/drbd/Makefile
··· 3 3 drbd-y += drbd_main.o drbd_strings.o drbd_nl.o 4 4 drbd-y += drbd_interval.o drbd_state.o 5 5 drbd-y += drbd_nla.o 6 + drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o 6 7 7 8 obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
+202 -320
drivers/block/drbd/drbd_actlog.c
··· 92 92 __be32 context[AL_CONTEXT_PER_TRANSACTION]; 93 93 }; 94 94 95 - struct update_odbm_work { 96 - struct drbd_work w; 97 - struct drbd_device *device; 98 - unsigned int enr; 99 - }; 100 - 101 - struct update_al_work { 102 - struct drbd_work w; 103 - struct drbd_device *device; 104 - struct completion event; 105 - int err; 106 - }; 107 - 108 - 109 - void *drbd_md_get_buffer(struct drbd_device *device) 95 + void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) 110 96 { 111 97 int r; 112 98 113 99 wait_event(device->misc_wait, 114 - (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 || 100 + (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || 115 101 device->state.disk <= D_FAILED); 116 102 117 - return r ? NULL : page_address(device->md_io_page); 103 + if (r) 104 + return NULL; 105 + 106 + device->md_io.current_use = intent; 107 + device->md_io.start_jif = jiffies; 108 + device->md_io.submit_jif = device->md_io.start_jif - 1; 109 + return page_address(device->md_io.page); 118 110 } 119 111 120 112 void drbd_md_put_buffer(struct drbd_device *device) 121 113 { 122 - if (atomic_dec_and_test(&device->md_io_in_use)) 114 + if (atomic_dec_and_test(&device->md_io.in_use)) 123 115 wake_up(&device->misc_wait); 124 116 } 125 117 ··· 137 145 138 146 static int _drbd_md_sync_page_io(struct drbd_device *device, 139 147 struct drbd_backing_dev *bdev, 140 - struct page *page, sector_t sector, 141 - int rw, int size) 148 + sector_t sector, int rw) 142 149 { 143 150 struct bio *bio; 151 + /* we do all our meta data IO in aligned 4k blocks. 
*/ 152 + const int size = 4096; 144 153 int err; 145 154 146 155 device->md_io.done = 0; ··· 149 156 150 157 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) 151 158 rw |= REQ_FUA | REQ_FLUSH; 152 - rw |= REQ_SYNC; 159 + rw |= REQ_SYNC | REQ_NOIDLE; 153 160 154 161 bio = bio_alloc_drbd(GFP_NOIO); 155 162 bio->bi_bdev = bdev->md_bdev; 156 163 bio->bi_iter.bi_sector = sector; 157 164 err = -EIO; 158 - if (bio_add_page(bio, page, size, 0) != size) 165 + if (bio_add_page(bio, device->md_io.page, size, 0) != size) 159 166 goto out; 160 - bio->bi_private = &device->md_io; 167 + bio->bi_private = device; 161 168 bio->bi_end_io = drbd_md_io_complete; 162 169 bio->bi_rw = rw; 163 170 ··· 172 179 } 173 180 174 181 bio_get(bio); /* one bio_put() is in the completion handler */ 175 - atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ 182 + atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ 183 + device->md_io.submit_jif = jiffies; 176 184 if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 177 185 bio_endio(bio, -EIO); 178 186 else ··· 191 197 sector_t sector, int rw) 192 198 { 193 199 int err; 194 - struct page *iop = device->md_io_page; 195 - 196 - D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1); 200 + D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); 197 201 198 202 BUG_ON(!bdev->md_bdev); 199 203 ··· 206 214 current->comm, current->pid, __func__, 207 215 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 208 216 209 - /* we do all our meta data IO in aligned 4k blocks. */ 210 - err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096); 217 + err = _drbd_md_sync_page_io(device, bdev, sector, rw); 211 218 if (err) { 212 219 drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 213 220 (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ", err); ··· 288 297 return need_transaction; 289 298 } 290 299 291 - static int al_write_transaction(struct drbd_device *device, bool delegate); 300 + static int al_write_transaction(struct drbd_device *device); 292 301 293 - /* When called through generic_make_request(), we must delegate 294 - * activity log I/O to the worker thread: a further request 295 - * submitted via generic_make_request() within the same task 296 - * would be queued on current->bio_list, and would only start 297 - * after this function returns (see generic_make_request()). 298 - * 299 - * However, if we *are* the worker, we must not delegate to ourselves. 300 - */ 301 - 302 - /* 303 - * @delegate: delegate activity log I/O to the worker thread 304 - */ 305 - void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate) 302 + void drbd_al_begin_io_commit(struct drbd_device *device) 306 303 { 307 304 bool locked = false; 308 - 309 - BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task); 310 305 311 306 /* Serialize multiple transactions. 312 307 * This uses test_and_set_bit, memory barrier is implicit. 
··· 312 335 rcu_read_unlock(); 313 336 314 337 if (write_al_updates) 315 - al_write_transaction(device, delegate); 338 + al_write_transaction(device); 316 339 spin_lock_irq(&device->al_lock); 317 340 /* FIXME 318 341 if (err) ··· 329 352 /* 330 353 * @delegate: delegate activity log I/O to the worker thread 331 354 */ 332 - void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate) 355 + void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) 333 356 { 334 - BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task); 335 - 336 357 if (drbd_al_begin_io_prepare(device, i)) 337 - drbd_al_begin_io_commit(device, delegate); 358 + drbd_al_begin_io_commit(device); 338 359 } 339 360 340 361 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) ··· 355 380 /* We want all necessary updates for a given request within the same transaction 356 381 * We could first check how many updates are *actually* needed, 357 382 * and use that instead of the worst-case nr_al_extents */ 358 - if (available_update_slots < nr_al_extents) 359 - return -EWOULDBLOCK; 383 + if (available_update_slots < nr_al_extents) { 384 + /* Too many activity log extents are currently "hot". 385 + * 386 + * If we have accumulated pending changes already, 387 + * we made progress. 388 + * 389 + * If we cannot get even a single pending change through, 390 + * stop the fast path until we made some progress, 391 + * or requests to "cold" extents could be starved. */ 392 + if (!al->pending_changes) 393 + __set_bit(__LC_STARVING, &device->act_log->flags); 394 + return -ENOBUFS; 395 + } 360 396 361 397 /* Is resync active in this area? 
*/ 362 398 for (enr = first; enr <= last; enr++) { ··· 438 452 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 439 453 } 440 454 441 - static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) 442 - { 443 - return rs_enr >> 444 - /* bit to page */ 445 - ((PAGE_SHIFT + 3) - 446 - /* resync extent number to bit */ 447 - (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); 448 - } 449 - 450 455 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) 451 456 { 452 457 const unsigned int stripes = device->ldev->md.al_stripes; ··· 456 479 return device->ldev->md.md_offset + device->ldev->md.al_offset + t; 457 480 } 458 481 459 - static int 460 - _al_write_transaction(struct drbd_device *device) 482 + int al_write_transaction(struct drbd_device *device) 461 483 { 462 484 struct al_transaction_on_disk *buffer; 463 485 struct lc_element *e; ··· 481 505 return -EIO; 482 506 } 483 507 484 - buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */ 508 + /* protects md_io_buffer, al_tr_cycle, ... */ 509 + buffer = drbd_md_get_buffer(device, __func__); 485 510 if (!buffer) { 486 511 drbd_err(device, "disk failed while waiting for md_io buffer\n"); 487 512 put_ldev(device); ··· 567 590 return err; 568 591 } 569 592 570 - 571 - static int w_al_write_transaction(struct drbd_work *w, int unused) 572 - { 573 - struct update_al_work *aw = container_of(w, struct update_al_work, w); 574 - struct drbd_device *device = aw->device; 575 - int err; 576 - 577 - err = _al_write_transaction(device); 578 - aw->err = err; 579 - complete(&aw->event); 580 - 581 - return err != -EIO ? err : 0; 582 - } 583 - 584 - /* Calls from worker context (see w_restart_disk_io()) need to write the 585 - transaction directly. Others came through generic_make_request(), 586 - those need to delegate it to the worker. 
*/ 587 - static int al_write_transaction(struct drbd_device *device, bool delegate) 588 - { 589 - if (delegate) { 590 - struct update_al_work al_work; 591 - init_completion(&al_work.event); 592 - al_work.w.cb = w_al_write_transaction; 593 - al_work.device = device; 594 - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, 595 - &al_work.w); 596 - wait_for_completion(&al_work.event); 597 - return al_work.err; 598 - } else 599 - return _al_write_transaction(device); 600 - } 601 - 602 593 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 603 594 { 604 595 int rv; ··· 627 682 return 0; 628 683 } 629 684 630 - static int w_update_odbm(struct drbd_work *w, int unused) 631 - { 632 - struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); 633 - struct drbd_device *device = udw->device; 634 - struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; 635 - 636 - if (!get_ldev(device)) { 637 - if (__ratelimit(&drbd_ratelimit_state)) 638 - drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n"); 639 - kfree(udw); 640 - return 0; 641 - } 642 - 643 - drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr)); 644 - put_ldev(device); 645 - 646 - kfree(udw); 647 - 648 - if (drbd_bm_total_weight(device) <= device->rs_failed) { 649 - switch (device->state.conn) { 650 - case C_SYNC_SOURCE: case C_SYNC_TARGET: 651 - case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: 652 - drbd_resync_finished(device); 653 - default: 654 - /* nothing to do */ 655 - break; 656 - } 657 - } 658 - drbd_bcast_event(device, &sib); 659 - 660 - return 0; 661 - } 662 - 685 + static const char *drbd_change_sync_fname[] = { 686 + [RECORD_RS_FAILED] = "drbd_rs_failed_io", 687 + [SET_IN_SYNC] = "drbd_set_in_sync", 688 + [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" 689 + }; 663 690 664 691 /* ATTENTION. The AL's extents are 4MB each, while the extents in the 665 692 * resync LRU-cache are 16MB each. 
666 693 * The caller of this function has to hold an get_ldev() reference. 667 694 * 695 + * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), 696 + * potentially pulling in (and recounting the corresponding bits) 697 + * this resync extent into the resync extent lru cache. 698 + * 699 + * Returns whether all bits have been cleared for this resync extent, 700 + * precisely: (rs_left <= rs_failed) 701 + * 668 702 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 669 703 */ 670 - static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector, 671 - int count, int success) 704 + static bool update_rs_extent(struct drbd_device *device, 705 + unsigned int enr, int count, 706 + enum update_sync_bits_mode mode) 672 707 { 673 708 struct lc_element *e; 674 - struct update_odbm_work *udw; 675 - 676 - unsigned int enr; 677 709 678 710 D_ASSERT(device, atomic_read(&device->local_cnt)); 679 711 680 - /* I simply assume that a sector/size pair never crosses 681 - * a 16 MB extent border. (Currently this is true...) */ 682 - enr = BM_SECT_TO_EXT(sector); 683 - 684 - e = lc_get(device->resync, enr); 712 + /* When setting out-of-sync bits, 713 + * we don't need it cached (lc_find). 714 + * But if it is present in the cache, 715 + * we should update the cached bit count. 716 + * Otherwise, that extent should be in the resync extent lru cache 717 + * already -- or we want to pull it in if necessary -- (lc_get), 718 + * then update and check rs_left and rs_failed. 
*/ 719 + if (mode == SET_OUT_OF_SYNC) 720 + e = lc_find(device->resync, enr); 721 + else 722 + e = lc_get(device->resync, enr); 685 723 if (e) { 686 724 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 687 725 if (ext->lce.lc_number == enr) { 688 - if (success) 726 + if (mode == SET_IN_SYNC) 689 727 ext->rs_left -= count; 728 + else if (mode == SET_OUT_OF_SYNC) 729 + ext->rs_left += count; 690 730 else 691 731 ext->rs_failed += count; 692 732 if (ext->rs_left < ext->rs_failed) { 693 - drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d " 733 + drbd_warn(device, "BAD! enr=%u rs_left=%d " 694 734 "rs_failed=%d count=%d cstate=%s\n", 695 - (unsigned long long)sector, 696 735 ext->lce.lc_number, ext->rs_left, 697 736 ext->rs_failed, count, 698 737 drbd_conn_str(device->state.conn)); ··· 710 781 ext->lce.lc_number, ext->rs_failed); 711 782 } 712 783 ext->rs_left = rs_left; 713 - ext->rs_failed = success ? 0 : count; 784 + ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; 714 785 /* we don't keep a persistent log of the resync lru, 715 786 * we can commit any change right away. */ 716 787 lc_committed(device->resync); 717 788 } 718 - lc_put(device->resync, &ext->lce); 789 + if (mode != SET_OUT_OF_SYNC) 790 + lc_put(device->resync, &ext->lce); 719 791 /* no race, we are within the al_lock! */ 720 792 721 - if (ext->rs_left == ext->rs_failed) { 793 + if (ext->rs_left <= ext->rs_failed) { 722 794 ext->rs_failed = 0; 723 - 724 - udw = kmalloc(sizeof(*udw), GFP_ATOMIC); 725 - if (udw) { 726 - udw->enr = ext->lce.lc_number; 727 - udw->w.cb = w_update_odbm; 728 - udw->device = device; 729 - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, 730 - &udw->w); 731 - } else { 732 - drbd_warn(device, "Could not kmalloc an udw\n"); 733 - } 795 + return true; 734 796 } 735 - } else { 797 + } else if (mode != SET_OUT_OF_SYNC) { 798 + /* be quiet if lc_find() did not find it. */ 736 799 drbd_err(device, "lc_get() failed! 
locked=%d/%d flags=%lu\n", 737 800 device->resync_locked, 738 801 device->resync->nr_elements, 739 802 device->resync->flags); 740 803 } 804 + return false; 741 805 } 742 806 743 807 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) ··· 749 827 } 750 828 } 751 829 830 + /* It is called lazy update, so don't do write-out too often. */ 831 + static bool lazy_bitmap_update_due(struct drbd_device *device) 832 + { 833 + return time_after(jiffies, device->rs_last_bcast + 2*HZ); 834 + } 835 + 836 + static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) 837 + { 838 + if (rs_done) 839 + set_bit(RS_DONE, &device->flags); 840 + /* and also set RS_PROGRESS below */ 841 + else if (!lazy_bitmap_update_due(device)) 842 + return; 843 + 844 + drbd_device_post_work(device, RS_PROGRESS); 845 + } 846 + 847 + static int update_sync_bits(struct drbd_device *device, 848 + unsigned long sbnr, unsigned long ebnr, 849 + enum update_sync_bits_mode mode) 850 + { 851 + /* 852 + * We keep a count of set bits per resync-extent in the ->rs_left 853 + * caching member, so we need to loop and work within the resync extent 854 + * alignment. Typically this loop will execute exactly once. 855 + */ 856 + unsigned long flags; 857 + unsigned long count = 0; 858 + unsigned int cleared = 0; 859 + while (sbnr <= ebnr) { 860 + /* set temporary boundary bit number to last bit number within 861 + * the resync extent of the current start bit number, 862 + * but cap at provided end bit number */ 863 + unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); 864 + unsigned long c; 865 + 866 + if (mode == RECORD_RS_FAILED) 867 + /* Only called from drbd_rs_failed_io(), bits 868 + * supposedly still set. Recount, maybe some 869 + * of the bits have been successfully cleared 870 + * by application IO meanwhile. 
871 + */ 872 + c = drbd_bm_count_bits(device, sbnr, tbnr); 873 + else if (mode == SET_IN_SYNC) 874 + c = drbd_bm_clear_bits(device, sbnr, tbnr); 875 + else /* if (mode == SET_OUT_OF_SYNC) */ 876 + c = drbd_bm_set_bits(device, sbnr, tbnr); 877 + 878 + if (c) { 879 + spin_lock_irqsave(&device->al_lock, flags); 880 + cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); 881 + spin_unlock_irqrestore(&device->al_lock, flags); 882 + count += c; 883 + } 884 + sbnr = tbnr + 1; 885 + } 886 + if (count) { 887 + if (mode == SET_IN_SYNC) { 888 + unsigned long still_to_go = drbd_bm_total_weight(device); 889 + bool rs_is_done = (still_to_go <= device->rs_failed); 890 + drbd_advance_rs_marks(device, still_to_go); 891 + if (cleared || rs_is_done) 892 + maybe_schedule_on_disk_bitmap_update(device, rs_is_done); 893 + } else if (mode == RECORD_RS_FAILED) 894 + device->rs_failed += count; 895 + wake_up(&device->al_wait); 896 + } 897 + return count; 898 + } 899 + 752 900 /* clear the bit corresponding to the piece of storage in question: 753 901 * size byte of data starting from sector. Only clear a bits of the affected 754 902 * one ore more _aligned_ BM_BLOCK_SIZE blocks. ··· 826 834 * called by worker on C_SYNC_TARGET and receiver on SyncSource. 827 835 * 828 836 */ 829 - void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, 830 - const char *file, const unsigned int line) 837 + int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 838 + enum update_sync_bits_mode mode, 839 + const char *file, const unsigned int line) 831 840 { 832 841 /* Is called from worker and receiver context _only_ */ 833 842 unsigned long sbnr, ebnr, lbnr; 834 843 unsigned long count = 0; 835 844 sector_t esector, nr_sectors; 836 - int wake_up = 0; 837 - unsigned long flags; 845 + 846 + /* This would be an empty REQ_FLUSH, be silent. 
*/ 847 + if ((mode == SET_OUT_OF_SYNC) && size == 0) 848 + return 0; 838 849 839 850 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 840 - drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 851 + drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", 852 + drbd_change_sync_fname[mode], 841 853 (unsigned long long)sector, size); 842 - return; 854 + return 0; 843 855 } 844 856 845 857 if (!get_ldev(device)) 846 - return; /* no disk, no metadata, no bitmap to clear bits in */ 858 + return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ 847 859 848 860 nr_sectors = drbd_get_capacity(device->this_bdev); 849 861 esector = sector + (size >> 9) - 1; ··· 859 863 860 864 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 861 865 862 - /* we clear it (in sync). 863 - * round up start sector, round down end sector. we make sure we only 864 - * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ 865 - if (unlikely(esector < BM_SECT_PER_BIT-1)) 866 - goto out; 867 - if (unlikely(esector == (nr_sectors-1))) 868 - ebnr = lbnr; 869 - else 870 - ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 871 - sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 872 - 873 - if (sbnr > ebnr) 874 - goto out; 875 - 876 - /* 877 - * ok, (capacity & 7) != 0 sometimes, but who cares... 878 - * we count rs_{total,left} in bits, not sectors. 879 - */ 880 - count = drbd_bm_clear_bits(device, sbnr, ebnr); 881 - if (count) { 882 - drbd_advance_rs_marks(device, drbd_bm_total_weight(device)); 883 - spin_lock_irqsave(&device->al_lock, flags); 884 - drbd_try_clear_on_disk_bm(device, sector, count, true); 885 - spin_unlock_irqrestore(&device->al_lock, flags); 886 - 887 - /* just wake_up unconditional now, various lc_chaged(), 888 - * lc_put() in drbd_try_clear_on_disk_bm(). 
*/ 889 - wake_up = 1; 890 - } 891 - out: 892 - put_ldev(device); 893 - if (wake_up) 894 - wake_up(&device->al_wait); 895 - } 896 - 897 - /* 898 - * this is intended to set one request worth of data out of sync. 899 - * affects at least 1 bit, 900 - * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. 901 - * 902 - * called by tl_clear and drbd_send_dblock (==drbd_make_request). 903 - * so this can be _any_ process. 904 - */ 905 - int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size, 906 - const char *file, const unsigned int line) 907 - { 908 - unsigned long sbnr, ebnr, flags; 909 - sector_t esector, nr_sectors; 910 - unsigned int enr, count = 0; 911 - struct lc_element *e; 912 - 913 - /* this should be an empty REQ_FLUSH */ 914 - if (size == 0) 915 - return 0; 916 - 917 - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 918 - drbd_err(device, "sector: %llus, size: %d\n", 919 - (unsigned long long)sector, size); 920 - return 0; 866 + if (mode == SET_IN_SYNC) { 867 + /* Round up start sector, round down end sector. We make sure 868 + * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ 869 + if (unlikely(esector < BM_SECT_PER_BIT-1)) 870 + goto out; 871 + if (unlikely(esector == (nr_sectors-1))) 872 + ebnr = lbnr; 873 + else 874 + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 875 + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 876 + } else { 877 + /* We set it out of sync, or record resync failure. 878 + * Should not round anything here. 
*/ 879 + sbnr = BM_SECT_TO_BIT(sector); 880 + ebnr = BM_SECT_TO_BIT(esector); 921 881 } 922 882 923 - if (!get_ldev(device)) 924 - return 0; /* no disk, no metadata, no bitmap to set bits in */ 925 - 926 - nr_sectors = drbd_get_capacity(device->this_bdev); 927 - esector = sector + (size >> 9) - 1; 928 - 929 - if (!expect(sector < nr_sectors)) 930 - goto out; 931 - if (!expect(esector < nr_sectors)) 932 - esector = nr_sectors - 1; 933 - 934 - /* we set it out of sync, 935 - * we do not need to round anything here */ 936 - sbnr = BM_SECT_TO_BIT(sector); 937 - ebnr = BM_SECT_TO_BIT(esector); 938 - 939 - /* ok, (capacity & 7) != 0 sometimes, but who cares... 940 - * we count rs_{total,left} in bits, not sectors. */ 941 - spin_lock_irqsave(&device->al_lock, flags); 942 - count = drbd_bm_set_bits(device, sbnr, ebnr); 943 - 944 - enr = BM_SECT_TO_EXT(sector); 945 - e = lc_find(device->resync, enr); 946 - if (e) 947 - lc_entry(e, struct bm_extent, lce)->rs_left += count; 948 - spin_unlock_irqrestore(&device->al_lock, flags); 949 - 883 + count = update_sync_bits(device, sbnr, ebnr, mode); 950 884 out: 951 885 put_ldev(device); 952 - 953 886 return count; 954 887 } 955 888 ··· 1000 1075 struct lc_element *e; 1001 1076 struct bm_extent *bm_ext; 1002 1077 int i; 1078 + bool throttle = drbd_rs_should_slow_down(device, sector, true); 1079 + 1080 + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, 1081 + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we 1082 + * need to throttle. There is at most one such half-locked extent, 1083 + * which is remembered in resync_wenr. 
*/ 1084 + 1085 + if (throttle && device->resync_wenr != enr) 1086 + return -EAGAIN; 1003 1087 1004 1088 spin_lock_irq(&device->al_lock); 1005 1089 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { ··· 1032 1098 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1033 1099 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1034 1100 device->resync_wenr = LC_FREE; 1035 - if (lc_put(device->resync, &bm_ext->lce) == 0) 1101 + if (lc_put(device->resync, &bm_ext->lce) == 0) { 1102 + bm_ext->flags = 0; 1036 1103 device->resync_locked--; 1104 + } 1037 1105 wake_up(&device->al_wait); 1038 1106 } else { 1039 1107 drbd_alert(device, "LOGIC BUG\n"); ··· 1097 1161 return 0; 1098 1162 1099 1163 try_again: 1100 - if (bm_ext) 1101 - device->resync_wenr = enr; 1164 + if (bm_ext) { 1165 + if (throttle) { 1166 + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1167 + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1168 + clear_bit(BME_NO_WRITES, &bm_ext->flags); 1169 + device->resync_wenr = LC_FREE; 1170 + if (lc_put(device->resync, &bm_ext->lce) == 0) { 1171 + bm_ext->flags = 0; 1172 + device->resync_locked--; 1173 + } 1174 + wake_up(&device->al_wait); 1175 + } else 1176 + device->resync_wenr = enr; 1177 + } 1102 1178 spin_unlock_irq(&device->al_lock); 1103 1179 return -EAGAIN; 1104 1180 } ··· 1217 1269 wake_up(&device->al_wait); 1218 1270 1219 1271 return 0; 1220 - } 1221 - 1222 - /** 1223 - * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks 1224 - * @device: DRBD device. 1225 - * @sector: The sector number. 1226 - * @size: Size of failed IO operation, in byte. 
1227 - */ 1228 - void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) 1229 - { 1230 - /* Is called from worker and receiver context _only_ */ 1231 - unsigned long sbnr, ebnr, lbnr; 1232 - unsigned long count; 1233 - sector_t esector, nr_sectors; 1234 - int wake_up = 0; 1235 - 1236 - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 1237 - drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1238 - (unsigned long long)sector, size); 1239 - return; 1240 - } 1241 - nr_sectors = drbd_get_capacity(device->this_bdev); 1242 - esector = sector + (size >> 9) - 1; 1243 - 1244 - if (!expect(sector < nr_sectors)) 1245 - return; 1246 - if (!expect(esector < nr_sectors)) 1247 - esector = nr_sectors - 1; 1248 - 1249 - lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1250 - 1251 - /* 1252 - * round up start sector, round down end sector. we make sure we only 1253 - * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ 1254 - if (unlikely(esector < BM_SECT_PER_BIT-1)) 1255 - return; 1256 - if (unlikely(esector == (nr_sectors-1))) 1257 - ebnr = lbnr; 1258 - else 1259 - ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 1260 - sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 1261 - 1262 - if (sbnr > ebnr) 1263 - return; 1264 - 1265 - /* 1266 - * ok, (capacity & 7) != 0 sometimes, but who cares... 1267 - * we count rs_{total,left} in bits, not sectors. 1268 - */ 1269 - spin_lock_irq(&device->al_lock); 1270 - count = drbd_bm_count_bits(device, sbnr, ebnr); 1271 - if (count) { 1272 - device->rs_failed += count; 1273 - 1274 - if (get_ldev(device)) { 1275 - drbd_try_clear_on_disk_bm(device, sector, count, false); 1276 - put_ldev(device); 1277 - } 1278 - 1279 - /* just wake_up unconditional now, various lc_chaged(), 1280 - * lc_put() in drbd_try_clear_on_disk_bm(). */ 1281 - wake_up = 1; 1282 - } 1283 - spin_unlock_irq(&device->al_lock); 1284 - if (wake_up) 1285 - wake_up(&device->al_wait); 1286 1272 }
+52 -98
drivers/block/drbd/drbd_bitmap.c
··· 22 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 23 */ 24 24 25 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 + 25 27 #include <linux/bitops.h> 26 28 #include <linux/vmalloc.h> 27 29 #include <linux/string.h> ··· 355 353 356 354 for (i = 0; i < number; i++) { 357 355 if (!pages[i]) { 358 - printk(KERN_ALERT "drbd: bm_free_pages tried to free " 359 - "a NULL pointer; i=%lu n=%lu\n", 360 - i, number); 356 + pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n", 357 + i, number); 361 358 continue; 362 359 } 363 360 __free_page(pages[i]); ··· 593 592 end = offset + len; 594 593 595 594 if (end > b->bm_words) { 596 - printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); 595 + pr_alert("bm_memset end > bm_words\n"); 597 596 return; 598 597 } 599 598 ··· 603 602 p_addr = bm_map_pidx(b, idx); 604 603 bm = p_addr + MLPP(offset); 605 604 if (bm+do_now > p_addr + LWPP) { 606 - printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", 605 + pr_alert("BUG BUG BUG! 
p_addr:%p bm:%p do_now:%d\n", 607 606 p_addr, bm, (int)do_now); 608 607 } else 609 608 memset(bm, c, do_now * sizeof(long)); ··· 928 927 spin_unlock_irq(&b->bm_lock); 929 928 } 930 929 931 - struct bm_aio_ctx { 932 - struct drbd_device *device; 933 - atomic_t in_flight; 934 - unsigned int done; 935 - unsigned flags; 936 - #define BM_AIO_COPY_PAGES 1 937 - #define BM_AIO_WRITE_HINTED 2 938 - #define BM_WRITE_ALL_PAGES 4 939 - int error; 940 - struct kref kref; 941 - }; 942 - 943 - static void bm_aio_ctx_destroy(struct kref *kref) 930 + static void drbd_bm_aio_ctx_destroy(struct kref *kref) 944 931 { 945 - struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); 932 + struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref); 933 + unsigned long flags; 946 934 935 + spin_lock_irqsave(&ctx->device->resource->req_lock, flags); 936 + list_del(&ctx->list); 937 + spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags); 947 938 put_ldev(ctx->device); 948 939 kfree(ctx); 949 940 } ··· 943 950 /* bv_page may be a copy, or may be the original */ 944 951 static void bm_async_io_complete(struct bio *bio, int error) 945 952 { 946 - struct bm_aio_ctx *ctx = bio->bi_private; 953 + struct drbd_bm_aio_ctx *ctx = bio->bi_private; 947 954 struct drbd_device *device = ctx->device; 948 955 struct drbd_bitmap *b = device->bitmap; 949 956 unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); ··· 986 993 if (atomic_dec_and_test(&ctx->in_flight)) { 987 994 ctx->done = 1; 988 995 wake_up(&device->misc_wait); 989 - kref_put(&ctx->kref, &bm_aio_ctx_destroy); 996 + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); 990 997 } 991 998 } 992 999 993 - static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) 1000 + static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) 994 1001 { 995 1002 struct bio *bio = bio_alloc_drbd(GFP_NOIO); 996 1003 struct drbd_device *device = ctx->device; 
997 1004 struct drbd_bitmap *b = device->bitmap; 998 1005 struct page *page; 999 1006 unsigned int len; 1007 + unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; 1000 1008 1001 1009 sector_t on_disk_sector = 1002 1010 device->ldev->md.md_offset + device->ldev->md.bm_offset; ··· 1043 1049 /* 1044 1050 * bm_rw: read/write the whole bitmap from/to its on disk location. 1045 1051 */ 1046 - static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) 1052 + static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) 1047 1053 { 1048 - struct bm_aio_ctx *ctx; 1054 + struct drbd_bm_aio_ctx *ctx; 1049 1055 struct drbd_bitmap *b = device->bitmap; 1050 1056 int num_pages, i, count = 0; 1051 1057 unsigned long now; ··· 1061 1067 * as we submit copies of pages anyways. 1062 1068 */ 1063 1069 1064 - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); 1070 + ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO); 1065 1071 if (!ctx) 1066 1072 return -ENOMEM; 1067 1073 1068 - *ctx = (struct bm_aio_ctx) { 1074 + *ctx = (struct drbd_bm_aio_ctx) { 1069 1075 .device = device, 1076 + .start_jif = jiffies, 1070 1077 .in_flight = ATOMIC_INIT(1), 1071 1078 .done = 0, 1072 1079 .flags = flags, ··· 1075 1080 .kref = { ATOMIC_INIT(2) }, 1076 1081 }; 1077 1082 1078 - if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ 1083 + if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ 1079 1084 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); 1080 1085 kfree(ctx); 1081 1086 return -ENODEV; 1082 1087 } 1088 + /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from 1089 + drbd_adm_attach(), after device->ldev was assigned. 
*/ 1083 1090 1084 - if (!ctx->flags) 1091 + if (0 == (ctx->flags & ~BM_AIO_READ)) 1085 1092 WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); 1093 + 1094 + spin_lock_irq(&device->resource->req_lock); 1095 + list_add_tail(&ctx->list, &device->pending_bitmap_io); 1096 + spin_unlock_irq(&device->resource->req_lock); 1086 1097 1087 1098 num_pages = b->bm_number_of_pages; 1088 1099 ··· 1099 1098 /* ignore completely unchanged pages */ 1100 1099 if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) 1101 1100 break; 1102 - if (rw & WRITE) { 1101 + if (!(flags & BM_AIO_READ)) { 1103 1102 if ((flags & BM_AIO_WRITE_HINTED) && 1104 1103 !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, 1105 1104 &page_private(b->bm_pages[i]))) 1106 1105 continue; 1107 1106 1108 - if (!(flags & BM_WRITE_ALL_PAGES) && 1107 + if (!(flags & BM_AIO_WRITE_ALL_PAGES) && 1109 1108 bm_test_page_unchanged(b->bm_pages[i])) { 1110 1109 dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); 1111 1110 continue; ··· 1119 1118 } 1120 1119 } 1121 1120 atomic_inc(&ctx->in_flight); 1122 - bm_page_io_async(ctx, i, rw); 1121 + bm_page_io_async(ctx, i); 1123 1122 ++count; 1124 1123 cond_resched(); 1125 1124 } ··· 1135 1134 if (!atomic_dec_and_test(&ctx->in_flight)) 1136 1135 wait_until_done_or_force_detached(device, device->ldev, &ctx->done); 1137 1136 else 1138 - kref_put(&ctx->kref, &bm_aio_ctx_destroy); 1137 + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); 1139 1138 1140 1139 /* summary for global bitmap IO */ 1141 1140 if (flags == 0) 1142 1141 drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", 1143 - rw == WRITE ? "WRITE" : "READ", 1142 + (flags & BM_AIO_READ) ? "READ" : "WRITE", 1144 1143 count, jiffies - now); 1145 1144 1146 1145 if (ctx->error) { ··· 1153 1152 err = -EIO; /* Disk timeout/force-detach during IO... 
*/ 1154 1153 1155 1154 now = jiffies; 1156 - if (rw == WRITE) { 1157 - drbd_md_flush(device); 1158 - } else /* rw == READ */ { 1155 + if (flags & BM_AIO_READ) { 1159 1156 b->bm_set = bm_count_bits(b); 1160 1157 drbd_info(device, "recounting of set bits took additional %lu jiffies\n", 1161 1158 jiffies - now); 1162 1159 } 1163 1160 now = b->bm_set; 1164 1161 1165 - if (flags == 0) 1162 + if ((flags & ~BM_AIO_READ) == 0) 1166 1163 drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", 1167 1164 ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); 1168 1165 1169 - kref_put(&ctx->kref, &bm_aio_ctx_destroy); 1166 + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); 1170 1167 return err; 1171 1168 } 1172 1169 ··· 1174 1175 */ 1175 1176 int drbd_bm_read(struct drbd_device *device) __must_hold(local) 1176 1177 { 1177 - return bm_rw(device, READ, 0, 0); 1178 + return bm_rw(device, BM_AIO_READ, 0); 1178 1179 } 1179 1180 1180 1181 /** ··· 1185 1186 */ 1186 1187 int drbd_bm_write(struct drbd_device *device) __must_hold(local) 1187 1188 { 1188 - return bm_rw(device, WRITE, 0, 0); 1189 + return bm_rw(device, 0, 0); 1189 1190 } 1190 1191 1191 1192 /** ··· 1196 1197 */ 1197 1198 int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) 1198 1199 { 1199 - return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0); 1200 + return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); 1201 + } 1202 + 1203 + /** 1204 + * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. 1205 + * @device: DRBD device. 
1206 + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages 1207 + */ 1208 + int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) 1209 + { 1210 + return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); 1200 1211 } 1201 1212 1202 1213 /** ··· 1222 1213 */ 1223 1214 int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) 1224 1215 { 1225 - return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0); 1216 + return bm_rw(device, BM_AIO_COPY_PAGES, 0); 1226 1217 } 1227 1218 1228 1219 /** ··· 1231 1222 */ 1232 1223 int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) 1233 1224 { 1234 - return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); 1235 - } 1236 - 1237 - /** 1238 - * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap 1239 - * @device: DRBD device. 1240 - * @idx: bitmap page index 1241 - * 1242 - * We don't want to special case on logical_block_size of the backend device, 1243 - * so we submit PAGE_SIZE aligned pieces. 1244 - * Note that on "most" systems, PAGE_SIZE is 4k. 1245 - * 1246 - * In case this becomes an issue on systems with larger PAGE_SIZE, 1247 - * we may want to change this again to write 4k aligned 4k pieces. 
1248 - */ 1249 - int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local) 1250 - { 1251 - struct bm_aio_ctx *ctx; 1252 - int err; 1253 - 1254 - if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) { 1255 - dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx); 1256 - return 0; 1257 - } 1258 - 1259 - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); 1260 - if (!ctx) 1261 - return -ENOMEM; 1262 - 1263 - *ctx = (struct bm_aio_ctx) { 1264 - .device = device, 1265 - .in_flight = ATOMIC_INIT(1), 1266 - .done = 0, 1267 - .flags = BM_AIO_COPY_PAGES, 1268 - .error = 0, 1269 - .kref = { ATOMIC_INIT(2) }, 1270 - }; 1271 - 1272 - if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ 1273 - drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); 1274 - kfree(ctx); 1275 - return -ENODEV; 1276 - } 1277 - 1278 - bm_page_io_async(ctx, idx, WRITE_SYNC); 1279 - wait_until_done_or_force_detached(device, device->ldev, &ctx->done); 1280 - 1281 - if (ctx->error) 1282 - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); 1283 - /* that causes us to detach, so the in memory bitmap will be 1284 - * gone in a moment as well. */ 1285 - 1286 - device->bm_writ_cnt++; 1287 - err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; 1288 - kref_put(&ctx->kref, &bm_aio_ctx_destroy); 1289 - return err; 1225 + return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); 1290 1226 } 1291 1227 1292 1228 /* NOTE
+958
drivers/block/drbd/drbd_debugfs.c
··· 1 + #define pr_fmt(fmt) "drbd debugfs: " fmt 2 + #include <linux/kernel.h> 3 + #include <linux/module.h> 4 + #include <linux/debugfs.h> 5 + #include <linux/seq_file.h> 6 + #include <linux/stat.h> 7 + #include <linux/jiffies.h> 8 + #include <linux/list.h> 9 + 10 + #include "drbd_int.h" 11 + #include "drbd_req.h" 12 + #include "drbd_debugfs.h" 13 + 14 + 15 + /********************************************************************** 16 + * Whenever you change the file format, remember to bump the version. * 17 + **********************************************************************/ 18 + 19 + static struct dentry *drbd_debugfs_root; 20 + static struct dentry *drbd_debugfs_version; 21 + static struct dentry *drbd_debugfs_resources; 22 + static struct dentry *drbd_debugfs_minors; 23 + 24 + static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt) 25 + { 26 + if (valid) 27 + seq_printf(m, "\t%d", jiffies_to_msecs(dt)); 28 + else 29 + seq_printf(m, "\t-"); 30 + } 31 + 32 + static void __seq_print_rq_state_bit(struct seq_file *m, 33 + bool is_set, char *sep, const char *set_name, const char *unset_name) 34 + { 35 + if (is_set && set_name) { 36 + seq_putc(m, *sep); 37 + seq_puts(m, set_name); 38 + *sep = '|'; 39 + } else if (!is_set && unset_name) { 40 + seq_putc(m, *sep); 41 + seq_puts(m, unset_name); 42 + *sep = '|'; 43 + } 44 + } 45 + 46 + static void seq_print_rq_state_bit(struct seq_file *m, 47 + bool is_set, char *sep, const char *set_name) 48 + { 49 + __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL); 50 + } 51 + 52 + /* pretty print enum drbd_req_state_bits req->rq_state */ 53 + static void seq_print_request_state(struct seq_file *m, struct drbd_request *req) 54 + { 55 + unsigned int s = req->rq_state; 56 + char sep = ' '; 57 + seq_printf(m, "\t0x%08x", s); 58 + seq_printf(m, "\tmaster: %s", req->master_bio ? 
"pending" : "completed"); 59 + 60 + /* RQ_WRITE ignored, already reported */ 61 + seq_puts(m, "\tlocal:"); 62 + seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL"); 63 + seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed"); 64 + seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended"); 65 + sep = ' '; 66 + seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending"); 67 + seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed"); 68 + seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted"); 69 + seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok"); 70 + if (sep == ' ') 71 + seq_puts(m, " -"); 72 + 73 + /* for_each_connection ... */ 74 + seq_printf(m, "\tnet:"); 75 + sep = ' '; 76 + seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending"); 77 + seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued"); 78 + seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent"); 79 + seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done"); 80 + seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis"); 81 + seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok"); 82 + if (sep == ' ') 83 + seq_puts(m, " -"); 84 + 85 + seq_printf(m, " :"); 86 + sep = ' '; 87 + seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B"); 88 + seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C"); 89 + seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr"); 90 + if (sep == ' ') 91 + seq_puts(m, " -"); 92 + seq_printf(m, "\n"); 93 + } 94 + 95 + static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now) 96 + { 97 + /* change anything here, fixup header below! */ 98 + unsigned int s = req->rq_state; 99 + 100 + #define RQ_HDR_1 "epoch\tsector\tsize\trw" 101 + seq_printf(m, "0x%x\t%llu\t%u\t%s", 102 + req->epoch, 103 + (unsigned long long)req->i.sector, req->i.size >> 9, 104 + (s & RQ_WRITE) ? 
"W" : "R"); 105 + 106 + #define RQ_HDR_2 "\tstart\tin AL\tsubmit" 107 + seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif)); 108 + seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif); 109 + seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif); 110 + 111 + #define RQ_HDR_3 "\tsent\tacked\tdone" 112 + seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif); 113 + seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif); 114 + seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif); 115 + 116 + #define RQ_HDR_4 "\tstate\n" 117 + seq_print_request_state(m, req); 118 + } 119 + #define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4 120 + 121 + static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now) 122 + { 123 + seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr); 124 + seq_print_one_request(m, req, now); 125 + } 126 + 127 + static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) 128 + { 129 + struct drbd_device *device; 130 + unsigned int i; 131 + 132 + seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n"); 133 + rcu_read_lock(); 134 + idr_for_each_entry(&resource->devices, device, i) { 135 + struct drbd_md_io tmp; 136 + /* In theory this is racy, 137 + * in the sense that there could have been a 138 + * drbd_md_put_buffer(); drbd_md_get_buffer(); 139 + * between accessing these members here. 
*/ 140 + tmp = device->md_io; 141 + if (atomic_read(&tmp.in_use)) { 142 + seq_printf(m, "%u\t%u\t%d\t", 143 + device->minor, device->vnr, 144 + jiffies_to_msecs(now - tmp.start_jif)); 145 + if (time_before(tmp.submit_jif, tmp.start_jif)) 146 + seq_puts(m, "-\t"); 147 + else 148 + seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif)); 149 + seq_printf(m, "%s\n", tmp.current_use); 150 + } 151 + } 152 + rcu_read_unlock(); 153 + } 154 + 155 + static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now) 156 + { 157 + struct drbd_device *device; 158 + unsigned int i; 159 + 160 + seq_puts(m, "minor\tvnr\tage\t#waiting\n"); 161 + rcu_read_lock(); 162 + idr_for_each_entry(&resource->devices, device, i) { 163 + unsigned long jif; 164 + struct drbd_request *req; 165 + int n = atomic_read(&device->ap_actlog_cnt); 166 + if (n) { 167 + spin_lock_irq(&device->resource->req_lock); 168 + req = list_first_entry_or_null(&device->pending_master_completion[1], 169 + struct drbd_request, req_pending_master_completion); 170 + /* if the oldest request does not wait for the activity log 171 + * it is not interesting for us here */ 172 + if (req && !(req->rq_state & RQ_IN_ACT_LOG)) 173 + jif = req->start_jif; 174 + else 175 + req = NULL; 176 + spin_unlock_irq(&device->resource->req_lock); 177 + } 178 + if (n) { 179 + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); 180 + if (req) 181 + seq_printf(m, "%u\t", jiffies_to_msecs(now - jif)); 182 + else 183 + seq_puts(m, "-\t"); 184 + seq_printf(m, "%u\n", n); 185 + } 186 + } 187 + rcu_read_unlock(); 188 + } 189 + 190 + static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now) 191 + { 192 + struct drbd_bm_aio_ctx *ctx; 193 + unsigned long start_jif; 194 + unsigned int in_flight; 195 + unsigned int flags; 196 + spin_lock_irq(&device->resource->req_lock); 197 + ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct 
drbd_bm_aio_ctx, list); 198 + if (ctx && ctx->done) 199 + ctx = NULL; 200 + if (ctx) { 201 + start_jif = ctx->start_jif; 202 + in_flight = atomic_read(&ctx->in_flight); 203 + flags = ctx->flags; 204 + } 205 + spin_unlock_irq(&device->resource->req_lock); 206 + if (ctx) { 207 + seq_printf(m, "%u\t%u\t%c\t%u\t%u\n", 208 + device->minor, device->vnr, 209 + (flags & BM_AIO_READ) ? 'R' : 'W', 210 + jiffies_to_msecs(now - start_jif), 211 + in_flight); 212 + } 213 + } 214 + 215 + static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) 216 + { 217 + struct drbd_device *device; 218 + unsigned int i; 219 + 220 + seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n"); 221 + rcu_read_lock(); 222 + idr_for_each_entry(&resource->devices, device, i) { 223 + seq_print_device_bitmap_io(m, device, now); 224 + } 225 + rcu_read_unlock(); 226 + } 227 + 228 + /* pretty print enum peer_req->flags */ 229 + static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req) 230 + { 231 + unsigned long f = peer_req->flags; 232 + char sep = ' '; 233 + 234 + __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing"); 235 + __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal"); 236 + seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); 237 + seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); 238 + seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); 239 + 240 + if (f & EE_IS_TRIM) { 241 + seq_putc(m, sep); 242 + sep = '|'; 243 + if (f & EE_IS_TRIM_USE_ZEROOUT) 244 + seq_puts(m, "zero-out"); 245 + else 246 + seq_puts(m, "trim"); 247 + } 248 + seq_putc(m, '\n'); 249 + } 250 + 251 + static void seq_print_peer_request(struct seq_file *m, 252 + struct drbd_device *device, struct list_head *lh, 253 + unsigned long now) 254 + { 255 + bool reported_preparing = false; 256 + struct drbd_peer_request *peer_req; 257 + 
list_for_each_entry(peer_req, lh, w.list) { 258 + if (reported_preparing && !(peer_req->flags & EE_SUBMITTED)) 259 + continue; 260 + 261 + if (device) 262 + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); 263 + 264 + seq_printf(m, "%llu\t%u\t%c\t%u\t", 265 + (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9, 266 + (peer_req->flags & EE_WRITE) ? 'W' : 'R', 267 + jiffies_to_msecs(now - peer_req->submit_jif)); 268 + seq_print_peer_request_flags(m, peer_req); 269 + if (peer_req->flags & EE_SUBMITTED) 270 + break; 271 + else 272 + reported_preparing = true; 273 + } 274 + } 275 + 276 + static void seq_print_device_peer_requests(struct seq_file *m, 277 + struct drbd_device *device, unsigned long now) 278 + { 279 + seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n"); 280 + spin_lock_irq(&device->resource->req_lock); 281 + seq_print_peer_request(m, device, &device->active_ee, now); 282 + seq_print_peer_request(m, device, &device->read_ee, now); 283 + seq_print_peer_request(m, device, &device->sync_ee, now); 284 + spin_unlock_irq(&device->resource->req_lock); 285 + if (test_bit(FLUSH_PENDING, &device->flags)) { 286 + seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n", 287 + device->minor, device->vnr, 288 + jiffies_to_msecs(now - device->flush_jif)); 289 + } 290 + } 291 + 292 + static void seq_print_resource_pending_peer_requests(struct seq_file *m, 293 + struct drbd_resource *resource, unsigned long now) 294 + { 295 + struct drbd_device *device; 296 + unsigned int i; 297 + 298 + rcu_read_lock(); 299 + idr_for_each_entry(&resource->devices, device, i) { 300 + seq_print_device_peer_requests(m, device, now); 301 + } 302 + rcu_read_unlock(); 303 + } 304 + 305 + static void seq_print_resource_transfer_log_summary(struct seq_file *m, 306 + struct drbd_resource *resource, 307 + struct drbd_connection *connection, 308 + unsigned long now) 309 + { 310 + struct drbd_request *req; 311 + unsigned int count = 0; 312 + unsigned int show_state = 0; 313 + 314 + 
seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR); 315 + spin_lock_irq(&resource->req_lock); 316 + list_for_each_entry(req, &connection->transfer_log, tl_requests) { 317 + unsigned int tmp = 0; 318 + unsigned int s; 319 + ++count; 320 + 321 + /* don't disable irq "forever" */ 322 + if (!(count & 0x1ff)) { 323 + struct drbd_request *req_next; 324 + kref_get(&req->kref); 325 + spin_unlock_irq(&resource->req_lock); 326 + cond_resched(); 327 + spin_lock_irq(&resource->req_lock); 328 + req_next = list_next_entry(req, tl_requests); 329 + if (kref_put(&req->kref, drbd_req_destroy)) 330 + req = req_next; 331 + if (&req->tl_requests == &connection->transfer_log) 332 + break; 333 + } 334 + 335 + s = req->rq_state; 336 + 337 + /* This is meant to summarize timing issues, to be able to tell 338 + * local disk problems from network problems. 339 + * Skip requests, if we have shown an even older request with 340 + * similar aspects already. */ 341 + if (req->master_bio == NULL) 342 + tmp |= 1; 343 + if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING)) 344 + tmp |= 2; 345 + if (s & RQ_NET_MASK) { 346 + if (!(s & RQ_NET_SENT)) 347 + tmp |= 4; 348 + if (s & RQ_NET_PENDING) 349 + tmp |= 8; 350 + if (!(s & RQ_NET_DONE)) 351 + tmp |= 16; 352 + } 353 + if ((tmp & show_state) == tmp) 354 + continue; 355 + show_state |= tmp; 356 + seq_printf(m, "%u\t", count); 357 + seq_print_minor_vnr_req(m, req, now); 358 + if (show_state == 0x1f) 359 + break; 360 + } 361 + spin_unlock_irq(&resource->req_lock); 362 + } 363 + 364 + /* TODO: transfer_log and friends should be moved to resource */ 365 + static int in_flight_summary_show(struct seq_file *m, void *pos) 366 + { 367 + struct drbd_resource *resource = m->private; 368 + struct drbd_connection *connection; 369 + unsigned long jif = jiffies; 370 + 371 + connection = first_connection(resource); 372 + /* This does not happen, actually. 373 + * But be robust and prepare for future code changes. 
*/ 374 + if (!connection || !kref_get_unless_zero(&connection->kref)) 375 + return -ESTALE; 376 + 377 + /* BUMP me if you change the file format/content/presentation */ 378 + seq_printf(m, "v: %u\n\n", 0); 379 + 380 + seq_puts(m, "oldest bitmap IO\n"); 381 + seq_print_resource_pending_bitmap_io(m, resource, jif); 382 + seq_putc(m, '\n'); 383 + 384 + seq_puts(m, "meta data IO\n"); 385 + seq_print_resource_pending_meta_io(m, resource, jif); 386 + seq_putc(m, '\n'); 387 + 388 + seq_puts(m, "socket buffer stats\n"); 389 + /* for each connection ... once we have more than one */ 390 + rcu_read_lock(); 391 + if (connection->data.socket) { 392 + /* open coded SIOCINQ, the "relevant" part */ 393 + struct tcp_sock *tp = tcp_sk(connection->data.socket->sk); 394 + int answ = tp->rcv_nxt - tp->copied_seq; 395 + seq_printf(m, "unread receive buffer: %u Byte\n", answ); 396 + /* open coded SIOCOUTQ, the "relevant" part */ 397 + answ = tp->write_seq - tp->snd_una; 398 + seq_printf(m, "unacked send buffer: %u Byte\n", answ); 399 + } 400 + rcu_read_unlock(); 401 + seq_putc(m, '\n'); 402 + 403 + seq_puts(m, "oldest peer requests\n"); 404 + seq_print_resource_pending_peer_requests(m, resource, jif); 405 + seq_putc(m, '\n'); 406 + 407 + seq_puts(m, "application requests waiting for activity log\n"); 408 + seq_print_waiting_for_AL(m, resource, jif); 409 + seq_putc(m, '\n'); 410 + 411 + seq_puts(m, "oldest application requests\n"); 412 + seq_print_resource_transfer_log_summary(m, resource, connection, jif); 413 + seq_putc(m, '\n'); 414 + 415 + jif = jiffies - jif; 416 + if (jif) 417 + seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif)); 418 + kref_put(&connection->kref, drbd_destroy_connection); 419 + return 0; 420 + } 421 + 422 + /* simple_positive(file->f_dentry) respectively debugfs_positive(), 423 + * but neither is "reachable" from here. 424 + * So we have our own inline version of it above. 
:-( */ 425 + static inline int debugfs_positive(struct dentry *dentry) 426 + { 427 + return dentry->d_inode && !d_unhashed(dentry); 428 + } 429 + 430 + /* make sure at *open* time that the respective object won't go away. */ 431 + static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), 432 + void *data, struct kref *kref, 433 + void (*release)(struct kref *)) 434 + { 435 + struct dentry *parent; 436 + int ret = -ESTALE; 437 + 438 + /* Are we still linked, 439 + * or has debugfs_remove() already been called? */ 440 + parent = file->f_dentry->d_parent; 441 + /* not sure if this can happen: */ 442 + if (!parent || !parent->d_inode) 443 + goto out; 444 + /* serialize with d_delete() */ 445 + mutex_lock(&parent->d_inode->i_mutex); 446 + /* Make sure the object is still alive */ 447 + if (debugfs_positive(file->f_dentry) 448 + && kref_get_unless_zero(kref)) 449 + ret = 0; 450 + mutex_unlock(&parent->d_inode->i_mutex); 451 + if (!ret) { 452 + ret = single_open(file, show, data); 453 + if (ret) 454 + kref_put(kref, release); 455 + } 456 + out: 457 + return ret; 458 + } 459 + 460 + static int in_flight_summary_open(struct inode *inode, struct file *file) 461 + { 462 + struct drbd_resource *resource = inode->i_private; 463 + return drbd_single_open(file, in_flight_summary_show, resource, 464 + &resource->kref, drbd_destroy_resource); 465 + } 466 + 467 + static int in_flight_summary_release(struct inode *inode, struct file *file) 468 + { 469 + struct drbd_resource *resource = inode->i_private; 470 + kref_put(&resource->kref, drbd_destroy_resource); 471 + return single_release(inode, file); 472 + } 473 + 474 + static const struct file_operations in_flight_summary_fops = { 475 + .owner = THIS_MODULE, 476 + .open = in_flight_summary_open, 477 + .read = seq_read, 478 + .llseek = seq_lseek, 479 + .release = in_flight_summary_release, 480 + }; 481 + 482 + void drbd_debugfs_resource_add(struct drbd_resource *resource) 483 + { 484 + struct dentry 
*dentry; 485 + if (!drbd_debugfs_resources) 486 + return; 487 + 488 + dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); 489 + if (IS_ERR_OR_NULL(dentry)) 490 + goto fail; 491 + resource->debugfs_res = dentry; 492 + 493 + dentry = debugfs_create_dir("volumes", resource->debugfs_res); 494 + if (IS_ERR_OR_NULL(dentry)) 495 + goto fail; 496 + resource->debugfs_res_volumes = dentry; 497 + 498 + dentry = debugfs_create_dir("connections", resource->debugfs_res); 499 + if (IS_ERR_OR_NULL(dentry)) 500 + goto fail; 501 + resource->debugfs_res_connections = dentry; 502 + 503 + dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, 504 + resource->debugfs_res, resource, 505 + &in_flight_summary_fops); 506 + if (IS_ERR_OR_NULL(dentry)) 507 + goto fail; 508 + resource->debugfs_res_in_flight_summary = dentry; 509 + return; 510 + 511 + fail: 512 + drbd_debugfs_resource_cleanup(resource); 513 + drbd_err(resource, "failed to create debugfs dentry\n"); 514 + } 515 + 516 + static void drbd_debugfs_remove(struct dentry **dp) 517 + { 518 + debugfs_remove(*dp); 519 + *dp = NULL; 520 + } 521 + 522 + void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) 523 + { 524 + /* it is ok to call debugfs_remove(NULL) */ 525 + drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary); 526 + drbd_debugfs_remove(&resource->debugfs_res_connections); 527 + drbd_debugfs_remove(&resource->debugfs_res_volumes); 528 + drbd_debugfs_remove(&resource->debugfs_res); 529 + } 530 + 531 + static void seq_print_one_timing_detail(struct seq_file *m, 532 + const struct drbd_thread_timing_details *tdp, 533 + unsigned long now) 534 + { 535 + struct drbd_thread_timing_details td; 536 + /* No locking... 537 + * use temporary assignment to get at consistent data. 
*/ 538 + do { 539 + td = *tdp; 540 + } while (td.cb_nr != tdp->cb_nr); 541 + if (!td.cb_addr) 542 + return; 543 + seq_printf(m, "%u\t%d\t%s:%u\t%ps\n", 544 + td.cb_nr, 545 + jiffies_to_msecs(now - td.start_jif), 546 + td.caller_fn, td.line, 547 + td.cb_addr); 548 + } 549 + 550 + static void seq_print_timing_details(struct seq_file *m, 551 + const char *title, 552 + unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now) 553 + { 554 + unsigned int start_idx; 555 + unsigned int i; 556 + 557 + seq_printf(m, "%s\n", title); 558 + /* If not much is going on, this will result in natural ordering. 559 + * If it is very busy, we will possibly skip events, or even see wrap 560 + * arounds, which could only be avoided with locking. 561 + */ 562 + start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST; 563 + for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++) 564 + seq_print_one_timing_detail(m, tdp+i, now); 565 + for (i = 0; i < start_idx; i++) 566 + seq_print_one_timing_detail(m, tdp+i, now); 567 + } 568 + 569 + static int callback_history_show(struct seq_file *m, void *ignored) 570 + { 571 + struct drbd_connection *connection = m->private; 572 + unsigned long jif = jiffies; 573 + 574 + /* BUMP me if you change the file format/content/presentation */ 575 + seq_printf(m, "v: %u\n\n", 0); 576 + 577 + seq_puts(m, "n\tage\tcallsite\tfn\n"); 578 + seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif); 579 + seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif); 580 + return 0; 581 + } 582 + 583 + static int callback_history_open(struct inode *inode, struct file *file) 584 + { 585 + struct drbd_connection *connection = inode->i_private; 586 + return drbd_single_open(file, callback_history_show, connection, 587 + &connection->kref, drbd_destroy_connection); 588 + } 589 + 590 + static int callback_history_release(struct inode *inode, struct file *file) 591 + { 592 + struct 
drbd_connection *connection = inode->i_private; 593 + kref_put(&connection->kref, drbd_destroy_connection); 594 + return single_release(inode, file); 595 + } 596 + 597 + static const struct file_operations connection_callback_history_fops = { 598 + .owner = THIS_MODULE, 599 + .open = callback_history_open, 600 + .read = seq_read, 601 + .llseek = seq_lseek, 602 + .release = callback_history_release, 603 + }; 604 + 605 + static int connection_oldest_requests_show(struct seq_file *m, void *ignored) 606 + { 607 + struct drbd_connection *connection = m->private; 608 + unsigned long now = jiffies; 609 + struct drbd_request *r1, *r2; 610 + 611 + /* BUMP me if you change the file format/content/presentation */ 612 + seq_printf(m, "v: %u\n\n", 0); 613 + 614 + spin_lock_irq(&connection->resource->req_lock); 615 + r1 = connection->req_next; 616 + if (r1) 617 + seq_print_minor_vnr_req(m, r1, now); 618 + r2 = connection->req_ack_pending; 619 + if (r2 && r2 != r1) { 620 + r1 = r2; 621 + seq_print_minor_vnr_req(m, r1, now); 622 + } 623 + r2 = connection->req_not_net_done; 624 + if (r2 && r2 != r1) 625 + seq_print_minor_vnr_req(m, r2, now); 626 + spin_unlock_irq(&connection->resource->req_lock); 627 + return 0; 628 + } 629 + 630 + static int connection_oldest_requests_open(struct inode *inode, struct file *file) 631 + { 632 + struct drbd_connection *connection = inode->i_private; 633 + return drbd_single_open(file, connection_oldest_requests_show, connection, 634 + &connection->kref, drbd_destroy_connection); 635 + } 636 + 637 + static int connection_oldest_requests_release(struct inode *inode, struct file *file) 638 + { 639 + struct drbd_connection *connection = inode->i_private; 640 + kref_put(&connection->kref, drbd_destroy_connection); 641 + return single_release(inode, file); 642 + } 643 + 644 + static const struct file_operations connection_oldest_requests_fops = { 645 + .owner = THIS_MODULE, 646 + .open = connection_oldest_requests_open, 647 + .read = seq_read, 648 + 
.llseek = seq_lseek, 649 + .release = connection_oldest_requests_release, 650 + }; 651 + 652 + void drbd_debugfs_connection_add(struct drbd_connection *connection) 653 + { 654 + struct dentry *conns_dir = connection->resource->debugfs_res_connections; 655 + struct dentry *dentry; 656 + if (!conns_dir) 657 + return; 658 + 659 + /* Once we enable mutliple peers, 660 + * these connections will have descriptive names. 661 + * For now, it is just the one connection to the (only) "peer". */ 662 + dentry = debugfs_create_dir("peer", conns_dir); 663 + if (IS_ERR_OR_NULL(dentry)) 664 + goto fail; 665 + connection->debugfs_conn = dentry; 666 + 667 + dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, 668 + connection->debugfs_conn, connection, 669 + &connection_callback_history_fops); 670 + if (IS_ERR_OR_NULL(dentry)) 671 + goto fail; 672 + connection->debugfs_conn_callback_history = dentry; 673 + 674 + dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, 675 + connection->debugfs_conn, connection, 676 + &connection_oldest_requests_fops); 677 + if (IS_ERR_OR_NULL(dentry)) 678 + goto fail; 679 + connection->debugfs_conn_oldest_requests = dentry; 680 + return; 681 + 682 + fail: 683 + drbd_debugfs_connection_cleanup(connection); 684 + drbd_err(connection, "failed to create debugfs dentry\n"); 685 + } 686 + 687 + void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) 688 + { 689 + drbd_debugfs_remove(&connection->debugfs_conn_callback_history); 690 + drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests); 691 + drbd_debugfs_remove(&connection->debugfs_conn); 692 + } 693 + 694 + static void resync_dump_detail(struct seq_file *m, struct lc_element *e) 695 + { 696 + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); 697 + 698 + seq_printf(m, "%5d %s %s %s\n", bme->rs_left, 699 + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", 700 + test_bit(BME_LOCKED, &bme->flags) ? 
"LOCKED" : "------", 701 + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" 702 + ); 703 + } 704 + 705 + static int device_resync_extents_show(struct seq_file *m, void *ignored) 706 + { 707 + struct drbd_device *device = m->private; 708 + 709 + /* BUMP me if you change the file format/content/presentation */ 710 + seq_printf(m, "v: %u\n\n", 0); 711 + 712 + if (get_ldev_if_state(device, D_FAILED)) { 713 + lc_seq_printf_stats(m, device->resync); 714 + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); 715 + put_ldev(device); 716 + } 717 + return 0; 718 + } 719 + 720 + static int device_act_log_extents_show(struct seq_file *m, void *ignored) 721 + { 722 + struct drbd_device *device = m->private; 723 + 724 + /* BUMP me if you change the file format/content/presentation */ 725 + seq_printf(m, "v: %u\n\n", 0); 726 + 727 + if (get_ldev_if_state(device, D_FAILED)) { 728 + lc_seq_printf_stats(m, device->act_log); 729 + lc_seq_dump_details(m, device->act_log, "", NULL); 730 + put_ldev(device); 731 + } 732 + return 0; 733 + } 734 + 735 + static int device_oldest_requests_show(struct seq_file *m, void *ignored) 736 + { 737 + struct drbd_device *device = m->private; 738 + struct drbd_resource *resource = device->resource; 739 + unsigned long now = jiffies; 740 + struct drbd_request *r1, *r2; 741 + int i; 742 + 743 + /* BUMP me if you change the file format/content/presentation */ 744 + seq_printf(m, "v: %u\n\n", 0); 745 + 746 + seq_puts(m, RQ_HDR); 747 + spin_lock_irq(&resource->req_lock); 748 + /* WRITE, then READ */ 749 + for (i = 1; i >= 0; --i) { 750 + r1 = list_first_entry_or_null(&device->pending_master_completion[i], 751 + struct drbd_request, req_pending_master_completion); 752 + r2 = list_first_entry_or_null(&device->pending_completion[i], 753 + struct drbd_request, req_pending_local); 754 + if (r1) 755 + seq_print_one_request(m, r1, now); 756 + if (r2 && r2 != r1) 757 + seq_print_one_request(m, r2, now); 758 + } 759 + 
spin_unlock_irq(&resource->req_lock); 760 + return 0; 761 + } 762 + 763 + static int device_data_gen_id_show(struct seq_file *m, void *ignored) 764 + { 765 + struct drbd_device *device = m->private; 766 + struct drbd_md *md; 767 + enum drbd_uuid_index idx; 768 + 769 + if (!get_ldev_if_state(device, D_FAILED)) 770 + return -ENODEV; 771 + 772 + md = &device->ldev->md; 773 + spin_lock_irq(&md->uuid_lock); 774 + for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) { 775 + seq_printf(m, "0x%016llX\n", md->uuid[idx]); 776 + } 777 + spin_unlock_irq(&md->uuid_lock); 778 + put_ldev(device); 779 + return 0; 780 + } 781 + 782 + #define drbd_debugfs_device_attr(name) \ 783 + static int device_ ## name ## _open(struct inode *inode, struct file *file) \ 784 + { \ 785 + struct drbd_device *device = inode->i_private; \ 786 + return drbd_single_open(file, device_ ## name ## _show, device, \ 787 + &device->kref, drbd_destroy_device); \ 788 + } \ 789 + static int device_ ## name ## _release(struct inode *inode, struct file *file) \ 790 + { \ 791 + struct drbd_device *device = inode->i_private; \ 792 + kref_put(&device->kref, drbd_destroy_device); \ 793 + return single_release(inode, file); \ 794 + } \ 795 + static const struct file_operations device_ ## name ## _fops = { \ 796 + .owner = THIS_MODULE, \ 797 + .open = device_ ## name ## _open, \ 798 + .read = seq_read, \ 799 + .llseek = seq_lseek, \ 800 + .release = device_ ## name ## _release, \ 801 + }; 802 + 803 + drbd_debugfs_device_attr(oldest_requests) 804 + drbd_debugfs_device_attr(act_log_extents) 805 + drbd_debugfs_device_attr(resync_extents) 806 + drbd_debugfs_device_attr(data_gen_id) 807 + 808 + void drbd_debugfs_device_add(struct drbd_device *device) 809 + { 810 + struct dentry *vols_dir = device->resource->debugfs_res_volumes; 811 + char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */ 812 + char vnr_buf[8]; /* volume number vnr is even 16 bit only; */ 813 + char *slink_name = NULL; 814 + 815 + struct dentry *dentry; 816 + 
if (!vols_dir || !drbd_debugfs_minors) 817 + return; 818 + 819 + snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); 820 + dentry = debugfs_create_dir(vnr_buf, vols_dir); 821 + if (IS_ERR_OR_NULL(dentry)) 822 + goto fail; 823 + device->debugfs_vol = dentry; 824 + 825 + snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); 826 + slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u", 827 + device->resource->name, device->vnr); 828 + if (!slink_name) 829 + goto fail; 830 + dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); 831 + kfree(slink_name); 832 + slink_name = NULL; 833 + if (IS_ERR_OR_NULL(dentry)) 834 + goto fail; 835 + device->debugfs_minor = dentry; 836 + 837 + #define DCF(name) do { \ 838 + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ 839 + device->debugfs_vol, device, \ 840 + &device_ ## name ## _fops); \ 841 + if (IS_ERR_OR_NULL(dentry)) \ 842 + goto fail; \ 843 + device->debugfs_vol_ ## name = dentry; \ 844 + } while (0) 845 + 846 + DCF(oldest_requests); 847 + DCF(act_log_extents); 848 + DCF(resync_extents); 849 + DCF(data_gen_id); 850 + #undef DCF 851 + return; 852 + 853 + fail: 854 + drbd_debugfs_device_cleanup(device); 855 + drbd_err(device, "failed to create debugfs entries\n"); 856 + } 857 + 858 + void drbd_debugfs_device_cleanup(struct drbd_device *device) 859 + { 860 + drbd_debugfs_remove(&device->debugfs_minor); 861 + drbd_debugfs_remove(&device->debugfs_vol_oldest_requests); 862 + drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); 863 + drbd_debugfs_remove(&device->debugfs_vol_resync_extents); 864 + drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); 865 + drbd_debugfs_remove(&device->debugfs_vol); 866 + } 867 + 868 + void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) 869 + { 870 + struct dentry *conn_dir = peer_device->connection->debugfs_conn; 871 + struct dentry *dentry; 872 + char vnr_buf[8]; 873 + 874 + if (!conn_dir) 875 + return; 876 + 877 + 
snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); 878 + dentry = debugfs_create_dir(vnr_buf, conn_dir); 879 + if (IS_ERR_OR_NULL(dentry)) 880 + goto fail; 881 + peer_device->debugfs_peer_dev = dentry; 882 + return; 883 + 884 + fail: 885 + drbd_debugfs_peer_device_cleanup(peer_device); 886 + drbd_err(peer_device, "failed to create debugfs entries\n"); 887 + } 888 + 889 + void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) 890 + { 891 + drbd_debugfs_remove(&peer_device->debugfs_peer_dev); 892 + } 893 + 894 + static int drbd_version_show(struct seq_file *m, void *ignored) 895 + { 896 + seq_printf(m, "# %s\n", drbd_buildtag()); 897 + seq_printf(m, "VERSION=%s\n", REL_VERSION); 898 + seq_printf(m, "API_VERSION=%u\n", API_VERSION); 899 + seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); 900 + seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); 901 + return 0; 902 + } 903 + 904 + static int drbd_version_open(struct inode *inode, struct file *file) 905 + { 906 + return single_open(file, drbd_version_show, NULL); 907 + } 908 + 909 + static struct file_operations drbd_version_fops = { 910 + .owner = THIS_MODULE, 911 + .open = drbd_version_open, 912 + .llseek = seq_lseek, 913 + .read = seq_read, 914 + .release = single_release, 915 + }; 916 + 917 + /* not __exit, may be indirectly called 918 + * from the module-load-failure path as well. 
*/ 919 + void drbd_debugfs_cleanup(void) 920 + { 921 + drbd_debugfs_remove(&drbd_debugfs_resources); 922 + drbd_debugfs_remove(&drbd_debugfs_minors); 923 + drbd_debugfs_remove(&drbd_debugfs_version); 924 + drbd_debugfs_remove(&drbd_debugfs_root); 925 + } 926 + 927 + int __init drbd_debugfs_init(void) 928 + { 929 + struct dentry *dentry; 930 + 931 + dentry = debugfs_create_dir("drbd", NULL); 932 + if (IS_ERR_OR_NULL(dentry)) 933 + goto fail; 934 + drbd_debugfs_root = dentry; 935 + 936 + dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); 937 + if (IS_ERR_OR_NULL(dentry)) 938 + goto fail; 939 + drbd_debugfs_version = dentry; 940 + 941 + dentry = debugfs_create_dir("resources", drbd_debugfs_root); 942 + if (IS_ERR_OR_NULL(dentry)) 943 + goto fail; 944 + drbd_debugfs_resources = dentry; 945 + 946 + dentry = debugfs_create_dir("minors", drbd_debugfs_root); 947 + if (IS_ERR_OR_NULL(dentry)) 948 + goto fail; 949 + drbd_debugfs_minors = dentry; 950 + return 0; 951 + 952 + fail: 953 + drbd_debugfs_cleanup(); 954 + if (dentry) 955 + return PTR_ERR(dentry); 956 + else 957 + return -EINVAL; 958 + }
+39
drivers/block/drbd/drbd_debugfs.h
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/debugfs.h> 4 + 5 + #include "drbd_int.h" 6 + 7 + #ifdef CONFIG_DEBUG_FS 8 + int __init drbd_debugfs_init(void); 9 + void drbd_debugfs_cleanup(void); 10 + 11 + void drbd_debugfs_resource_add(struct drbd_resource *resource); 12 + void drbd_debugfs_resource_cleanup(struct drbd_resource *resource); 13 + 14 + void drbd_debugfs_connection_add(struct drbd_connection *connection); 15 + void drbd_debugfs_connection_cleanup(struct drbd_connection *connection); 16 + 17 + void drbd_debugfs_device_add(struct drbd_device *device); 18 + void drbd_debugfs_device_cleanup(struct drbd_device *device); 19 + 20 + void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); 21 + void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); 22 + #else 23 + 24 + static inline int __init drbd_debugfs_init(void) { return -ENODEV; } 25 + static inline void drbd_debugfs_cleanup(void) { } 26 + 27 + static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } 28 + static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { } 29 + 30 + static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { } 31 + static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { } 32 + 33 + static inline void drbd_debugfs_device_add(struct drbd_device *device) { } 34 + static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { } 35 + 36 + static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { } 37 + static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { } 38 + 39 + #endif
+258 -137
drivers/block/drbd/drbd_int.h
··· 317 317 318 318 struct list_head tl_requests; /* ring list in the transfer log */ 319 319 struct bio *master_bio; /* master bio pointer */ 320 - unsigned long start_time; 320 + 321 + /* see struct drbd_device */ 322 + struct list_head req_pending_master_completion; 323 + struct list_head req_pending_local; 324 + 325 + /* for generic IO accounting */ 326 + unsigned long start_jif; 327 + 328 + /* for DRBD internal statistics */ 329 + 330 + /* Minimal set of time stamps to determine if we wait for activity log 331 + * transactions, local disk or peer. 32 bit "jiffies" are good enough, 332 + * we don't expect a DRBD request to be stalled for several month. 333 + */ 334 + 335 + /* before actual request processing */ 336 + unsigned long in_actlog_jif; 337 + 338 + /* local disk */ 339 + unsigned long pre_submit_jif; 340 + 341 + /* per connection */ 342 + unsigned long pre_send_jif; 343 + unsigned long acked_jif; 344 + unsigned long net_done_jif; 345 + 346 + /* Possibly even more detail to track each phase: 347 + * master_completion_jif 348 + * how long did it take to complete the master bio 349 + * (application visible latency) 350 + * allocated_jif 351 + * how long the master bio was blocked until we finally allocated 352 + * a tracking struct 353 + * in_actlog_jif 354 + * how long did we wait for activity log transactions 355 + * 356 + * net_queued_jif 357 + * when did we finally queue it for sending 358 + * pre_send_jif 359 + * when did we start sending it 360 + * post_send_jif 361 + * how long did we block in the network stack trying to send it 362 + * acked_jif 363 + * when did we receive (or fake, in protocol A) a remote ACK 364 + * net_done_jif 365 + * when did we receive final acknowledgement (P_BARRIER_ACK), 366 + * or decide, e.g. on connection loss, that we do no longer expect 367 + * anything from this peer for this request. 
368 + * 369 + * pre_submit_jif 370 + * post_sub_jif 371 + * when did we start submiting to the lower level device, 372 + * and how long did we block in that submit function 373 + * local_completion_jif 374 + * how long did it take the lower level device to complete this request 375 + */ 376 + 321 377 322 378 /* once it hits 0, we may complete the master_bio */ 323 379 atomic_t completion_ref; ··· 422 366 struct drbd_interval i; 423 367 /* see comments on ee flag bits below */ 424 368 unsigned long flags; 369 + unsigned long submit_jif; 425 370 union { 426 371 u64 block_id; 427 372 struct digest_info *digest; ··· 465 408 466 409 /* Is set when net_conf had two_primaries set while creating this peer_req */ 467 410 __EE_IN_INTERVAL_TREE, 411 + 412 + /* for debugfs: */ 413 + /* has this been submitted, or does it still wait for something else? */ 414 + __EE_SUBMITTED, 415 + 416 + /* this is/was a write request */ 417 + __EE_WRITE, 418 + 419 + /* this originates from application on peer 420 + * (not some resync or verify or other DRBD internal request) */ 421 + __EE_APPLICATION, 468 422 }; 469 423 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 470 424 #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) ··· 487 419 #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) 488 420 #define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) 489 421 #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) 422 + #define EE_SUBMITTED (1<<__EE_SUBMITTED) 423 + #define EE_WRITE (1<<__EE_WRITE) 424 + #define EE_APPLICATION (1<<__EE_APPLICATION) 490 425 491 426 /* flag bits per device */ 492 427 enum { ··· 504 433 CONSIDER_RESYNC, 505 434 506 435 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 436 + 507 437 SUSPEND_IO, /* suspend application io */ 508 438 BITMAP_IO, /* suspend application io; 509 439 once no more io in flight, start bitmap io */ 510 440 BITMAP_IO_QUEUED, /* Started bitmap IO */ 511 - GO_DISKLESS, /* Disk is being detached, on io-error or 
admin request. */ 512 441 WAS_IO_ERROR, /* Local disk failed, returned IO error */ 513 442 WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ 514 443 FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ ··· 521 450 B_RS_H_DONE, /* Before resync handler done (already executed) */ 522 451 DISCARD_MY_DATA, /* discard_my_data flag per volume */ 523 452 READ_BALANCE_RR, 453 + 454 + FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush 455 + * from drbd_flush_after_epoch() */ 456 + 457 + /* cleared only after backing device related structures have been destroyed. */ 458 + GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */ 459 + 460 + /* to be used in drbd_device_post_work() */ 461 + GO_DISKLESS, /* tell worker to schedule cleanup before detach */ 462 + DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */ 463 + MD_SYNC, /* tell worker to call drbd_md_sync() */ 464 + RS_START, /* tell worker to start resync/OV */ 465 + RS_PROGRESS, /* tell worker that resync made significant progress */ 466 + RS_DONE, /* tell worker that resync is done */ 524 467 }; 525 468 526 469 struct drbd_bitmap; /* opaque for drbd_device */ ··· 616 531 }; 617 532 618 533 struct drbd_md_io { 534 + struct page *page; 535 + unsigned long start_jif; /* last call to drbd_md_get_buffer */ 536 + unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */ 537 + const char *current_use; 538 + atomic_t in_use; 619 539 unsigned int done; 620 540 int error; 621 541 }; ··· 667 577 * and potentially deadlock on, this drbd worker. 
668 578 */ 669 579 DISCONNECT_SENT, 580 + 581 + DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ 670 582 }; 671 583 672 584 struct drbd_resource { 673 585 char *name; 586 + #ifdef CONFIG_DEBUG_FS 587 + struct dentry *debugfs_res; 588 + struct dentry *debugfs_res_volumes; 589 + struct dentry *debugfs_res_connections; 590 + struct dentry *debugfs_res_in_flight_summary; 591 + #endif 674 592 struct kref kref; 675 593 struct idr devices; /* volume number to device mapping */ 676 594 struct list_head connections; ··· 692 594 unsigned susp_nod:1; /* IO suspended because no data */ 693 595 unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ 694 596 597 + enum write_ordering_e write_ordering; 598 + 695 599 cpumask_var_t cpu_mask; 600 + }; 601 + 602 + struct drbd_thread_timing_details 603 + { 604 + unsigned long start_jif; 605 + void *cb_addr; 606 + const char *caller_fn; 607 + unsigned int line; 608 + unsigned int cb_nr; 696 609 }; 697 610 698 611 struct drbd_connection { 699 612 struct list_head connections; 700 613 struct drbd_resource *resource; 614 + #ifdef CONFIG_DEBUG_FS 615 + struct dentry *debugfs_conn; 616 + struct dentry *debugfs_conn_callback_history; 617 + struct dentry *debugfs_conn_oldest_requests; 618 + #endif 701 619 struct kref kref; 702 620 struct idr peer_devices; /* volume number to peer device mapping */ 703 621 enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ ··· 750 636 struct drbd_epoch *current_epoch; 751 637 spinlock_t epoch_lock; 752 638 unsigned int epochs; 753 - enum write_ordering_e write_ordering; 754 639 atomic_t current_tle_nr; /* transfer log epoch number */ 755 640 unsigned current_tle_writes; /* writes seen within this tl epoch */ 756 641 ··· 758 645 struct drbd_thread worker; 759 646 struct drbd_thread asender; 760 647 648 + /* cached pointers, 649 + * so we can look up the oldest pending requests more quickly. 
650 + * protected by resource->req_lock */ 651 + struct drbd_request *req_next; /* DRBD 9: todo.req_next */ 652 + struct drbd_request *req_ack_pending; 653 + struct drbd_request *req_not_net_done; 654 + 761 655 /* sender side */ 762 656 struct drbd_work_queue sender_work; 657 + 658 + #define DRBD_THREAD_DETAILS_HIST 16 659 + unsigned int w_cb_nr; /* keeps counting up */ 660 + unsigned int r_cb_nr; /* keeps counting up */ 661 + struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST]; 662 + struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; 763 663 764 664 struct { 765 665 /* whether this sender thread ··· 789 663 } send; 790 664 }; 791 665 666 + void __update_timing_details( 667 + struct drbd_thread_timing_details *tdp, 668 + unsigned int *cb_nr, 669 + void *cb, 670 + const char *fn, const unsigned int line); 671 + 672 + #define update_worker_timing_details(c, cb) \ 673 + __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ ) 674 + #define update_receiver_timing_details(c, cb) \ 675 + __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ ) 676 + 792 677 struct submit_worker { 793 678 struct workqueue_struct *wq; 794 679 struct work_struct worker; 795 680 796 - spinlock_t lock; 681 + /* protected by ..->resource->req_lock */ 797 682 struct list_head writes; 798 683 }; 799 684 ··· 812 675 struct list_head peer_devices; 813 676 struct drbd_device *device; 814 677 struct drbd_connection *connection; 678 + #ifdef CONFIG_DEBUG_FS 679 + struct dentry *debugfs_peer_dev; 680 + #endif 815 681 }; 816 682 817 683 struct drbd_device { 818 684 struct drbd_resource *resource; 819 685 struct list_head peer_devices; 820 - int vnr; /* volume number within the connection */ 686 + struct list_head pending_bitmap_io; 687 + 688 + unsigned long flush_jif; 689 + #ifdef CONFIG_DEBUG_FS 690 + struct dentry *debugfs_minor; 691 + struct dentry *debugfs_vol; 692 + struct dentry 
*debugfs_vol_oldest_requests; 693 + struct dentry *debugfs_vol_act_log_extents; 694 + struct dentry *debugfs_vol_resync_extents; 695 + struct dentry *debugfs_vol_data_gen_id; 696 + #endif 697 + 698 + unsigned int vnr; /* volume number within the connection */ 699 + unsigned int minor; /* device minor number */ 700 + 821 701 struct kref kref; 822 702 823 703 /* things that are stored as / read from meta data on disk */ ··· 851 697 unsigned long last_reattach_jif; 852 698 struct drbd_work resync_work; 853 699 struct drbd_work unplug_work; 854 - struct drbd_work go_diskless; 855 - struct drbd_work md_sync_work; 856 - struct drbd_work start_resync_work; 857 700 struct timer_list resync_timer; 858 701 struct timer_list md_sync_timer; 859 702 struct timer_list start_resync_timer; 860 703 struct timer_list request_timer; 861 - #ifdef DRBD_DEBUG_MD_SYNC 862 - struct { 863 - unsigned int line; 864 - const char* func; 865 - } last_md_mark_dirty; 866 - #endif 867 704 868 705 /* Used after attach while negotiating new disk state. 
*/ 869 706 union drbd_state new_state_tmp; ··· 869 724 unsigned int al_writ_cnt; 870 725 unsigned int bm_writ_cnt; 871 726 atomic_t ap_bio_cnt; /* Requests we need to complete */ 727 + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ 872 728 atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ 873 729 atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ 874 730 atomic_t unacked_cnt; /* Need to send replies for */ ··· 879 733 struct rb_root read_requests; 880 734 struct rb_root write_requests; 881 735 736 + /* for statistics and timeouts */ 737 + /* [0] read, [1] write */ 738 + struct list_head pending_master_completion[2]; 739 + struct list_head pending_completion[2]; 740 + 741 + /* use checksums for *this* resync */ 742 + bool use_csums; 882 743 /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ 883 744 unsigned long rs_total; 884 745 /* number of resync blocks that failed in this run */ ··· 941 788 atomic_t pp_in_use; /* allocated from page pool */ 942 789 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ 943 790 wait_queue_head_t ee_wait; 944 - struct page *md_io_page; /* one page buffer for md_io */ 945 791 struct drbd_md_io md_io; 946 - atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ 947 792 spinlock_t al_lock; 948 793 wait_queue_head_t al_wait; 949 794 struct lru_cache *act_log; /* activity log */ ··· 951 800 atomic_t packet_seq; 952 801 unsigned int peer_seq; 953 802 spinlock_t peer_seq_lock; 954 - unsigned int minor; 955 803 unsigned long comm_bm_set; /* communicated number of set bits. 
*/ 956 804 struct bm_io_work bm_io_work; 957 805 u64 ed_uuid; /* UUID of the exposed data */ ··· 972 822 /* any requests that would block in drbd_make_request() 973 823 * are deferred to this single-threaded work queue */ 974 824 struct submit_worker submit; 825 + }; 826 + 827 + struct drbd_bm_aio_ctx { 828 + struct drbd_device *device; 829 + struct list_head list; /* on device->pending_bitmap_io */; 830 + unsigned long start_jif; 831 + atomic_t in_flight; 832 + unsigned int done; 833 + unsigned flags; 834 + #define BM_AIO_COPY_PAGES 1 835 + #define BM_AIO_WRITE_HINTED 2 836 + #define BM_AIO_WRITE_ALL_PAGES 4 837 + #define BM_AIO_READ 8 838 + int error; 839 + struct kref kref; 975 840 }; 976 841 977 842 struct drbd_config_context { ··· 1114 949 extern int drbd_send_bitmap(struct drbd_device *device); 1115 950 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); 1116 951 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); 1117 - extern void drbd_free_bc(struct drbd_backing_dev *ldev); 952 + extern void drbd_free_ldev(struct drbd_backing_dev *ldev); 1118 953 extern void drbd_device_cleanup(struct drbd_device *device); 1119 954 void drbd_print_uuids(struct drbd_device *device, const char *text); 1120 955 ··· 1131 966 extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local); 1132 967 extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local); 1133 968 extern int drbd_md_test_flag(struct drbd_backing_dev *, int); 1134 - #ifndef DRBD_DEBUG_MD_SYNC 1135 969 extern void drbd_md_mark_dirty(struct drbd_device *device); 1136 - #else 1137 - #define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) 1138 - extern void drbd_md_mark_dirty_(struct drbd_device *device, 1139 - unsigned int line, const char *func); 1140 - #endif 1141 970 extern void drbd_queue_bitmap_io(struct drbd_device *device, 1142 971 int (*io_fn)(struct 
drbd_device *), 1143 972 void (*done)(struct drbd_device *, int), ··· 1142 983 extern int drbd_bitmap_io_from_worker(struct drbd_device *device, 1143 984 int (*io_fn)(struct drbd_device *), 1144 985 char *why, enum bm_flag flags); 1145 - extern int drbd_bmio_set_n_write(struct drbd_device *device); 1146 - extern int drbd_bmio_clear_n_write(struct drbd_device *device); 1147 - extern void drbd_ldev_destroy(struct drbd_device *device); 986 + extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local); 987 + extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local); 1148 988 1149 989 /* Meta data layout 1150 990 * ··· 1263 1105 /* in which _bitmap_ extent (resp. sector) the bit for a certain 1264 1106 * _storage_ sector is located in */ 1265 1107 #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) 1108 + #define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) 1266 1109 1267 - /* how much _storage_ sectors we have per bitmap sector */ 1110 + /* first storage sector a bitmap extent corresponds to */ 1268 1111 #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) 1112 + /* how much _storage_ sectors we have per bitmap extent */ 1269 1113 #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) 1114 + /* how many bits are covered by one bitmap extent (resync extent) */ 1115 + #define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) 1116 + 1117 + #define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) 1118 + 1270 1119 1271 1120 /* in one sector of the bitmap, we have this many activity_log extents. 
*/ 1272 1121 #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1273 - 1274 - #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) 1275 - #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) 1276 1122 1277 1123 /* the extent in "PER_EXTENT" below is an activity log extent 1278 1124 * we need that many (long words/bytes) to store the bitmap ··· 1357 1195 const unsigned long s, const unsigned long e); 1358 1196 extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); 1359 1197 extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); 1360 - extern int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local); 1361 1198 extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); 1362 1199 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); 1363 1200 extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); 1364 1201 extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); 1202 + extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); 1365 1203 extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); 1366 1204 extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); 1367 1205 extern size_t drbd_bm_words(struct drbd_device *device); ··· 1375 1213 extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); 1376 1214 extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); 1377 1215 extern unsigned long drbd_bm_total_weight(struct drbd_device *device); 1378 - extern int drbd_bm_rs_done(struct drbd_device *device); 1379 1216 /* for receive_bitmap */ 1380 1217 extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, 1381 1218 size_t number, unsigned long *buffer); ··· 1473 1312 extern enum determine_dev_size 1474 1313 
drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); 1475 1314 extern void resync_after_online_grow(struct drbd_device *); 1476 - extern void drbd_reconsider_max_bio_size(struct drbd_device *device); 1315 + extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); 1477 1316 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, 1478 1317 enum drbd_role new_role, 1479 1318 int force); ··· 1494 1333 extern void suspend_other_sg(struct drbd_device *device); 1495 1334 extern int drbd_resync_finished(struct drbd_device *device); 1496 1335 /* maybe rather drbd_main.c ? */ 1497 - extern void *drbd_md_get_buffer(struct drbd_device *device); 1336 + extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); 1498 1337 extern void drbd_md_put_buffer(struct drbd_device *device); 1499 1338 extern int drbd_md_sync_page_io(struct drbd_device *device, 1500 1339 struct drbd_backing_dev *bdev, sector_t sector, int rw); ··· 1541 1380 extern int drbd_receiver(struct drbd_thread *thi); 1542 1381 extern int drbd_asender(struct drbd_thread *thi); 1543 1382 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1544 - extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); 1383 + extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 1384 + bool throttle_if_app_is_waiting); 1545 1385 extern int drbd_submit_peer_request(struct drbd_device *, 1546 1386 struct drbd_peer_request *, const unsigned, 1547 1387 const int); ··· 1626 1464 { 1627 1465 __release(local); 1628 1466 if (!bio->bi_bdev) { 1629 - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " 1630 - "bio->bi_bdev == NULL\n", 1631 - device_to_minor(device)); 1632 - dump_stack(); 1467 + drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); 1633 1468 bio_endio(bio, -ENODEV); 1634 1469 return; 1635 1470 } ··· 1637 1478 
generic_make_request(bio); 1638 1479 } 1639 1480 1640 - void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); 1481 + void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, 1482 + enum write_ordering_e wo); 1641 1483 1642 1484 /* drbd_proc.c */ 1643 1485 extern struct proc_dir_entry *drbd_proc; ··· 1649 1489 /* drbd_actlog.c */ 1650 1490 extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); 1651 1491 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); 1652 - extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); 1492 + extern void drbd_al_begin_io_commit(struct drbd_device *device); 1653 1493 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); 1654 - extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate); 1494 + extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i); 1655 1495 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); 1656 1496 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); 1657 1497 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); ··· 1661 1501 extern void drbd_rs_failed_io(struct drbd_device *device, 1662 1502 sector_t sector, int size); 1663 1503 extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); 1664 - extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, 1665 - int size, const char *file, const unsigned int line); 1504 + 1505 + enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; 1506 + extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 1507 + enum update_sync_bits_mode mode, 1508 + const char *file, const unsigned int line); 1666 1509 #define 
drbd_set_in_sync(device, sector, size) \ 1667 - __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__) 1668 - extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, 1669 - int size, const char *file, const unsigned int line); 1510 + __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) 1670 1511 #define drbd_set_out_of_sync(device, sector, size) \ 1671 - __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__) 1512 + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) 1513 + #define drbd_rs_failed_io(device, sector, size) \ 1514 + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) 1672 1515 extern void drbd_al_shrink(struct drbd_device *device); 1673 1516 extern int drbd_initialize_al(struct drbd_device *, void *); 1674 1517 ··· 1927 1764 } 1928 1765 1929 1766 static inline void 1930 - drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) 1931 - { 1932 - unsigned long flags; 1933 - spin_lock_irqsave(&q->q_lock, flags); 1934 - list_add(&w->list, &q->q); 1935 - spin_unlock_irqrestore(&q->q_lock, flags); 1936 - wake_up(&q->q_wait); 1937 - } 1938 - 1939 - static inline void 1940 1767 drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) 1941 1768 { 1942 1769 unsigned long flags; ··· 1934 1781 list_add_tail(&w->list, &q->q); 1935 1782 spin_unlock_irqrestore(&q->q_lock, flags); 1936 1783 wake_up(&q->q_wait); 1784 + } 1785 + 1786 + static inline void 1787 + drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w) 1788 + { 1789 + unsigned long flags; 1790 + spin_lock_irqsave(&q->q_lock, flags); 1791 + if (list_empty_careful(&w->list)) 1792 + list_add_tail(&w->list, &q->q); 1793 + spin_unlock_irqrestore(&q->q_lock, flags); 1794 + wake_up(&q->q_wait); 1795 + } 1796 + 1797 + static inline void 1798 + drbd_device_post_work(struct drbd_device *device, int work_bit) 1799 + { 1800 + if (!test_and_set_bit(work_bit, 
&device->flags)) { 1801 + struct drbd_connection *connection = 1802 + first_peer_device(device)->connection; 1803 + struct drbd_work_queue *q = &connection->sender_work; 1804 + if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags)) 1805 + wake_up(&q->q_wait); 1806 + } 1937 1807 } 1938 1808 1939 1809 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); ··· 2035 1859 func, line, \ 2036 1860 atomic_read(&device->which)) 2037 1861 2038 - #define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__) 1862 + #define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) 2039 1863 static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) 2040 1864 { 2041 1865 if (atomic_dec_and_test(&device->ap_pending_cnt)) ··· 2054 1878 atomic_inc(&device->rs_pending_cnt); 2055 1879 } 2056 1880 2057 - #define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__) 1881 + #define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) 2058 1882 static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) 2059 1883 { 2060 1884 atomic_dec(&device->rs_pending_cnt); ··· 2075 1899 atomic_inc(&device->unacked_cnt); 2076 1900 } 2077 1901 2078 - #define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__) 1902 + #define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) 2079 1903 static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) 2080 1904 { 2081 1905 atomic_dec(&device->unacked_cnt); 2082 1906 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 2083 1907 } 2084 1908 2085 - #define sub_unacked(device, n) _sub_unacked(device, n, __FUNCTION__, __LINE__) 1909 + #define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) 2086 1910 static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) 2087 1911 { 2088 1912 atomic_sub(n, &device->unacked_cnt); 2089 
1913 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 1914 + } 1915 + 1916 + static inline bool is_sync_state(enum drbd_conns connection_state) 1917 + { 1918 + return 1919 + (connection_state == C_SYNC_SOURCE 1920 + || connection_state == C_SYNC_TARGET 1921 + || connection_state == C_PAUSED_SYNC_S 1922 + || connection_state == C_PAUSED_SYNC_T); 2090 1923 } 2091 1924 2092 1925 /** ··· 2109 1924 2110 1925 static inline void put_ldev(struct drbd_device *device) 2111 1926 { 1927 + enum drbd_disk_state ds = device->state.disk; 1928 + /* We must check the state *before* the atomic_dec becomes visible, 1929 + * or we have a theoretical race where someone hitting zero, 1930 + * while state still D_FAILED, will then see D_DISKLESS in the 1931 + * condition below and calling into destroy, where he must not, yet. */ 2112 1932 int i = atomic_dec_return(&device->local_cnt); 2113 1933 2114 1934 /* This may be called from some endio handler, ··· 2122 1932 __release(local); 2123 1933 D_ASSERT(device, i >= 0); 2124 1934 if (i == 0) { 2125 - if (device->state.disk == D_DISKLESS) 1935 + if (ds == D_DISKLESS) 2126 1936 /* even internal references gone, safe to destroy */ 2127 - drbd_ldev_destroy(device); 2128 - if (device->state.disk == D_FAILED) { 1937 + drbd_device_post_work(device, DESTROY_DISK); 1938 + if (ds == D_FAILED) 2129 1939 /* all application IO references gone. 
*/ 2130 - if (!test_and_set_bit(GO_DISKLESS, &device->flags)) 2131 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 2132 - &device->go_diskless); 2133 - } 1940 + if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) 1941 + drbd_device_post_work(device, GO_DISKLESS); 2134 1942 wake_up(&device->misc_wait); 2135 1943 } 2136 1944 } ··· 2151 1963 #else 2152 1964 extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); 2153 1965 #endif 2154 - 2155 - /* you must have an "get_ldev" reference */ 2156 - static inline void drbd_get_syncer_progress(struct drbd_device *device, 2157 - unsigned long *bits_left, unsigned int *per_mil_done) 2158 - { 2159 - /* this is to break it at compile time when we change that, in case we 2160 - * want to support more than (1<<32) bits on a 32bit arch. */ 2161 - typecheck(unsigned long, device->rs_total); 2162 - 2163 - /* note: both rs_total and rs_left are in bits, i.e. in 2164 - * units of BM_BLOCK_SIZE. 2165 - * for the percentage, we don't care. */ 2166 - 2167 - if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) 2168 - *bits_left = device->ov_left; 2169 - else 2170 - *bits_left = drbd_bm_total_weight(device) - device->rs_failed; 2171 - /* >> 10 to prevent overflow, 2172 - * +1 to prevent division by zero */ 2173 - if (*bits_left > device->rs_total) { 2174 - /* doh. maybe a logic bug somewhere. 2175 - * may also be just a race condition 2176 - * between this and a disconnect during sync. 2177 - * for now, just prevent in-kernel buffer overflow. 2178 - */ 2179 - smp_rmb(); 2180 - drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", 2181 - drbd_conn_str(device->state.conn), 2182 - *bits_left, device->rs_total, device->rs_failed); 2183 - *per_mil_done = 0; 2184 - } else { 2185 - /* Make sure the division happens in long context. 2186 - * We allow up to one petabyte storage right now, 2187 - * at a granularity of 4k per bit that is 2**38 bits. 
2188 - * After shift right and multiplication by 1000, 2189 - * this should still fit easily into a 32bit long, 2190 - * so we don't need a 64bit division on 32bit arch. 2191 - * Note: currently we don't support such large bitmaps on 32bit 2192 - * arch anyways, but no harm done to be prepared for it here. 2193 - */ 2194 - unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10; 2195 - unsigned long left = *bits_left >> shift; 2196 - unsigned long total = 1UL + (device->rs_total >> shift); 2197 - unsigned long tmp = 1000UL - left * 1000UL/total; 2198 - *per_mil_done = tmp; 2199 - } 2200 - } 2201 - 2202 1966 2203 1967 /* this throttles on-the-fly application requests 2204 1968 * according to max_buffers settings; ··· 2339 2199 #define QUEUE_ORDERED_NONE 0 2340 2200 #endif 2341 2201 return QUEUE_ORDERED_NONE; 2342 - } 2343 - 2344 - static inline void drbd_md_flush(struct drbd_device *device) 2345 - { 2346 - int r; 2347 - 2348 - if (device->ldev == NULL) { 2349 - drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n"); 2350 - return; 2351 - } 2352 - 2353 - if (test_bit(MD_NO_FUA, &device->flags)) 2354 - return; 2355 - 2356 - r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL); 2357 - if (r) { 2358 - set_bit(MD_NO_FUA, &device->flags); 2359 - drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r); 2360 - } 2361 2202 } 2362 2203 2363 2204 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+3 -1
drivers/block/drbd/drbd_interval.h
··· 10 10 unsigned int size; /* size in bytes */ 11 11 sector_t end; /* highest interval end in subtree */ 12 12 int local:1 /* local or remote request? */; 13 - int waiting:1; 13 + int waiting:1; /* someone is waiting for this to complete */ 14 + int completed:1; /* this has been completed already; 15 + * ignore for conflict detection */ 14 16 }; 15 17 16 18 static inline void drbd_clear_interval(struct drbd_interval *i)
+126 -178
drivers/block/drbd/drbd_main.c
··· 26 26 27 27 */ 28 28 29 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 30 + 29 31 #include <linux/module.h> 32 + #include <linux/jiffies.h> 30 33 #include <linux/drbd.h> 31 34 #include <asm/uaccess.h> 32 35 #include <asm/types.h> ··· 57 54 #include "drbd_int.h" 58 55 #include "drbd_protocol.h" 59 56 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ 60 - 61 57 #include "drbd_vli.h" 58 + #include "drbd_debugfs.h" 62 59 63 60 static DEFINE_MUTEX(drbd_main_mutex); 64 61 static int drbd_open(struct block_device *bdev, fmode_t mode); 65 62 static void drbd_release(struct gendisk *gd, fmode_t mode); 66 - static int w_md_sync(struct drbd_work *w, int unused); 67 63 static void md_sync_timer_fn(unsigned long data); 68 64 static int w_bitmap_io(struct drbd_work *w, int unused); 69 - static int w_go_diskless(struct drbd_work *w, int unused); 70 65 71 66 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " 72 67 "Lars Ellenberg <lars@linbit.com>"); ··· 265 264 266 265 /** 267 266 * _tl_restart() - Walks the transfer log, and applies an action to all requests 268 - * @device: DRBD device. 267 + * @connection: DRBD connection to operate on. 269 268 * @what: The action/event to perform with all request objects 270 269 * 271 270 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, ··· 663 662 msg_flags); 664 663 if (data && !err) 665 664 err = drbd_send_all(connection, sock->socket, data, size, 0); 665 + /* DRBD protocol "pings" are latency critical. 
666 + * This is supposed to trigger tcp_push_pending_frames() */ 667 + if (!err && (cmd == P_PING || cmd == P_PING_ACK)) 668 + drbd_tcp_nodelay(sock->socket); 669 + 666 670 return err; 667 671 } 668 672 ··· 1642 1636 if (peer_device->connection->agreed_pro_version >= 100) { 1643 1637 if (req->rq_state & RQ_EXP_RECEIVE_ACK) 1644 1638 dp_flags |= DP_SEND_RECEIVE_ACK; 1645 - if (req->rq_state & RQ_EXP_WRITE_ACK) 1639 + /* During resync, request an explicit write ack, 1640 + * even in protocol != C */ 1641 + if (req->rq_state & RQ_EXP_WRITE_ACK 1642 + || (dp_flags & DP_MAY_SET_IN_SYNC)) 1646 1643 dp_flags |= DP_SEND_WRITE_ACK; 1647 1644 } 1648 1645 p->dp_flags = cpu_to_be32(dp_flags); ··· 1909 1900 drbd_set_defaults(device); 1910 1901 1911 1902 atomic_set(&device->ap_bio_cnt, 0); 1903 + atomic_set(&device->ap_actlog_cnt, 0); 1912 1904 atomic_set(&device->ap_pending_cnt, 0); 1913 1905 atomic_set(&device->rs_pending_cnt, 0); 1914 1906 atomic_set(&device->unacked_cnt, 0); ··· 1918 1908 atomic_set(&device->rs_sect_in, 0); 1919 1909 atomic_set(&device->rs_sect_ev, 0); 1920 1910 atomic_set(&device->ap_in_flight, 0); 1921 - atomic_set(&device->md_io_in_use, 0); 1911 + atomic_set(&device->md_io.in_use, 0); 1922 1912 1923 1913 mutex_init(&device->own_state_mutex); 1924 1914 device->state_mutex = &device->own_state_mutex; ··· 1934 1924 INIT_LIST_HEAD(&device->resync_reads); 1935 1925 INIT_LIST_HEAD(&device->resync_work.list); 1936 1926 INIT_LIST_HEAD(&device->unplug_work.list); 1937 - INIT_LIST_HEAD(&device->go_diskless.list); 1938 - INIT_LIST_HEAD(&device->md_sync_work.list); 1939 - INIT_LIST_HEAD(&device->start_resync_work.list); 1940 1927 INIT_LIST_HEAD(&device->bm_io_work.w.list); 1928 + INIT_LIST_HEAD(&device->pending_master_completion[0]); 1929 + INIT_LIST_HEAD(&device->pending_master_completion[1]); 1930 + INIT_LIST_HEAD(&device->pending_completion[0]); 1931 + INIT_LIST_HEAD(&device->pending_completion[1]); 1941 1932 1942 1933 device->resync_work.cb = w_resync_timer; 1943 
1934 device->unplug_work.cb = w_send_write_hint; 1944 - device->go_diskless.cb = w_go_diskless; 1945 - device->md_sync_work.cb = w_md_sync; 1946 1935 device->bm_io_work.w.cb = w_bitmap_io; 1947 - device->start_resync_work.cb = w_start_resync; 1948 1936 1949 1937 init_timer(&device->resync_timer); 1950 1938 init_timer(&device->md_sync_timer); ··· 2000 1992 drbd_bm_cleanup(device); 2001 1993 } 2002 1994 2003 - drbd_free_bc(device->ldev); 1995 + drbd_free_ldev(device->ldev); 2004 1996 device->ldev = NULL; 2005 1997 2006 1998 clear_bit(AL_SUSPENDED, &device->flags); ··· 2014 2006 D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); 2015 2007 D_ASSERT(device, list_empty(&device->resync_work.list)); 2016 2008 D_ASSERT(device, list_empty(&device->unplug_work.list)); 2017 - D_ASSERT(device, list_empty(&device->go_diskless.list)); 2018 2009 2019 2010 drbd_set_defaults(device); 2020 2011 } ··· 2136 2129 return -ENOMEM; 2137 2130 } 2138 2131 2139 - static int drbd_notify_sys(struct notifier_block *this, unsigned long code, 2140 - void *unused) 2141 - { 2142 - /* just so we have it. you never know what interesting things we 2143 - * might want to do here some day... 
2144 - */ 2145 - 2146 - return NOTIFY_DONE; 2147 - } 2148 - 2149 - static struct notifier_block drbd_notifier = { 2150 - .notifier_call = drbd_notify_sys, 2151 - }; 2152 - 2153 2132 static void drbd_release_all_peer_reqs(struct drbd_device *device) 2154 2133 { 2155 2134 int rr; ··· 2166 2173 { 2167 2174 struct drbd_device *device = container_of(kref, struct drbd_device, kref); 2168 2175 struct drbd_resource *resource = device->resource; 2169 - struct drbd_connection *connection; 2176 + struct drbd_peer_device *peer_device, *tmp_peer_device; 2170 2177 2171 2178 del_timer_sync(&device->request_timer); 2172 2179 ··· 2180 2187 if (device->this_bdev) 2181 2188 bdput(device->this_bdev); 2182 2189 2183 - drbd_free_bc(device->ldev); 2190 + drbd_free_ldev(device->ldev); 2184 2191 device->ldev = NULL; 2185 2192 2186 2193 drbd_release_all_peer_reqs(device); ··· 2193 2200 2194 2201 if (device->bitmap) /* should no longer be there. */ 2195 2202 drbd_bm_cleanup(device); 2196 - __free_page(device->md_io_page); 2203 + __free_page(device->md_io.page); 2197 2204 put_disk(device->vdisk); 2198 2205 blk_cleanup_queue(device->rq_queue); 2199 2206 kfree(device->rs_plan_s); 2200 - kfree(first_peer_device(device)); 2201 - kfree(device); 2202 2207 2203 - for_each_connection(connection, resource) 2204 - kref_put(&connection->kref, drbd_destroy_connection); 2208 + /* not for_each_connection(connection, resource): 2209 + * those may have been cleaned up and disassociated already. 
2210 + */ 2211 + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { 2212 + kref_put(&peer_device->connection->kref, drbd_destroy_connection); 2213 + kfree(peer_device); 2214 + } 2215 + memset(device, 0xfd, sizeof(*device)); 2216 + kfree(device); 2205 2217 kref_put(&resource->kref, drbd_destroy_resource); 2206 2218 } 2207 2219 ··· 2234 2236 list_for_each_entry_safe(req, tmp, &writes, tl_requests) { 2235 2237 struct drbd_device *device = req->device; 2236 2238 struct bio *bio = req->master_bio; 2237 - unsigned long start_time = req->start_time; 2239 + unsigned long start_jif = req->start_jif; 2238 2240 bool expected; 2239 2241 2240 2242 expected = ··· 2269 2271 /* We are not just doing generic_make_request(), 2270 2272 * as we want to keep the start_time information. */ 2271 2273 inc_ap_bio(device); 2272 - __drbd_make_request(device, bio, start_time); 2274 + __drbd_make_request(device, bio, start_jif); 2273 2275 } 2274 2276 } 2275 2277 2278 + /* called via drbd_req_put_completion_ref(), 2279 + * holds resource->req_lock */ 2276 2280 void drbd_restart_request(struct drbd_request *req) 2277 2281 { 2278 2282 unsigned long flags; ··· 2298 2298 idr_destroy(&resource->devices); 2299 2299 free_cpumask_var(resource->cpu_mask); 2300 2300 kfree(resource->name); 2301 + memset(resource, 0xf2, sizeof(*resource)); 2301 2302 kfree(resource); 2302 2303 } 2303 2304 ··· 2308 2307 2309 2308 for_each_connection_safe(connection, tmp, resource) { 2310 2309 list_del(&connection->connections); 2310 + drbd_debugfs_connection_cleanup(connection); 2311 2311 kref_put(&connection->kref, drbd_destroy_connection); 2312 2312 } 2313 + drbd_debugfs_resource_cleanup(resource); 2313 2314 kref_put(&resource->kref, drbd_destroy_resource); 2314 2315 } 2315 2316 ··· 2320 2317 unsigned int i; 2321 2318 struct drbd_device *device; 2322 2319 struct drbd_resource *resource, *tmp; 2323 - 2324 - unregister_reboot_notifier(&drbd_notifier); 2325 2320 2326 2321 /* first remove proc, 2327 2322 * 
drbdsetup uses it's presence to detect ··· 2336 2335 destroy_workqueue(retry.wq); 2337 2336 2338 2337 drbd_genl_unregister(); 2338 + drbd_debugfs_cleanup(); 2339 2339 2340 2340 idr_for_each_entry(&drbd_devices, device, i) 2341 2341 drbd_delete_device(device); ··· 2352 2350 2353 2351 idr_destroy(&drbd_devices); 2354 2352 2355 - printk(KERN_INFO "drbd: module cleanup done.\n"); 2353 + pr_info("module cleanup done.\n"); 2356 2354 } 2357 2355 2358 2356 /** ··· 2541 2539 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { 2542 2540 err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, 2543 2541 cpumask_bits(new_cpu_mask), nr_cpu_ids); 2542 + if (err == -EOVERFLOW) { 2543 + /* So what. mask it out. */ 2544 + cpumask_var_t tmp_cpu_mask; 2545 + if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) { 2546 + cpumask_setall(tmp_cpu_mask); 2547 + cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask); 2548 + drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n", 2549 + res_opts->cpu_mask, 2550 + strlen(res_opts->cpu_mask) > 12 ? "..." 
: "", 2551 + nr_cpu_ids); 2552 + free_cpumask_var(tmp_cpu_mask); 2553 + err = 0; 2554 + } 2555 + } 2544 2556 if (err) { 2545 2557 drbd_warn(resource, "bitmap_parse() failed with %d\n", err); 2546 2558 /* retcode = ERR_CPU_MASK_PARSE; */ ··· 2595 2579 kref_init(&resource->kref); 2596 2580 idr_init(&resource->devices); 2597 2581 INIT_LIST_HEAD(&resource->connections); 2582 + resource->write_ordering = WO_bdev_flush; 2598 2583 list_add_tail_rcu(&resource->resources, &drbd_resources); 2599 2584 mutex_init(&resource->conf_update); 2600 2585 mutex_init(&resource->adm_mutex); 2601 2586 spin_lock_init(&resource->req_lock); 2587 + drbd_debugfs_resource_add(resource); 2602 2588 return resource; 2603 2589 2604 2590 fail_free_name: ··· 2611 2593 return NULL; 2612 2594 } 2613 2595 2614 - /* caller must be under genl_lock() */ 2596 + /* caller must be under adm_mutex */ 2615 2597 struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) 2616 2598 { 2617 2599 struct drbd_resource *resource; ··· 2635 2617 INIT_LIST_HEAD(&connection->current_epoch->list); 2636 2618 connection->epochs = 1; 2637 2619 spin_lock_init(&connection->epoch_lock); 2638 - connection->write_ordering = WO_bdev_flush; 2639 2620 2640 2621 connection->send.seen_any_write_yet = false; 2641 2622 connection->send.current_epoch_nr = 0; ··· 2669 2652 2670 2653 kref_get(&resource->kref); 2671 2654 list_add_tail_rcu(&connection->connections, &resource->connections); 2655 + drbd_debugfs_connection_add(connection); 2672 2656 return connection; 2673 2657 2674 2658 fail_resource: ··· 2698 2680 drbd_free_socket(&connection->data); 2699 2681 kfree(connection->int_dig_in); 2700 2682 kfree(connection->int_dig_vv); 2683 + memset(connection, 0xfc, sizeof(*connection)); 2701 2684 kfree(connection); 2702 2685 kref_put(&resource->kref, drbd_destroy_resource); 2703 2686 } ··· 2713 2694 return -ENOMEM; 2714 2695 2715 2696 INIT_WORK(&device->submit.worker, do_submit); 2716 - spin_lock_init(&device->submit.lock); 
2717 2697 INIT_LIST_HEAD(&device->submit.writes); 2718 2698 return 0; 2719 2699 } ··· 2782 2764 blk_queue_merge_bvec(q, drbd_merge_bvec); 2783 2765 q->queue_lock = &resource->req_lock; 2784 2766 2785 - device->md_io_page = alloc_page(GFP_KERNEL); 2786 - if (!device->md_io_page) 2767 + device->md_io.page = alloc_page(GFP_KERNEL); 2768 + if (!device->md_io.page) 2787 2769 goto out_no_io_page; 2788 2770 2789 2771 if (drbd_bm_init(device)) ··· 2812 2794 kref_get(&device->kref); 2813 2795 2814 2796 INIT_LIST_HEAD(&device->peer_devices); 2797 + INIT_LIST_HEAD(&device->pending_bitmap_io); 2815 2798 for_each_connection(connection, resource) { 2816 2799 peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); 2817 2800 if (!peer_device) ··· 2848 2829 for_each_peer_device(peer_device, device) 2849 2830 drbd_connected(peer_device); 2850 2831 } 2851 - 2832 + /* move to create_peer_device() */ 2833 + for_each_peer_device(peer_device, device) 2834 + drbd_debugfs_peer_device_add(peer_device); 2835 + drbd_debugfs_device_add(device); 2852 2836 return NO_ERROR; 2853 2837 2854 2838 out_idr_remove_vol: ··· 2875 2853 out_no_minor_idr: 2876 2854 drbd_bm_cleanup(device); 2877 2855 out_no_bitmap: 2878 - __free_page(device->md_io_page); 2856 + __free_page(device->md_io.page); 2879 2857 out_no_io_page: 2880 2858 put_disk(disk); 2881 2859 out_no_disk: ··· 2890 2868 { 2891 2869 struct drbd_resource *resource = device->resource; 2892 2870 struct drbd_connection *connection; 2871 + struct drbd_peer_device *peer_device; 2893 2872 int refs = 3; 2894 2873 2874 + /* move to free_peer_device() */ 2875 + for_each_peer_device(peer_device, device) 2876 + drbd_debugfs_peer_device_cleanup(peer_device); 2877 + drbd_debugfs_device_cleanup(device); 2895 2878 for_each_connection(connection, resource) { 2896 2879 idr_remove(&connection->peer_devices, device->vnr); 2897 2880 refs++; ··· 2908 2881 kref_sub(&device->kref, refs, drbd_destroy_device); 2909 2882 } 2910 2883 2911 - int __init 
drbd_init(void) 2884 + static int __init drbd_init(void) 2912 2885 { 2913 2886 int err; 2914 2887 2915 2888 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { 2916 - printk(KERN_ERR 2917 - "drbd: invalid minor_count (%d)\n", minor_count); 2889 + pr_err("invalid minor_count (%d)\n", minor_count); 2918 2890 #ifdef MODULE 2919 2891 return -EINVAL; 2920 2892 #else ··· 2923 2897 2924 2898 err = register_blkdev(DRBD_MAJOR, "drbd"); 2925 2899 if (err) { 2926 - printk(KERN_ERR 2927 - "drbd: unable to register block device major %d\n", 2900 + pr_err("unable to register block device major %d\n", 2928 2901 DRBD_MAJOR); 2929 2902 return err; 2930 2903 } 2931 - 2932 - register_reboot_notifier(&drbd_notifier); 2933 2904 2934 2905 /* 2935 2906 * allocate all necessary structs ··· 2941 2918 2942 2919 err = drbd_genl_register(); 2943 2920 if (err) { 2944 - printk(KERN_ERR "drbd: unable to register generic netlink family\n"); 2921 + pr_err("unable to register generic netlink family\n"); 2945 2922 goto fail; 2946 2923 } 2947 2924 ··· 2952 2929 err = -ENOMEM; 2953 2930 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); 2954 2931 if (!drbd_proc) { 2955 - printk(KERN_ERR "drbd: unable to register proc file\n"); 2932 + pr_err("unable to register proc file\n"); 2956 2933 goto fail; 2957 2934 } 2958 2935 2959 2936 retry.wq = create_singlethread_workqueue("drbd-reissue"); 2960 2937 if (!retry.wq) { 2961 - printk(KERN_ERR "drbd: unable to create retry workqueue\n"); 2938 + pr_err("unable to create retry workqueue\n"); 2962 2939 goto fail; 2963 2940 } 2964 2941 INIT_WORK(&retry.worker, do_retry); 2965 2942 spin_lock_init(&retry.lock); 2966 2943 INIT_LIST_HEAD(&retry.writes); 2967 2944 2968 - printk(KERN_INFO "drbd: initialized. " 2945 + if (drbd_debugfs_init()) 2946 + pr_notice("failed to initialize debugfs -- will not be available\n"); 2947 + 2948 + pr_info("initialized. 
" 2969 2949 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", 2970 2950 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); 2971 - printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); 2972 - printk(KERN_INFO "drbd: registered as block device major %d\n", 2973 - DRBD_MAJOR); 2974 - 2951 + pr_info("%s\n", drbd_buildtag()); 2952 + pr_info("registered as block device major %d\n", DRBD_MAJOR); 2975 2953 return 0; /* Success! */ 2976 2954 2977 2955 fail: 2978 2956 drbd_cleanup(); 2979 2957 if (err == -ENOMEM) 2980 - printk(KERN_ERR "drbd: ran out of memory\n"); 2958 + pr_err("ran out of memory\n"); 2981 2959 else 2982 - printk(KERN_ERR "drbd: initialization failure\n"); 2960 + pr_err("initialization failure\n"); 2983 2961 return err; 2984 2962 } 2985 2963 2986 - void drbd_free_bc(struct drbd_backing_dev *ldev) 2964 + void drbd_free_ldev(struct drbd_backing_dev *ldev) 2987 2965 { 2988 2966 if (ldev == NULL) 2989 2967 return; ··· 2996 2972 kfree(ldev); 2997 2973 } 2998 2974 2975 + static void drbd_free_one_sock(struct drbd_socket *ds) 2976 + { 2977 + struct socket *s; 2978 + mutex_lock(&ds->mutex); 2979 + s = ds->socket; 2980 + ds->socket = NULL; 2981 + mutex_unlock(&ds->mutex); 2982 + if (s) { 2983 + /* so debugfs does not need to mutex_lock() */ 2984 + synchronize_rcu(); 2985 + kernel_sock_shutdown(s, SHUT_RDWR); 2986 + sock_release(s); 2987 + } 2988 + } 2989 + 2999 2990 void drbd_free_sock(struct drbd_connection *connection) 3000 2991 { 3001 - if (connection->data.socket) { 3002 - mutex_lock(&connection->data.mutex); 3003 - kernel_sock_shutdown(connection->data.socket, SHUT_RDWR); 3004 - sock_release(connection->data.socket); 3005 - connection->data.socket = NULL; 3006 - mutex_unlock(&connection->data.mutex); 3007 - } 3008 - if (connection->meta.socket) { 3009 - mutex_lock(&connection->meta.mutex); 3010 - kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR); 3011 - sock_release(connection->meta.socket); 3012 - connection->meta.socket = NULL; 3013 - 
mutex_unlock(&connection->meta.mutex); 3014 - } 2992 + if (connection->data.socket) 2993 + drbd_free_one_sock(&connection->data); 2994 + if (connection->meta.socket) 2995 + drbd_free_one_sock(&connection->meta); 3015 2996 } 3016 2997 3017 2998 /* meta data management */ ··· 3122 3093 if (!get_ldev_if_state(device, D_FAILED)) 3123 3094 return; 3124 3095 3125 - buffer = drbd_md_get_buffer(device); 3096 + buffer = drbd_md_get_buffer(device, __func__); 3126 3097 if (!buffer) 3127 3098 goto out; 3128 3099 ··· 3282 3253 if (device->state.disk != D_DISKLESS) 3283 3254 return ERR_DISK_CONFIGURED; 3284 3255 3285 - buffer = drbd_md_get_buffer(device); 3256 + buffer = drbd_md_get_buffer(device, __func__); 3286 3257 if (!buffer) 3287 3258 return ERR_NOMEM; 3288 3259 ··· 3495 3466 * 3496 3467 * Sets all bits in the bitmap and writes the whole bitmap to stable storage. 3497 3468 */ 3498 - int drbd_bmio_set_n_write(struct drbd_device *device) 3469 + int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) 3499 3470 { 3500 3471 int rv = -EIO; 3501 3472 3502 - if (get_ldev_if_state(device, D_ATTACHING)) { 3503 - drbd_md_set_flag(device, MDF_FULL_SYNC); 3473 + drbd_md_set_flag(device, MDF_FULL_SYNC); 3474 + drbd_md_sync(device); 3475 + drbd_bm_set_all(device); 3476 + 3477 + rv = drbd_bm_write(device); 3478 + 3479 + if (!rv) { 3480 + drbd_md_clear_flag(device, MDF_FULL_SYNC); 3504 3481 drbd_md_sync(device); 3505 - drbd_bm_set_all(device); 3506 - 3507 - rv = drbd_bm_write(device); 3508 - 3509 - if (!rv) { 3510 - drbd_md_clear_flag(device, MDF_FULL_SYNC); 3511 - drbd_md_sync(device); 3512 - } 3513 - 3514 - put_ldev(device); 3515 3482 } 3516 3483 3517 3484 return rv; ··· 3519 3494 * 3520 3495 * Clears all bits in the bitmap and writes the whole bitmap to stable storage. 
3521 3496 */ 3522 - int drbd_bmio_clear_n_write(struct drbd_device *device) 3497 + int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) 3523 3498 { 3524 - int rv = -EIO; 3525 - 3526 3499 drbd_resume_al(device); 3527 - if (get_ldev_if_state(device, D_ATTACHING)) { 3528 - drbd_bm_clear_all(device); 3529 - rv = drbd_bm_write(device); 3530 - put_ldev(device); 3531 - } 3532 - 3533 - return rv; 3500 + drbd_bm_clear_all(device); 3501 + return drbd_bm_write(device); 3534 3502 } 3535 3503 3536 3504 static int w_bitmap_io(struct drbd_work *w, int unused) ··· 3555 3537 return 0; 3556 3538 } 3557 3539 3558 - void drbd_ldev_destroy(struct drbd_device *device) 3559 - { 3560 - lc_destroy(device->resync); 3561 - device->resync = NULL; 3562 - lc_destroy(device->act_log); 3563 - device->act_log = NULL; 3564 - __no_warn(local, 3565 - drbd_free_bc(device->ldev); 3566 - device->ldev = NULL;); 3567 - 3568 - clear_bit(GO_DISKLESS, &device->flags); 3569 - } 3570 - 3571 - static int w_go_diskless(struct drbd_work *w, int unused) 3572 - { 3573 - struct drbd_device *device = 3574 - container_of(w, struct drbd_device, go_diskless); 3575 - 3576 - D_ASSERT(device, device->state.disk == D_FAILED); 3577 - /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will 3578 - * inc/dec it frequently. Once we are D_DISKLESS, no one will touch 3579 - * the protected members anymore, though, so once put_ldev reaches zero 3580 - * again, it will be safe to free them. */ 3581 - 3582 - /* Try to write changed bitmap pages, read errors may have just 3583 - * set some bits outside the area covered by the activity log. 3584 - * 3585 - * If we have an IO error during the bitmap writeout, 3586 - * we will want a full sync next time, just in case. 3587 - * (Do we want a specific meta data flag for this?) 3588 - * 3589 - * If that does not make it to stable storage either, 3590 - * we cannot do anything about that anymore. 
3591 - * 3592 - * We still need to check if both bitmap and ldev are present, we may 3593 - * end up here after a failed attach, before ldev was even assigned. 3594 - */ 3595 - if (device->bitmap && device->ldev) { 3596 - /* An interrupted resync or similar is allowed to recounts bits 3597 - * while we detach. 3598 - * Any modifications would not be expected anymore, though. 3599 - */ 3600 - if (drbd_bitmap_io_from_worker(device, drbd_bm_write, 3601 - "detach", BM_LOCKED_TEST_ALLOWED)) { 3602 - if (test_bit(WAS_READ_ERROR, &device->flags)) { 3603 - drbd_md_set_flag(device, MDF_FULL_SYNC); 3604 - drbd_md_sync(device); 3605 - } 3606 - } 3607 - } 3608 - 3609 - drbd_force_state(device, NS(disk, D_DISKLESS)); 3610 - return 0; 3611 - } 3612 - 3613 3540 /** 3614 3541 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3615 3542 * @device: DRBD device. ··· 3566 3603 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be 3567 3604 * called from worker context. It MUST NOT be used while a previous such 3568 3605 * work is still pending! 3606 + * 3607 + * Its worker function encloses the call of io_fn() by get_ldev() and 3608 + * put_ldev(). 3569 3609 */ 3570 3610 void drbd_queue_bitmap_io(struct drbd_device *device, 3571 3611 int (*io_fn)(struct drbd_device *), ··· 3651 3685 static void md_sync_timer_fn(unsigned long data) 3652 3686 { 3653 3687 struct drbd_device *device = (struct drbd_device *) data; 3654 - 3655 - /* must not double-queue! */ 3656 - if (list_empty(&device->md_sync_work.list)) 3657 - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, 3658 - &device->md_sync_work); 3659 - } 3660 - 3661 - static int w_md_sync(struct drbd_work *w, int unused) 3662 - { 3663 - struct drbd_device *device = 3664 - container_of(w, struct drbd_device, md_sync_work); 3665 - 3666 - drbd_warn(device, "md_sync_timer expired! 
Worker calls drbd_md_sync().\n"); 3667 - #ifdef DEBUG 3668 - drbd_warn(device, "last md_mark_dirty: %s:%u\n", 3669 - device->last_md_mark_dirty.func, device->last_md_mark_dirty.line); 3670 - #endif 3671 - drbd_md_sync(device); 3672 - return 0; 3688 + drbd_device_post_work(device, MD_SYNC); 3673 3689 } 3674 3690 3675 3691 const char *cmdname(enum drbd_packet cmd)
+66 -44
drivers/block/drbd/drbd_nl.c
··· 23 23 24 24 */ 25 25 26 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 27 + 26 28 #include <linux/module.h> 27 29 #include <linux/drbd.h> 28 30 #include <linux/in.h> ··· 87 85 { 88 86 genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); 89 87 if (genlmsg_reply(skb, info)) 90 - printk(KERN_ERR "drbd: error sending genl reply\n"); 88 + pr_err("error sending genl reply\n"); 91 89 } 92 90 93 91 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only ··· 560 558 } 561 559 562 560 enum drbd_state_rv 563 - drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) 561 + drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force) 564 562 { 563 + struct drbd_peer_device *const peer_device = first_peer_device(device); 564 + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 565 565 const int max_tries = 4; 566 566 enum drbd_state_rv rv = SS_UNKNOWN_ERROR; 567 567 struct net_conf *nc; ··· 611 607 device->state.disk == D_CONSISTENT && mask.pdsk == 0) { 612 608 D_ASSERT(device, device->state.pdsk == D_UNKNOWN); 613 609 614 - if (conn_try_outdate_peer(first_peer_device(device)->connection)) { 610 + if (conn_try_outdate_peer(connection)) { 615 611 val.disk = D_UP_TO_DATE; 616 612 mask.disk = D_MASK; 617 613 } ··· 621 617 if (rv == SS_NOTHING_TO_DO) 622 618 goto out; 623 619 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { 624 - if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) { 620 + if (!conn_try_outdate_peer(connection) && force) { 625 621 drbd_warn(device, "Forced into split brain situation!\n"); 626 622 mask.pdsk = D_MASK; 627 623 val.pdsk = D_OUTDATED; ··· 634 630 retry at most once more in this case. */ 635 631 int timeo; 636 632 rcu_read_lock(); 637 - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 633 + nc = rcu_dereference(connection->net_conf); 638 634 timeo = nc ? 
(nc->ping_timeo + 1) * HZ / 10 : 1; 639 635 rcu_read_unlock(); 640 636 schedule_timeout_interruptible(timeo); ··· 663 659 /* FIXME also wait for all pending P_BARRIER_ACK? */ 664 660 665 661 if (new_role == R_SECONDARY) { 666 - set_disk_ro(device->vdisk, true); 667 662 if (get_ldev(device)) { 668 663 device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; 669 664 put_ldev(device); 670 665 } 671 666 } else { 672 - /* Called from drbd_adm_set_role only. 673 - * We are still holding the conf_update mutex. */ 674 - nc = first_peer_device(device)->connection->net_conf; 667 + mutex_lock(&device->resource->conf_update); 668 + nc = connection->net_conf; 675 669 if (nc) 676 670 nc->discard_my_data = 0; /* without copy; single bit op is atomic */ 671 + mutex_unlock(&device->resource->conf_update); 677 672 678 - set_disk_ro(device->vdisk, false); 679 673 if (get_ldev(device)) { 680 674 if (((device->state.conn < C_CONNECTED || 681 675 device->state.pdsk <= D_FAILED) ··· 691 689 if (device->state.conn >= C_WF_REPORT_PARAMS) { 692 690 /* if this was forced, we should consider sync */ 693 691 if (forced) 694 - drbd_send_uuids(first_peer_device(device)); 695 - drbd_send_current_state(first_peer_device(device)); 692 + drbd_send_uuids(peer_device); 693 + drbd_send_current_state(peer_device); 696 694 } 697 695 698 696 drbd_md_sync(device); 699 - 697 + set_disk_ro(device->vdisk, new_role == R_SECONDARY); 700 698 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); 701 699 out: 702 700 mutex_unlock(device->state_mutex); ··· 893 891 * still lock the act_log to not trigger ASSERTs there. 
894 892 */ 895 893 drbd_suspend_io(device); 896 - buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */ 894 + buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ 897 895 if (!buffer) { 898 896 drbd_resume_io(device); 899 897 return DS_ERROR; ··· 973 971 if (la_size_changed || md_moved || rs) { 974 972 u32 prev_flags; 975 973 974 + /* We do some synchronous IO below, which may take some time. 975 + * Clear the timer, to avoid scary "timer expired!" messages, 976 + * "Superblock" is written out at least twice below, anyways. */ 977 + del_timer(&device->md_sync_timer); 976 978 drbd_al_shrink(device); /* All extents inactive. */ 977 979 978 980 prev_flags = md->flags; ··· 1122 1116 return 0; 1123 1117 } 1124 1118 1125 - static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size) 1119 + static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, 1120 + unsigned int max_bio_size) 1126 1121 { 1127 1122 struct request_queue * const q = device->rq_queue; 1128 1123 unsigned int max_hw_sectors = max_bio_size >> 9; 1129 1124 unsigned int max_segments = 0; 1130 1125 struct request_queue *b = NULL; 1131 1126 1132 - if (get_ldev_if_state(device, D_ATTACHING)) { 1133 - b = device->ldev->backing_bdev->bd_disk->queue; 1127 + if (bdev) { 1128 + b = bdev->backing_bdev->bd_disk->queue; 1134 1129 1135 1130 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 1136 1131 rcu_read_lock(); ··· 1176 1169 b->backing_dev_info.ra_pages); 1177 1170 q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; 1178 1171 } 1179 - put_ldev(device); 1180 1172 } 1181 1173 } 1182 1174 1183 - void drbd_reconsider_max_bio_size(struct drbd_device *device) 1175 + void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) 1184 1176 { 1185 1177 unsigned int now, new, local, peer; 1186 1178 ··· 1187 1181 local = device->local_max_bio_size; /* Eventually last known value, from 
volatile memory */ 1188 1182 peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ 1189 1183 1190 - if (get_ldev_if_state(device, D_ATTACHING)) { 1191 - local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; 1184 + if (bdev) { 1185 + local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; 1192 1186 device->local_max_bio_size = local; 1193 - put_ldev(device); 1194 1187 } 1195 1188 local = min(local, DRBD_MAX_BIO_SIZE); 1196 1189 ··· 1222 1217 if (new != now) 1223 1218 drbd_info(device, "max BIO size = %u\n", new); 1224 1219 1225 - drbd_setup_queue_param(device, new); 1220 + drbd_setup_queue_param(device, bdev, new); 1226 1221 } 1227 1222 1228 1223 /* Starts the worker thread */ ··· 1302 1297 return max_al_nr; 1303 1298 1304 1299 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; 1300 + } 1301 + 1302 + static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) 1303 + { 1304 + return a->disk_barrier != b->disk_barrier || 1305 + a->disk_flushes != b->disk_flushes || 1306 + a->disk_drain != b->disk_drain; 1305 1307 } 1306 1308 1307 1309 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) ··· 1417 1405 else 1418 1406 set_bit(MD_NO_FUA, &device->flags); 1419 1407 1420 - drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); 1408 + if (write_ordering_changed(old_disk_conf, new_disk_conf)) 1409 + drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); 1421 1410 1422 1411 drbd_md_sync(device); 1423 1412 ··· 1453 1440 { 1454 1441 struct drbd_config_context adm_ctx; 1455 1442 struct drbd_device *device; 1443 + struct drbd_peer_device *peer_device; 1444 + struct drbd_connection *connection; 1456 1445 int err; 1457 1446 enum drbd_ret_code retcode; 1458 1447 enum determine_dev_size dd; ··· 1477 1462 1478 1463 device = adm_ctx.device; 1479 1464 mutex_lock(&adm_ctx.resource->adm_mutex); 1480 - 
conn_reconfig_start(first_peer_device(device)->connection); 1465 + peer_device = first_peer_device(device); 1466 + connection = peer_device ? peer_device->connection : NULL; 1467 + conn_reconfig_start(connection); 1481 1468 1482 1469 /* if you want to reconfigure, please tear down first */ 1483 1470 if (device->state.disk > D_DISKLESS) { ··· 1490 1473 * drbd_ldev_destroy is done already, we may end up here very fast, 1491 1474 * e.g. if someone calls attach from the on-io-error handler, 1492 1475 * to realize a "hot spare" feature (not that I'd recommend that) */ 1493 - wait_event(device->misc_wait, !atomic_read(&device->local_cnt)); 1476 + wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags)); 1494 1477 1495 1478 /* make sure there is no leftover from previous force-detach attempts */ 1496 1479 clear_bit(FORCE_DETACH, &device->flags); ··· 1546 1529 goto fail; 1547 1530 1548 1531 rcu_read_lock(); 1549 - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 1532 + nc = rcu_dereference(connection->net_conf); 1550 1533 if (nc) { 1551 1534 if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { 1552 1535 rcu_read_unlock(); ··· 1666 1649 */ 1667 1650 wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); 1668 1651 /* and for any other previously queued work */ 1669 - drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); 1652 + drbd_flush_workqueue(&connection->sender_work); 1670 1653 1671 1654 rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); 1672 1655 retcode = rv; /* FIXME: Type mismatch. 
*/ ··· 1727 1710 new_disk_conf = NULL; 1728 1711 new_plan = NULL; 1729 1712 1730 - drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); 1713 + drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); 1731 1714 1732 1715 if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) 1733 1716 set_bit(CRASHED_PRIMARY, &device->flags); ··· 1743 1726 device->read_cnt = 0; 1744 1727 device->writ_cnt = 0; 1745 1728 1746 - drbd_reconsider_max_bio_size(device); 1729 + drbd_reconsider_max_bio_size(device, device->ldev); 1747 1730 1748 1731 /* If I am currently not R_PRIMARY, 1749 1732 * but meta data primary indicator is set, ··· 1862 1845 1863 1846 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); 1864 1847 put_ldev(device); 1865 - conn_reconfig_done(first_peer_device(device)->connection); 1848 + conn_reconfig_done(connection); 1866 1849 mutex_unlock(&adm_ctx.resource->adm_mutex); 1867 1850 drbd_adm_finish(&adm_ctx, info, retcode); 1868 1851 return 0; ··· 1873 1856 drbd_force_state(device, NS(disk, D_DISKLESS)); 1874 1857 drbd_md_sync(device); 1875 1858 fail: 1876 - conn_reconfig_done(first_peer_device(device)->connection); 1859 + conn_reconfig_done(connection); 1877 1860 if (nbc) { 1878 1861 if (nbc->backing_bdev) 1879 1862 blkdev_put(nbc->backing_bdev, ··· 1905 1888 } 1906 1889 1907 1890 drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ 1908 - drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */ 1891 + drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ 1909 1892 retcode = drbd_request_state(device, NS(disk, D_FAILED)); 1910 1893 drbd_md_put_buffer(device); 1911 1894 /* D_FAILED will transition to DISKLESS. 
*/ ··· 2671 2654 if (retcode != NO_ERROR) 2672 2655 goto out; 2673 2656 2674 - mutex_lock(&adm_ctx.resource->adm_mutex); 2675 2657 device = adm_ctx.device; 2658 + if (!get_ldev(device)) { 2659 + retcode = ERR_NO_DISK; 2660 + goto out; 2661 + } 2662 + 2663 + mutex_lock(&adm_ctx.resource->adm_mutex); 2676 2664 2677 2665 /* If there is still bitmap IO pending, probably because of a previous 2678 2666 * resync just being finished, wait for it before requesting a new resync. ··· 2701 2679 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); 2702 2680 drbd_resume_io(device); 2703 2681 mutex_unlock(&adm_ctx.resource->adm_mutex); 2682 + put_ldev(device); 2704 2683 out: 2705 2684 drbd_adm_finish(&adm_ctx, info, retcode); 2706 2685 return 0; ··· 2727 2704 return 0; 2728 2705 } 2729 2706 2730 - static int drbd_bmio_set_susp_al(struct drbd_device *device) 2707 + static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) 2731 2708 { 2732 2709 int rv; 2733 2710 ··· 2748 2725 if (retcode != NO_ERROR) 2749 2726 goto out; 2750 2727 2751 - mutex_lock(&adm_ctx.resource->adm_mutex); 2752 2728 device = adm_ctx.device; 2729 + if (!get_ldev(device)) { 2730 + retcode = ERR_NO_DISK; 2731 + goto out; 2732 + } 2733 + 2734 + mutex_lock(&adm_ctx.resource->adm_mutex); 2753 2735 2754 2736 /* If there is still bitmap IO pending, probably because of a previous 2755 2737 * resync just being finished, wait for it before requesting a new resync. 
··· 2781 2753 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); 2782 2754 drbd_resume_io(device); 2783 2755 mutex_unlock(&adm_ctx.resource->adm_mutex); 2756 + put_ldev(device); 2784 2757 out: 2785 2758 drbd_adm_finish(&adm_ctx, info, retcode); 2786 2759 return 0; ··· 2921 2892 return list_first_entry(&resource->connections, struct drbd_connection, connections); 2922 2893 } 2923 2894 2924 - int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, 2895 + static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, 2925 2896 const struct sib_info *sib) 2926 2897 { 2927 2898 struct drbd_resource *resource = device->resource; ··· 3650 3621 struct drbd_genlmsghdr *d_out; 3651 3622 unsigned seq; 3652 3623 int err = -ENOMEM; 3653 - 3654 - if (sib->sib_reason == SIB_SYNC_PROGRESS) { 3655 - if (time_after(jiffies, device->rs_last_bcast + HZ)) 3656 - device->rs_last_bcast = jiffies; 3657 - else 3658 - return; 3659 - } 3660 3624 3661 3625 seq = atomic_inc_return(&drbd_genl_seq); 3662 3626 msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+79 -46
drivers/block/drbd/drbd_proc.c
··· 60 60 seq_printf(seq, "%ld", v); 61 61 } 62 62 63 + static void drbd_get_syncer_progress(struct drbd_device *device, 64 + union drbd_dev_state state, unsigned long *rs_total, 65 + unsigned long *bits_left, unsigned int *per_mil_done) 66 + { 67 + /* this is to break it at compile time when we change that, in case we 68 + * want to support more than (1<<32) bits on a 32bit arch. */ 69 + typecheck(unsigned long, device->rs_total); 70 + *rs_total = device->rs_total; 71 + 72 + /* note: both rs_total and rs_left are in bits, i.e. in 73 + * units of BM_BLOCK_SIZE. 74 + * for the percentage, we don't care. */ 75 + 76 + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) 77 + *bits_left = device->ov_left; 78 + else 79 + *bits_left = drbd_bm_total_weight(device) - device->rs_failed; 80 + /* >> 10 to prevent overflow, 81 + * +1 to prevent division by zero */ 82 + if (*bits_left > *rs_total) { 83 + /* D'oh. Maybe a logic bug somewhere. More likely just a race 84 + * between state change and reset of rs_total. 85 + */ 86 + *bits_left = *rs_total; 87 + *per_mil_done = *rs_total ? 0 : 1000; 88 + } else { 89 + /* Make sure the division happens in long context. 90 + * We allow up to one petabyte storage right now, 91 + * at a granularity of 4k per bit that is 2**38 bits. 92 + * After shift right and multiplication by 1000, 93 + * this should still fit easily into a 32bit long, 94 + * so we don't need a 64bit division on 32bit arch. 95 + * Note: currently we don't support such large bitmaps on 32bit 96 + * arch anyways, but no harm done to be prepared for it here. 97 + */ 98 + unsigned int shift = *rs_total > UINT_MAX ? 16 : 10; 99 + unsigned long left = *bits_left >> shift; 100 + unsigned long total = 1UL + (*rs_total >> shift); 101 + unsigned long tmp = 1000UL - left * 1000UL/total; 102 + *per_mil_done = tmp; 103 + } 104 + } 105 + 106 + 63 107 /*lge 64 108 * progress bars shamelessly adapted from driver/md/md.c 65 109 * output looks like 66 110 * [=====>..............] 
33.5% (23456/123456) 67 111 * finish: 2:20:20 speed: 6,345 (6,456) K/sec 68 112 */ 69 - static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq) 113 + static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq, 114 + union drbd_dev_state state) 70 115 { 71 - unsigned long db, dt, dbdt, rt, rs_left; 116 + unsigned long db, dt, dbdt, rt, rs_total, rs_left; 72 117 unsigned int res; 73 118 int i, x, y; 74 119 int stalled = 0; 75 120 76 - drbd_get_syncer_progress(device, &rs_left, &res); 121 + drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res); 77 122 78 123 x = res/50; 79 124 y = 20-x; ··· 130 85 seq_printf(seq, "."); 131 86 seq_printf(seq, "] "); 132 87 133 - if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) 88 + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) 134 89 seq_printf(seq, "verified:"); 135 90 else 136 91 seq_printf(seq, "sync'ed:"); 137 92 seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); 138 93 139 94 /* if more than a few GB, display in MB */ 140 - if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) 95 + if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) 141 96 seq_printf(seq, "(%lu/%lu)M", 142 97 (unsigned long) Bit2KB(rs_left >> 10), 143 - (unsigned long) Bit2KB(device->rs_total >> 10)); 98 + (unsigned long) Bit2KB(rs_total >> 10)); 144 99 else 145 100 seq_printf(seq, "(%lu/%lu)K\n\t", 146 101 (unsigned long) Bit2KB(rs_left), 147 - (unsigned long) Bit2KB(device->rs_total)); 102 + (unsigned long) Bit2KB(rs_total)); 148 103 149 104 /* see drivers/md/md.c 150 105 * We do not want to overflow, so the order of operands and ··· 195 150 dt = (jiffies - device->rs_start - device->rs_paused) / HZ; 196 151 if (dt == 0) 197 152 dt = 1; 198 - db = device->rs_total - rs_left; 153 + db = rs_total - rs_left; 199 154 dbdt = Bit2KB(db/dt); 200 155 seq_printf_with_thousands_grouping(seq, dbdt); 201 156 seq_printf(seq, ")"); 202 157 203 - if (device->state.conn == 
C_SYNC_TARGET || 204 - device->state.conn == C_VERIFY_S) { 158 + if (state.conn == C_SYNC_TARGET || 159 + state.conn == C_VERIFY_S) { 205 160 seq_printf(seq, " want: "); 206 161 seq_printf_with_thousands_grouping(seq, device->c_sync_rate); 207 162 } ··· 213 168 unsigned long bm_bits = drbd_bm_bits(device); 214 169 unsigned long bit_pos; 215 170 unsigned long long stop_sector = 0; 216 - if (device->state.conn == C_VERIFY_S || 217 - device->state.conn == C_VERIFY_T) { 171 + if (state.conn == C_VERIFY_S || 172 + state.conn == C_VERIFY_T) { 218 173 bit_pos = bm_bits - device->ov_left; 219 174 if (verify_can_do_stop_sector(device)) 220 175 stop_sector = device->ov_stop_sector; ··· 233 188 } 234 189 } 235 190 236 - static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) 237 - { 238 - struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); 239 - 240 - seq_printf(seq, "%5d %s %s\n", bme->rs_left, 241 - bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", 242 - bme->flags & BME_LOCKED ? 
"LOCKED" : "------" 243 - ); 244 - } 245 - 246 191 static int drbd_seq_show(struct seq_file *seq, void *v) 247 192 { 248 193 int i, prev_i = -1; 249 194 const char *sn; 250 195 struct drbd_device *device; 251 196 struct net_conf *nc; 197 + union drbd_dev_state state; 252 198 char wp; 253 199 254 200 static char write_ordering_chars[] = { ··· 277 241 seq_printf(seq, "\n"); 278 242 prev_i = i; 279 243 280 - sn = drbd_conn_str(device->state.conn); 244 + state = device->state; 245 + sn = drbd_conn_str(state.conn); 281 246 282 - if (device->state.conn == C_STANDALONE && 283 - device->state.disk == D_DISKLESS && 284 - device->state.role == R_SECONDARY) { 247 + if (state.conn == C_STANDALONE && 248 + state.disk == D_DISKLESS && 249 + state.role == R_SECONDARY) { 285 250 seq_printf(seq, "%2d: cs:Unconfigured\n", i); 286 251 } else { 287 252 /* reset device->congestion_reason */ ··· 295 258 " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " 296 259 "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", 297 260 i, sn, 298 - drbd_role_str(device->state.role), 299 - drbd_role_str(device->state.peer), 300 - drbd_disk_str(device->state.disk), 301 - drbd_disk_str(device->state.pdsk), 261 + drbd_role_str(state.role), 262 + drbd_role_str(state.peer), 263 + drbd_disk_str(state.disk), 264 + drbd_disk_str(state.pdsk), 302 265 wp, 303 266 drbd_suspended(device) ? 's' : 'r', 304 - device->state.aftr_isp ? 'a' : '-', 305 - device->state.peer_isp ? 'p' : '-', 306 - device->state.user_isp ? 'u' : '-', 267 + state.aftr_isp ? 'a' : '-', 268 + state.peer_isp ? 'p' : '-', 269 + state.user_isp ? 'u' : '-', 307 270 device->congestion_reason ?: '-', 308 271 test_bit(AL_SUSPENDED, &device->flags) ? 
's' : '-', 309 272 device->send_cnt/2, ··· 318 281 atomic_read(&device->unacked_cnt), 319 282 atomic_read(&device->ap_bio_cnt), 320 283 first_peer_device(device)->connection->epochs, 321 - write_ordering_chars[first_peer_device(device)->connection->write_ordering] 284 + write_ordering_chars[device->resource->write_ordering] 322 285 ); 323 286 seq_printf(seq, " oos:%llu\n", 324 287 Bit2KB((unsigned long long) 325 288 drbd_bm_total_weight(device))); 326 289 } 327 - if (device->state.conn == C_SYNC_SOURCE || 328 - device->state.conn == C_SYNC_TARGET || 329 - device->state.conn == C_VERIFY_S || 330 - device->state.conn == C_VERIFY_T) 331 - drbd_syncer_progress(device, seq); 290 + if (state.conn == C_SYNC_SOURCE || 291 + state.conn == C_SYNC_TARGET || 292 + state.conn == C_VERIFY_S || 293 + state.conn == C_VERIFY_T) 294 + drbd_syncer_progress(device, seq, state); 332 295 333 296 if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { 334 297 lc_seq_printf_stats(seq, device->resync); ··· 336 299 put_ldev(device); 337 300 } 338 301 339 - if (proc_details >= 2) { 340 - if (device->resync) { 341 - lc_seq_dump_details(seq, device->resync, "rs_left", 342 - resync_dump_detail); 343 - } 344 - } 302 + if (proc_details >= 2) 303 + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); 345 304 } 346 305 rcu_read_unlock(); 347 306 ··· 349 316 int err; 350 317 351 318 if (try_module_get(THIS_MODULE)) { 352 - err = single_open(file, drbd_seq_show, PDE_DATA(inode)); 319 + err = single_open(file, drbd_seq_show, NULL); 353 320 if (err) 354 321 module_put(THIS_MODULE); 355 322 return err;
+212 -106
drivers/block/drbd/drbd_receiver.c
··· 362 362 goto fail; 363 363 } 364 364 365 + memset(peer_req, 0, sizeof(*peer_req)); 366 + INIT_LIST_HEAD(&peer_req->w.list); 365 367 drbd_clear_interval(&peer_req->i); 366 368 peer_req->i.size = data_size; 367 369 peer_req->i.sector = sector; 368 - peer_req->i.local = false; 369 - peer_req->i.waiting = false; 370 - 371 - peer_req->epoch = NULL; 370 + peer_req->submit_jif = jiffies; 372 371 peer_req->peer_device = peer_device; 373 372 peer_req->pages = page; 374 - atomic_set(&peer_req->pending_bios, 0); 375 - peer_req->flags = 0; 376 373 /* 377 374 * The block_id is opaque to the receiver. It is not endianness 378 375 * converted, and sent back to the sender unchanged. ··· 386 389 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, 387 390 int is_net) 388 391 { 392 + might_sleep(); 389 393 if (peer_req->flags & EE_HAS_DIGEST) 390 394 kfree(peer_req->digest); 391 395 drbd_free_pages(device, peer_req->pages, is_net); 392 396 D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); 393 397 D_ASSERT(device, drbd_interval_empty(&peer_req->i)); 398 + if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { 399 + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 400 + drbd_al_complete_io(device, &peer_req->i); 401 + } 394 402 mempool_free(peer_req, drbd_ee_mempool); 395 403 } 396 404 ··· 793 791 { 794 792 unsigned int header_size = drbd_header_size(connection); 795 793 struct packet_info pi; 794 + struct net_conf *nc; 796 795 int err; 796 + 797 + rcu_read_lock(); 798 + nc = rcu_dereference(connection->net_conf); 799 + if (!nc) { 800 + rcu_read_unlock(); 801 + return -EIO; 802 + } 803 + sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; 804 + rcu_read_unlock(); 797 805 798 806 err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); 799 807 if (err != header_size) { ··· 821 809 * drbd_socket_okay() - Free the socket if its connection is not okay 822 810 * @sock: pointer to the pointer to the socket. 
823 811 */ 824 - static int drbd_socket_okay(struct socket **sock) 812 + static bool drbd_socket_okay(struct socket **sock) 825 813 { 826 814 int rr; 827 815 char tb[4]; ··· 839 827 return false; 840 828 } 841 829 } 830 + 831 + static bool connection_established(struct drbd_connection *connection, 832 + struct socket **sock1, 833 + struct socket **sock2) 834 + { 835 + struct net_conf *nc; 836 + int timeout; 837 + bool ok; 838 + 839 + if (!*sock1 || !*sock2) 840 + return false; 841 + 842 + rcu_read_lock(); 843 + nc = rcu_dereference(connection->net_conf); 844 + timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; 845 + rcu_read_unlock(); 846 + schedule_timeout_interruptible(timeout); 847 + 848 + ok = drbd_socket_okay(sock1); 849 + ok = drbd_socket_okay(sock2) && ok; 850 + 851 + return ok; 852 + } 853 + 842 854 /* Gets called if a connection is established, or if a new minor gets created 843 855 in a connection */ 844 856 int drbd_connected(struct drbd_peer_device *peer_device) ··· 904 868 struct drbd_socket sock, msock; 905 869 struct drbd_peer_device *peer_device; 906 870 struct net_conf *nc; 907 - int vnr, timeout, h, ok; 908 - bool discard_my_data; 871 + int vnr, timeout, h; 872 + bool discard_my_data, ok; 909 873 enum drbd_state_rv rv; 910 874 struct accept_wait_data ad = { 911 875 .connection = connection, ··· 949 913 } 950 914 } 951 915 952 - if (sock.socket && msock.socket) { 953 - rcu_read_lock(); 954 - nc = rcu_dereference(connection->net_conf); 955 - timeout = nc->ping_timeo * HZ / 10; 956 - rcu_read_unlock(); 957 - schedule_timeout_interruptible(timeout); 958 - ok = drbd_socket_okay(&sock.socket); 959 - ok = drbd_socket_okay(&msock.socket) && ok; 960 - if (ok) 961 - break; 962 - } 916 + if (connection_established(connection, &sock.socket, &msock.socket)) 917 + break; 963 918 964 919 retry: 965 920 s = drbd_wait_for_connect(connection, &ad); ··· 996 969 goto out_release_sockets; 997 970 } 998 971 999 - ok = drbd_socket_okay(&sock.socket); 1000 - 
ok = drbd_socket_okay(&msock.socket) && ok; 972 + ok = connection_established(connection, &sock.socket, &msock.socket); 1001 973 } while (!ok); 1002 974 1003 975 if (ad.s_listen) ··· 1177 1151 struct drbd_peer_device *peer_device; 1178 1152 int vnr; 1179 1153 1180 - if (connection->write_ordering >= WO_bdev_flush) { 1154 + if (connection->resource->write_ordering >= WO_bdev_flush) { 1181 1155 rcu_read_lock(); 1182 1156 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1183 1157 struct drbd_device *device = peer_device->device; ··· 1187 1161 kref_get(&device->kref); 1188 1162 rcu_read_unlock(); 1189 1163 1164 + /* Right now, we have only this one synchronous code path 1165 + * for flushes between request epochs. 1166 + * We may want to make those asynchronous, 1167 + * or at least parallelize the flushes to the volume devices. 1168 + */ 1169 + device->flush_jif = jiffies; 1170 + set_bit(FLUSH_PENDING, &device->flags); 1190 1171 rv = blkdev_issue_flush(device->ldev->backing_bdev, 1191 1172 GFP_NOIO, NULL); 1173 + clear_bit(FLUSH_PENDING, &device->flags); 1192 1174 if (rv) { 1193 1175 drbd_info(device, "local disk flush failed with status %d\n", rv); 1194 1176 /* would rather check on EOPNOTSUPP, but that is not reliable. 
1195 1177 * don't try again for ANY return value != 0 1196 1178 * if (rv == -EOPNOTSUPP) */ 1197 - drbd_bump_write_ordering(connection, WO_drain_io); 1179 + drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); 1198 1180 } 1199 1181 put_ldev(device); 1200 1182 kref_put(&device->kref, drbd_destroy_device); ··· 1291 1257 return rv; 1292 1258 } 1293 1259 1260 + static enum write_ordering_e 1261 + max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) 1262 + { 1263 + struct disk_conf *dc; 1264 + 1265 + dc = rcu_dereference(bdev->disk_conf); 1266 + 1267 + if (wo == WO_bdev_flush && !dc->disk_flushes) 1268 + wo = WO_drain_io; 1269 + if (wo == WO_drain_io && !dc->disk_drain) 1270 + wo = WO_none; 1271 + 1272 + return wo; 1273 + } 1274 + 1294 1275 /** 1295 1276 * drbd_bump_write_ordering() - Fall back to an other write ordering method 1296 1277 * @connection: DRBD connection. 1297 1278 * @wo: Write ordering method to try. 1298 1279 */ 1299 - void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo) 1280 + void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, 1281 + enum write_ordering_e wo) 1300 1282 { 1301 - struct disk_conf *dc; 1302 - struct drbd_peer_device *peer_device; 1283 + struct drbd_device *device; 1303 1284 enum write_ordering_e pwo; 1304 1285 int vnr; 1305 1286 static char *write_ordering_str[] = { ··· 1323 1274 [WO_bdev_flush] = "flush", 1324 1275 }; 1325 1276 1326 - pwo = connection->write_ordering; 1327 - wo = min(pwo, wo); 1277 + pwo = resource->write_ordering; 1278 + if (wo != WO_bdev_flush) 1279 + wo = min(pwo, wo); 1328 1280 rcu_read_lock(); 1329 - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1330 - struct drbd_device *device = peer_device->device; 1331 - 1332 - if (!get_ldev_if_state(device, D_ATTACHING)) 1333 - continue; 1334 - dc = rcu_dereference(device->ldev->disk_conf); 1335 - 1336 - if (wo == WO_bdev_flush && 
!dc->disk_flushes) 1337 - wo = WO_drain_io; 1338 - if (wo == WO_drain_io && !dc->disk_drain) 1339 - wo = WO_none; 1340 - put_ldev(device); 1281 + idr_for_each_entry(&resource->devices, device, vnr) { 1282 + if (get_ldev(device)) { 1283 + wo = max_allowed_wo(device->ldev, wo); 1284 + if (device->ldev == bdev) 1285 + bdev = NULL; 1286 + put_ldev(device); 1287 + } 1341 1288 } 1289 + 1290 + if (bdev) 1291 + wo = max_allowed_wo(bdev, wo); 1292 + 1342 1293 rcu_read_unlock(); 1343 - connection->write_ordering = wo; 1344 - if (pwo != connection->write_ordering || wo == WO_bdev_flush) 1345 - drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]); 1294 + 1295 + resource->write_ordering = wo; 1296 + if (pwo != resource->write_ordering || wo == WO_bdev_flush) 1297 + drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); 1346 1298 } 1347 1299 1348 1300 /** ··· 1380 1330 /* wait for all pending IO completions, before we start 1381 1331 * zeroing things out. */ 1382 1332 conn_wait_active_ee_empty(first_peer_device(device)->connection); 1333 + /* add it to the active list now, 1334 + * so we can find it to present it in debugfs */ 1335 + peer_req->submit_jif = jiffies; 1336 + peer_req->flags |= EE_SUBMITTED; 1337 + spin_lock_irq(&device->resource->req_lock); 1338 + list_add_tail(&peer_req->w.list, &device->active_ee); 1339 + spin_unlock_irq(&device->resource->req_lock); 1383 1340 if (blkdev_issue_zeroout(device->ldev->backing_bdev, 1384 1341 sector, ds >> 9, GFP_NOIO)) 1385 1342 peer_req->flags |= EE_WAS_ERROR; ··· 1455 1398 D_ASSERT(device, page == NULL); 1456 1399 1457 1400 atomic_set(&peer_req->pending_bios, n_bios); 1401 + /* for debugfs: update timestamp, mark as submitted */ 1402 + peer_req->submit_jif = jiffies; 1403 + peer_req->flags |= EE_SUBMITTED; 1458 1404 do { 1459 1405 bio = bios; 1460 1406 bios = bios->bi_next; ··· 1531 1471 * R_PRIMARY crashes now. 
1532 1472 * Therefore we must send the barrier_ack after the barrier request was 1533 1473 * completed. */ 1534 - switch (connection->write_ordering) { 1474 + switch (connection->resource->write_ordering) { 1535 1475 case WO_none: 1536 1476 if (rv == FE_RECYCLED) 1537 1477 return 0; ··· 1558 1498 1559 1499 return 0; 1560 1500 default: 1561 - drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering); 1501 + drbd_err(connection, "Strangeness in connection->write_ordering %d\n", 1502 + connection->resource->write_ordering); 1562 1503 return -EIO; 1563 1504 } 1564 1505 ··· 1592 1531 struct drbd_peer_request *peer_req; 1593 1532 struct page *page; 1594 1533 int dgs, ds, err; 1595 - int data_size = pi->size; 1534 + unsigned int data_size = pi->size; 1596 1535 void *dig_in = peer_device->connection->int_dig_in; 1597 1536 void *dig_vv = peer_device->connection->int_dig_vv; 1598 1537 unsigned long *data; ··· 1639 1578 if (!peer_req) 1640 1579 return NULL; 1641 1580 1581 + peer_req->flags |= EE_WRITE; 1642 1582 if (trim) 1643 1583 return peer_req; 1644 1584 ··· 1796 1734 * respective _drbd_clear_done_ee */ 1797 1735 1798 1736 peer_req->w.cb = e_end_resync_block; 1737 + peer_req->submit_jif = jiffies; 1799 1738 1800 1739 spin_lock_irq(&device->resource->req_lock); 1801 - list_add(&peer_req->w.list, &device->sync_ee); 1740 + list_add_tail(&peer_req->w.list, &device->sync_ee); 1802 1741 spin_unlock_irq(&device->resource->req_lock); 1803 1742 1804 1743 atomic_add(pi->size >> 9, &device->rs_sect_ev); ··· 1952 1889 } 1953 1890 dec_unacked(device); 1954 1891 } 1892 + 1955 1893 /* we delete from the conflict detection hash _after_ we sent out the 1956 1894 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. 
*/ 1957 1895 if (peer_req->flags & EE_IN_INTERVAL_TREE) { ··· 2179 2115 drbd_for_each_overlap(i, &device->write_requests, sector, size) { 2180 2116 if (i == &peer_req->i) 2181 2117 continue; 2118 + if (i->completed) 2119 + continue; 2182 2120 2183 2121 if (!i->local) { 2184 2122 /* ··· 2213 2147 (unsigned long long)sector, size, 2214 2148 superseded ? "local" : "remote"); 2215 2149 2216 - inc_unacked(device); 2217 2150 peer_req->w.cb = superseded ? e_send_superseded : 2218 2151 e_send_retry_write; 2219 2152 list_add_tail(&peer_req->w.list, &device->done_ee); ··· 2271 2206 { 2272 2207 struct drbd_peer_device *peer_device; 2273 2208 struct drbd_device *device; 2209 + struct net_conf *nc; 2274 2210 sector_t sector; 2275 2211 struct drbd_peer_request *peer_req; 2276 2212 struct p_data *p = pi->data; ··· 2311 2245 } 2312 2246 2313 2247 peer_req->w.cb = e_end_block; 2248 + peer_req->submit_jif = jiffies; 2249 + peer_req->flags |= EE_APPLICATION; 2314 2250 2315 2251 dp_flags = be32_to_cpu(p->dp_flags); 2316 2252 rw |= wire_flags_to_bio(dp_flags); ··· 2339 2271 spin_unlock(&connection->epoch_lock); 2340 2272 2341 2273 rcu_read_lock(); 2342 - tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; 2274 + nc = rcu_dereference(peer_device->connection->net_conf); 2275 + tp = nc->two_primaries; 2276 + if (peer_device->connection->agreed_pro_version < 100) { 2277 + switch (nc->wire_protocol) { 2278 + case DRBD_PROT_C: 2279 + dp_flags |= DP_SEND_WRITE_ACK; 2280 + break; 2281 + case DRBD_PROT_B: 2282 + dp_flags |= DP_SEND_RECEIVE_ACK; 2283 + break; 2284 + } 2285 + } 2343 2286 rcu_read_unlock(); 2287 + 2288 + if (dp_flags & DP_SEND_WRITE_ACK) { 2289 + peer_req->flags |= EE_SEND_WRITE_ACK; 2290 + inc_unacked(device); 2291 + /* corresponding dec_unacked() in e_end_block() 2292 + * respective _drbd_clear_done_ee */ 2293 + } 2294 + 2295 + if (dp_flags & DP_SEND_RECEIVE_ACK) { 2296 + /* I really don't like it that the receiver thread 2297 + * sends on the msock, but 
anyways */ 2298 + drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); 2299 + } 2300 + 2344 2301 if (tp) { 2302 + /* two primaries implies protocol C */ 2303 + D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK); 2345 2304 peer_req->flags |= EE_IN_INTERVAL_TREE; 2346 2305 err = wait_for_and_update_peer_seq(peer_device, peer_seq); 2347 2306 if (err) ··· 2392 2297 * active_ee to become empty in drbd_submit_peer_request(); 2393 2298 * better not add ourselves here. */ 2394 2299 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) 2395 - list_add(&peer_req->w.list, &device->active_ee); 2300 + list_add_tail(&peer_req->w.list, &device->active_ee); 2396 2301 spin_unlock_irq(&device->resource->req_lock); 2397 2302 2398 2303 if (device->state.conn == C_SYNC_TARGET) 2399 2304 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); 2400 2305 2401 - if (peer_device->connection->agreed_pro_version < 100) { 2402 - rcu_read_lock(); 2403 - switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) { 2404 - case DRBD_PROT_C: 2405 - dp_flags |= DP_SEND_WRITE_ACK; 2406 - break; 2407 - case DRBD_PROT_B: 2408 - dp_flags |= DP_SEND_RECEIVE_ACK; 2409 - break; 2410 - } 2411 - rcu_read_unlock(); 2412 - } 2413 - 2414 - if (dp_flags & DP_SEND_WRITE_ACK) { 2415 - peer_req->flags |= EE_SEND_WRITE_ACK; 2416 - inc_unacked(device); 2417 - /* corresponding dec_unacked() in e_end_block() 2418 - * respective _drbd_clear_done_ee */ 2419 - } 2420 - 2421 - if (dp_flags & DP_SEND_RECEIVE_ACK) { 2422 - /* I really don't like it that the receiver thread 2423 - * sends on the msock, but anyways */ 2424 - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); 2425 - } 2426 - 2427 2306 if (device->state.pdsk < D_INCONSISTENT) { 2428 2307 /* In case we have the only disk of the cluster, */ 2429 2308 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); 2430 - peer_req->flags |= EE_CALL_AL_COMPLETE_IO; 2431 2309 peer_req->flags &= 
~EE_MAY_SET_IN_SYNC; 2432 - drbd_al_begin_io(device, &peer_req->i, true); 2310 + drbd_al_begin_io(device, &peer_req->i); 2311 + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; 2433 2312 } 2434 2313 2435 2314 err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); ··· 2416 2347 list_del(&peer_req->w.list); 2417 2348 drbd_remove_epoch_entry_interval(device, peer_req); 2418 2349 spin_unlock_irq(&device->resource->req_lock); 2419 - if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) 2350 + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { 2351 + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 2420 2352 drbd_al_complete_io(device, &peer_req->i); 2353 + } 2421 2354 2422 2355 out_interrupted: 2423 2356 drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); ··· 2439 2368 * The current sync rate used here uses only the most recent two step marks, 2440 2369 * to have a short time average so we can react faster. 2441 2370 */ 2442 - bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) 2371 + bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, 2372 + bool throttle_if_app_is_waiting) 2443 2373 { 2444 2374 struct lc_element *tmp; 2445 - bool throttle = true; 2375 + bool throttle = drbd_rs_c_min_rate_throttle(device); 2446 2376 2447 - if (!drbd_rs_c_min_rate_throttle(device)) 2448 - return false; 2377 + if (!throttle || throttle_if_app_is_waiting) 2378 + return throttle; 2449 2379 2450 2380 spin_lock_irq(&device->al_lock); 2451 2381 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); ··· 2454 2382 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2455 2383 if (test_bit(BME_PRIORITY, &bm_ext->flags)) 2456 2384 throttle = false; 2457 - /* Do not slow down if app IO is already waiting for this extent */ 2385 + /* Do not slow down if app IO is already waiting for this extent, 2386 + * and our progress is necessary for application IO to complete. 
*/ 2458 2387 } 2459 2388 spin_unlock_irq(&device->al_lock); 2460 2389 ··· 2480 2407 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2481 2408 (int)part_stat_read(&disk->part0, sectors[1]) - 2482 2409 atomic_read(&device->rs_sect_ev); 2483 - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { 2410 + 2411 + if (atomic_read(&device->ap_actlog_cnt) 2412 + || !device->rs_last_events || curr_events - device->rs_last_events > 64) { 2484 2413 unsigned long rs_left; 2485 2414 int i; 2486 2415 ··· 2583 2508 peer_req->w.cb = w_e_end_data_req; 2584 2509 fault_type = DRBD_FAULT_DT_RD; 2585 2510 /* application IO, don't drbd_rs_begin_io */ 2511 + peer_req->flags |= EE_APPLICATION; 2586 2512 goto submit; 2587 2513 2588 2514 case P_RS_DATA_REQUEST: ··· 2614 2538 peer_req->w.cb = w_e_end_csum_rs_req; 2615 2539 /* used in the sector offset progress display */ 2616 2540 device->bm_resync_fo = BM_SECT_TO_BIT(sector); 2541 + /* remember to report stats in drbd_resync_finished */ 2542 + device->use_csums = true; 2617 2543 } else if (pi->cmd == P_OV_REPLY) { 2618 2544 /* track progress, we may need to throttle */ 2619 2545 atomic_add(size >> 9, &device->rs_sect_in); ··· 2673 2595 * we would also throttle its application reads. 2674 2596 * In that case, throttling is done on the SyncTarget only. 2675 2597 */ 2676 - if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector)) 2598 + 2599 + /* Even though this may be a resync request, we do add to "read_ee"; 2600 + * "sync_ee" is only used for resync WRITEs. 2601 + * Add to list early, so debugfs can find this request 2602 + * even if we have to sleep below. 
*/ 2603 + spin_lock_irq(&device->resource->req_lock); 2604 + list_add_tail(&peer_req->w.list, &device->read_ee); 2605 + spin_unlock_irq(&device->resource->req_lock); 2606 + 2607 + update_receiver_timing_details(connection, drbd_rs_should_slow_down); 2608 + if (device->state.peer != R_PRIMARY 2609 + && drbd_rs_should_slow_down(device, sector, false)) 2677 2610 schedule_timeout_uninterruptible(HZ/10); 2611 + update_receiver_timing_details(connection, drbd_rs_begin_io); 2678 2612 if (drbd_rs_begin_io(device, sector)) 2679 2613 goto out_free_e; 2680 2614 ··· 2694 2604 atomic_add(size >> 9, &device->rs_sect_ev); 2695 2605 2696 2606 submit: 2607 + update_receiver_timing_details(connection, drbd_submit_peer_request); 2697 2608 inc_unacked(device); 2698 - spin_lock_irq(&device->resource->req_lock); 2699 - list_add_tail(&peer_req->w.list, &device->read_ee); 2700 - spin_unlock_irq(&device->resource->req_lock); 2701 - 2702 2609 if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) 2703 2610 return 0; 2704 2611 2705 2612 /* don't care for the reason here */ 2706 2613 drbd_err(device, "submit failed, triggering re-connect\n"); 2614 + 2615 + out_free_e: 2707 2616 spin_lock_irq(&device->resource->req_lock); 2708 2617 list_del(&peer_req->w.list); 2709 2618 spin_unlock_irq(&device->resource->req_lock); 2710 2619 /* no drbd_rs_complete_io(), we are dropping the connection anyways */ 2711 2620 2712 - out_free_e: 2713 2621 put_ldev(device); 2714 2622 drbd_free_peer_req(device, peer_req); 2715 2623 return -EIO; ··· 2930 2842 -1091 requires proto 91 2931 2843 -1096 requires proto 96 2932 2844 */ 2933 - static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local) 2845 + static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) 2934 2846 { 2847 + struct drbd_peer_device *const peer_device = first_peer_device(device); 2848 + struct drbd_connection *const connection = peer_device ? 
peer_device->connection : NULL; 2935 2849 u64 self, peer; 2936 2850 int i, j; 2937 2851 ··· 2959 2869 2960 2870 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { 2961 2871 2962 - if (first_peer_device(device)->connection->agreed_pro_version < 91) 2872 + if (connection->agreed_pro_version < 91) 2963 2873 return -1091; 2964 2874 2965 2875 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && ··· 2982 2892 2983 2893 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { 2984 2894 2985 - if (first_peer_device(device)->connection->agreed_pro_version < 91) 2895 + if (connection->agreed_pro_version < 91) 2986 2896 return -1091; 2987 2897 2988 2898 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && ··· 3015 2925 case 1: /* self_pri && !peer_pri */ return 1; 3016 2926 case 2: /* !self_pri && peer_pri */ return -1; 3017 2927 case 3: /* self_pri && peer_pri */ 3018 - dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); 2928 + dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); 3019 2929 return dc ? -1 : 1; 3020 2930 } 3021 2931 } ··· 3028 2938 *rule_nr = 51; 3029 2939 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); 3030 2940 if (self == peer) { 3031 - if (first_peer_device(device)->connection->agreed_pro_version < 96 ? 2941 + if (connection->agreed_pro_version < 96 ? 3032 2942 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == 3033 2943 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : 3034 2944 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { 3035 2945 /* The last P_SYNC_UUID did not get though. Undo the last start of 3036 2946 resync as sync source modifications of the peer's UUIDs. 
*/ 3037 2947 3038 - if (first_peer_device(device)->connection->agreed_pro_version < 91) 2948 + if (connection->agreed_pro_version < 91) 3039 2949 return -1091; 3040 2950 3041 2951 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; ··· 3065 2975 *rule_nr = 71; 3066 2976 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); 3067 2977 if (self == peer) { 3068 - if (first_peer_device(device)->connection->agreed_pro_version < 96 ? 2978 + if (connection->agreed_pro_version < 96 ? 3069 2979 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == 3070 2980 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : 3071 2981 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { 3072 2982 /* The last P_SYNC_UUID did not get though. Undo the last start of 3073 2983 resync as sync source modifications of our UUIDs. */ 3074 2984 3075 - if (first_peer_device(device)->connection->agreed_pro_version < 91) 2985 + if (connection->agreed_pro_version < 91) 3076 2986 return -1091; 3077 2987 3078 2988 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); ··· 3442 3352 * return: NULL (alg name was "") 3443 3353 * ERR_PTR(error) if something goes wrong 3444 3354 * or the crypto hash ptr, if it worked out ok. 
*/ 3445 - static 3446 - struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, 3355 + static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, 3447 3356 const char *alg, const char *name) 3448 3357 { 3449 3358 struct crypto_hash *tfm; ··· 3728 3639 struct drbd_device *device; 3729 3640 struct p_sizes *p = pi->data; 3730 3641 enum determine_dev_size dd = DS_UNCHANGED; 3731 - sector_t p_size, p_usize, my_usize; 3642 + sector_t p_size, p_usize, p_csize, my_usize; 3732 3643 int ldsc = 0; /* local disk size changed */ 3733 3644 enum dds_flags ddsf; 3734 3645 ··· 3739 3650 3740 3651 p_size = be64_to_cpu(p->d_size); 3741 3652 p_usize = be64_to_cpu(p->u_size); 3653 + p_csize = be64_to_cpu(p->c_size); 3742 3654 3743 3655 /* just store the peer's disk size for now. 3744 3656 * we still need to figure out whether we accept that. */ ··· 3800 3710 } 3801 3711 3802 3712 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); 3803 - drbd_reconsider_max_bio_size(device); 3804 3713 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). 3805 3714 In case we cleared the QUEUE_FLAG_DISCARD from our queue in 3806 3715 drbd_reconsider_max_bio_size(), we can be sure that after ··· 3807 3718 3808 3719 ddsf = be16_to_cpu(p->dds_flags); 3809 3720 if (get_ldev(device)) { 3721 + drbd_reconsider_max_bio_size(device, device->ldev); 3810 3722 dd = drbd_determine_dev_size(device, ddsf, NULL); 3811 3723 put_ldev(device); 3812 3724 if (dd == DS_ERROR) 3813 3725 return -EIO; 3814 3726 drbd_md_sync(device); 3815 3727 } else { 3816 - /* I am diskless, need to accept the peer's size. */ 3817 - drbd_set_my_capacity(device, p_size); 3728 + /* 3729 + * I am diskless, need to accept the peer's *current* size. 3730 + * I must NOT accept the peers backing disk size, 3731 + * it may have been larger than mine all along... 
3732 + * 3733 + * At this point, the peer knows more about my disk, or at 3734 + * least about what we last agreed upon, than myself. 3735 + * So if his c_size is less than his d_size, the most likely 3736 + * reason is that *my* d_size was smaller last time we checked. 3737 + * 3738 + * However, if he sends a zero current size, 3739 + * take his (user-capped or) backing disk size anyways. 3740 + */ 3741 + drbd_reconsider_max_bio_size(device, NULL); 3742 + drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); 3818 3743 } 3819 3744 3820 3745 if (get_ldev(device)) { ··· 4604 4501 struct data_cmd *cmd; 4605 4502 4606 4503 drbd_thread_current_set_cpu(&connection->receiver); 4504 + update_receiver_timing_details(connection, drbd_recv_header); 4607 4505 if (drbd_recv_header(connection, &pi)) 4608 4506 goto err_out; 4609 4507 ··· 4623 4519 } 4624 4520 4625 4521 if (shs) { 4522 + update_receiver_timing_details(connection, drbd_recv_all_warn); 4626 4523 err = drbd_recv_all_warn(connection, pi.data, shs); 4627 4524 if (err) 4628 4525 goto err_out; 4629 4526 pi.size -= shs; 4630 4527 } 4631 4528 4529 + update_receiver_timing_details(connection, cmd->fn); 4632 4530 err = cmd->fn(connection, &pi); 4633 4531 if (err) { 4634 4532 drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+361 -164
drivers/block/drbd/drbd_req.c
··· 52 52 static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) 53 53 { 54 54 int rw = bio_data_dir(req->master_bio); 55 - unsigned long duration = jiffies - req->start_time; 55 + unsigned long duration = jiffies - req->start_jif; 56 56 int cpu; 57 57 cpu = part_stat_lock(); 58 58 part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); ··· 66 66 { 67 67 struct drbd_request *req; 68 68 69 - req = mempool_alloc(drbd_request_mempool, GFP_NOIO); 69 + req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO); 70 70 if (!req) 71 71 return NULL; 72 72 ··· 84 84 85 85 INIT_LIST_HEAD(&req->tl_requests); 86 86 INIT_LIST_HEAD(&req->w.list); 87 + INIT_LIST_HEAD(&req->req_pending_master_completion); 88 + INIT_LIST_HEAD(&req->req_pending_local); 87 89 88 90 /* one reference to be put by __drbd_make_request */ 89 91 atomic_set(&req->completion_ref, 1); 90 92 /* one kref as long as completion_ref > 0 */ 91 93 kref_init(&req->kref); 92 94 return req; 95 + } 96 + 97 + static void drbd_remove_request_interval(struct rb_root *root, 98 + struct drbd_request *req) 99 + { 100 + struct drbd_device *device = req->device; 101 + struct drbd_interval *i = &req->i; 102 + 103 + drbd_remove_interval(root, i); 104 + 105 + /* Wake up any processes waiting for this request to complete. */ 106 + if (i->waiting) 107 + wake_up(&device->misc_wait); 93 108 } 94 109 95 110 void drbd_req_destroy(struct kref *kref) ··· 122 107 return; 123 108 } 124 109 125 - /* remove it from the transfer log. 126 - * well, only if it had been there in the first 127 - * place... 
if it had not (local only or conflicting 128 - * and never sent), it should still be "empty" as 129 - * initialized in drbd_req_new(), so we can list_del() it 130 - * here unconditionally */ 110 + /* If called from mod_rq_state (expected normal case) or 111 + * drbd_send_and_submit (the less likely normal path), this holds the 112 + * req_lock, and req->tl_requests will typicaly be on ->transfer_log, 113 + * though it may be still empty (never added to the transfer log). 114 + * 115 + * If called from do_retry(), we do NOT hold the req_lock, but we are 116 + * still allowed to unconditionally list_del(&req->tl_requests), 117 + * because it will be on a local on-stack list only. */ 131 118 list_del_init(&req->tl_requests); 119 + 120 + /* finally remove the request from the conflict detection 121 + * respective block_id verification interval tree. */ 122 + if (!drbd_interval_empty(&req->i)) { 123 + struct rb_root *root; 124 + 125 + if (s & RQ_WRITE) 126 + root = &device->write_requests; 127 + else 128 + root = &device->read_requests; 129 + drbd_remove_request_interval(root, req); 130 + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) 131 + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", 132 + s, (unsigned long long)req->i.sector, req->i.size); 132 133 133 134 /* if it was a write, we may have to set the corresponding 134 135 * bit(s) out-of-sync first. If it had a local part, we need to ··· 219 188 } 220 189 221 190 222 - static void drbd_remove_request_interval(struct rb_root *root, 223 - struct drbd_request *req) 224 - { 225 - struct drbd_device *device = req->device; 226 - struct drbd_interval *i = &req->i; 227 - 228 - drbd_remove_interval(root, i); 229 - 230 - /* Wake up any processes waiting for this request to complete. */ 231 - if (i->waiting) 232 - wake_up(&device->misc_wait); 233 - } 234 - 235 191 /* Helper for __req_mod(). 
236 192 * Set m->bio to the master bio, if it is fit to be completed, 237 193 * or leave it alone (it is initialized to NULL in __req_mod), ··· 272 254 ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); 273 255 error = PTR_ERR(req->private_bio); 274 256 275 - /* remove the request from the conflict detection 276 - * respective block_id verification hash */ 277 - if (!drbd_interval_empty(&req->i)) { 278 - struct rb_root *root; 279 - 280 - if (rw == WRITE) 281 - root = &device->write_requests; 282 - else 283 - root = &device->read_requests; 284 - drbd_remove_request_interval(root, req); 285 - } 286 - 287 257 /* Before we can signal completion to the upper layers, 288 258 * we may need to close the current transfer log epoch. 289 259 * We are within the request lock, so we can simply compare ··· 307 301 m->error = ok ? 0 : (error ?: -EIO); 308 302 m->bio = req->master_bio; 309 303 req->master_bio = NULL; 304 + /* We leave it in the tree, to be able to verify later 305 + * write-acks in protocol != C during resync. 306 + * But we mark it as "complete", so it won't be counted as 307 + * conflict in a multi-primary setup. */ 308 + req->i.completed = true; 310 309 } 310 + 311 + if (req->i.waiting) 312 + wake_up(&device->misc_wait); 313 + 314 + /* Either we are about to complete to upper layers, 315 + * or we will restart this request. 316 + * In either case, the request object will be destroyed soon, 317 + * so better remove it from all lists. */ 318 + list_del_init(&req->req_pending_master_completion); 311 319 } 312 320 321 + /* still holds resource->req_lock */ 313 322 static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) 314 323 { 315 324 struct drbd_device *device = req->device; ··· 345 324 return 1; 346 325 } 347 326 327 + static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) 328 + { 329 + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; 330 + if (!connection) 331 + return; 332 + if (connection->req_next == NULL) 333 + connection->req_next = req; 334 + } 335 + 336 + static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) 337 + { 338 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 339 + if (!connection) 340 + return; 341 + if (connection->req_next != req) 342 + return; 343 + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 344 + const unsigned s = req->rq_state; 345 + if (s & RQ_NET_QUEUED) 346 + break; 347 + } 348 + if (&req->tl_requests == &connection->transfer_log) 349 + req = NULL; 350 + connection->req_next = req; 351 + } 352 + 353 + static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) 354 + { 355 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 356 + if (!connection) 357 + return; 358 + if (connection->req_ack_pending == NULL) 359 + connection->req_ack_pending = req; 360 + } 361 + 362 + static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) 363 + { 364 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 365 + if (!connection) 366 + return; 367 + if (connection->req_ack_pending != req) 368 + return; 369 + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 370 + const unsigned s = req->rq_state; 371 + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) 372 + break; 373 + } 374 + if (&req->tl_requests == &connection->transfer_log) 375 + req = NULL; 376 + connection->req_ack_pending = req; 377 + } 378 + 379 + static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) 380 + { 381 + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; 382 + if (!connection) 383 + return; 384 + if (connection->req_not_net_done == NULL) 385 + connection->req_not_net_done = req; 386 + } 387 + 388 + static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) 389 + { 390 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 391 + if (!connection) 392 + return; 393 + if (connection->req_not_net_done != req) 394 + return; 395 + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 396 + const unsigned s = req->rq_state; 397 + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) 398 + break; 399 + } 400 + if (&req->tl_requests == &connection->transfer_log) 401 + req = NULL; 402 + connection->req_not_net_done = req; 403 + } 404 + 348 405 /* I'd like this to be the only place that manipulates 349 406 * req->completion_ref and req->kref. */ 350 407 static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, 351 408 int clear, int set) 352 409 { 353 410 struct drbd_device *device = req->device; 411 + struct drbd_peer_device *peer_device = first_peer_device(device); 354 412 unsigned s = req->rq_state; 355 413 int c_put = 0; 356 414 int k_put = 0; ··· 456 356 atomic_inc(&req->completion_ref); 457 357 } 458 358 459 - if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) 359 + if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { 460 360 atomic_inc(&req->completion_ref); 361 + set_if_null_req_next(peer_device, req); 362 + } 461 363 462 364 if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) 463 365 kref_get(&req->kref); /* wait for the DONE */ 464 366 465 - if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) 466 - atomic_add(req->i.size >> 9, &device->ap_in_flight); 367 + if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { 368 + /* potentially already completed in the asender thread */ 369 + if (!(s & RQ_NET_DONE)) { 370 + atomic_add(req->i.size >> 9, &device->ap_in_flight); 371 + 
set_if_null_req_not_net_done(peer_device, req); 372 + } 373 + if (s & RQ_NET_PENDING) 374 + set_if_null_req_ack_pending(peer_device, req); 375 + } 467 376 468 377 if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) 469 378 atomic_inc(&req->completion_ref); ··· 495 386 ++k_put; 496 387 else 497 388 ++c_put; 389 + list_del_init(&req->req_pending_local); 498 390 } 499 391 500 392 if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { 501 393 dec_ap_pending(device); 502 394 ++c_put; 395 + req->acked_jif = jiffies; 396 + advance_conn_req_ack_pending(peer_device, req); 503 397 } 504 398 505 - if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) 399 + if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { 506 400 ++c_put; 401 + advance_conn_req_next(peer_device, req); 402 + } 507 403 508 - if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { 509 - if (req->rq_state & RQ_NET_SENT) 404 + if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { 405 + if (s & RQ_NET_SENT) 510 406 atomic_sub(req->i.size >> 9, &device->ap_in_flight); 511 - ++k_put; 407 + if (s & RQ_EXP_BARR_ACK) 408 + ++k_put; 409 + req->net_done_jif = jiffies; 410 + 411 + /* in ahead/behind mode, or just in case, 412 + * before we finally destroy this request, 413 + * the caching pointers must not reference it anymore */ 414 + advance_conn_req_next(peer_device, req); 415 + advance_conn_req_ack_pending(peer_device, req); 416 + advance_conn_req_not_net_done(peer_device, req); 512 417 } 513 418 514 419 /* potentially complete and destroy */ ··· 562 439 bdevname(device->ldev->backing_bdev, b)); 563 440 } 564 441 442 + /* Helper for HANDED_OVER_TO_NETWORK. 443 + * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? 444 + * Is it also still "PENDING"? 445 + * --> If so, clear PENDING and set NET_OK below. 
446 + * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster 447 + * (and we must not set RQ_NET_OK) */ 448 + static inline bool is_pending_write_protocol_A(struct drbd_request *req) 449 + { 450 + return (req->rq_state & 451 + (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) 452 + == (RQ_WRITE|RQ_NET_PENDING); 453 + } 454 + 565 455 /* obviously this could be coded as many single functions 566 456 * instead of one huge switch, 567 457 * or by putting the code directly in the respective locations ··· 590 454 int __req_mod(struct drbd_request *req, enum drbd_req_event what, 591 455 struct bio_and_error *m) 592 456 { 593 - struct drbd_device *device = req->device; 457 + struct drbd_device *const device = req->device; 458 + struct drbd_peer_device *const peer_device = first_peer_device(device); 459 + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 594 460 struct net_conf *nc; 595 461 int p, rv = 0; 596 462 ··· 615 477 * and from w_read_retry_remote */ 616 478 D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); 617 479 rcu_read_lock(); 618 - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 480 + nc = rcu_dereference(connection->net_conf); 619 481 p = nc->wire_protocol; 620 482 rcu_read_unlock(); 621 483 req->rq_state |= ··· 687 549 D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); 688 550 mod_rq_state(req, m, 0, RQ_NET_QUEUED); 689 551 req->w.cb = w_send_read_req; 690 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 552 + drbd_queue_work(&connection->sender_work, 691 553 &req->w); 692 554 break; 693 555 ··· 723 585 D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 724 586 mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); 725 587 req->w.cb = w_send_dblock; 726 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 588 + drbd_queue_work(&connection->sender_work, 727 589 &req->w); 728 590 729 591 /* close the epoch, in case it 
outgrew the limit */ 730 592 rcu_read_lock(); 731 - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 593 + nc = rcu_dereference(connection->net_conf); 732 594 p = nc->max_epoch_size; 733 595 rcu_read_unlock(); 734 - if (first_peer_device(device)->connection->current_tle_writes >= p) 735 - start_new_tl_epoch(first_peer_device(device)->connection); 596 + if (connection->current_tle_writes >= p) 597 + start_new_tl_epoch(connection); 736 598 737 599 break; 738 600 739 601 case QUEUE_FOR_SEND_OOS: 740 602 mod_rq_state(req, m, 0, RQ_NET_QUEUED); 741 603 req->w.cb = w_send_out_of_sync; 742 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 604 + drbd_queue_work(&connection->sender_work, 743 605 &req->w); 744 606 break; 745 607 ··· 753 615 754 616 case HANDED_OVER_TO_NETWORK: 755 617 /* assert something? */ 756 - if (bio_data_dir(req->master_bio) == WRITE && 757 - !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { 618 + if (is_pending_write_protocol_A(req)) 758 619 /* this is what is dangerous about protocol A: 759 620 * pretend it was successfully written on the peer. */ 760 - if (req->rq_state & RQ_NET_PENDING) 761 - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 762 - /* else: neg-ack was faster... */ 763 - /* it is still not yet RQ_NET_DONE until the 764 - * corresponding epoch barrier got acked as well, 765 - * so we know what to dirty on connection loss */ 766 - } 767 - mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); 621 + mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, 622 + RQ_NET_SENT|RQ_NET_OK); 623 + else 624 + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); 625 + /* It is still not yet RQ_NET_DONE until the 626 + * corresponding epoch barrier got acked as well, 627 + * so we know what to dirty on connection loss. 
*/ 768 628 break; 769 629 770 630 case OOS_HANDED_TO_NETWORK: ··· 794 658 case WRITE_ACKED_BY_PEER_AND_SIS: 795 659 req->rq_state |= RQ_NET_SIS; 796 660 case WRITE_ACKED_BY_PEER: 797 - D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); 798 - /* protocol C; successfully written on peer. 661 + /* Normal operation protocol C: successfully written on peer. 662 + * During resync, even in protocol != C, 663 + * we requested an explicit write ack anyways. 664 + * Which means we cannot even assert anything here. 799 665 * Nothing more to do here. 800 666 * We want to keep the tl in place for all protocols, to cater 801 667 * for volatile write-back caches on lower level devices. */ 802 - 803 668 goto ack_common; 804 669 case RECV_ACKED_BY_PEER: 805 670 D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); ··· 808 671 * see also notes above in HANDED_OVER_TO_NETWORK about 809 672 * protocol != C */ 810 673 ack_common: 811 - D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 812 674 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 813 675 break; 814 676 ··· 850 714 851 715 get_ldev(device); /* always succeeds in this call path */ 852 716 req->w.cb = w_restart_disk_io; 853 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 717 + drbd_queue_work(&connection->sender_work, 854 718 &req->w); 855 719 break; 856 720 ··· 872 736 873 737 mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); 874 738 if (req->w.cb) { 875 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 739 + /* w.cb expected to be w_send_dblock, or w_send_read_req */ 740 + drbd_queue_work(&connection->sender_work, 876 741 &req->w); 877 742 rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; 878 743 } /* else: FIXME can this happen? 
*/ ··· 906 769 break; 907 770 908 771 case QUEUE_AS_DRBD_BARRIER: 909 - start_new_tl_epoch(first_peer_device(device)->connection); 772 + start_new_tl_epoch(connection); 910 773 mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); 911 774 break; 912 775 }; ··· 1022 885 if (on_congestion == OC_BLOCK || 1023 886 connection->agreed_pro_version < 96) 1024 887 return; 888 + 889 + if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) 890 + return; /* nothing to do ... */ 1025 891 1026 892 /* If I don't even have good local storage, we can not reasonably try 1027 893 * to pull ahead of the peer. We also need the local reference to make ··· 1161 1021 * stable storage, and this is a WRITE, we may not even submit 1162 1022 * this bio. */ 1163 1023 if (get_ldev(device)) { 1024 + req->pre_submit_jif = jiffies; 1164 1025 if (drbd_insert_fault(device, 1165 1026 rw == WRITE ? DRBD_FAULT_DT_WR 1166 1027 : rw == READ ? DRBD_FAULT_DT_RD ··· 1176 1035 1177 1036 static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) 1178 1037 { 1179 - spin_lock(&device->submit.lock); 1038 + spin_lock_irq(&device->resource->req_lock); 1180 1039 list_add_tail(&req->tl_requests, &device->submit.writes); 1181 - spin_unlock(&device->submit.lock); 1040 + list_add_tail(&req->req_pending_master_completion, 1041 + &device->pending_master_completion[1 /* WRITE */]); 1042 + spin_unlock_irq(&device->resource->req_lock); 1182 1043 queue_work(device->submit.wq, &device->submit.worker); 1044 + /* do_submit() may sleep internally on al_wait, too */ 1045 + wake_up(&device->al_wait); 1183 1046 } 1184 1047 1185 1048 /* returns the new drbd_request pointer, if the caller is expected to ··· 1192 1047 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. 
1193 1048 */ 1194 1049 static struct drbd_request * 1195 - drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time) 1050 + drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) 1196 1051 { 1197 1052 const int rw = bio_data_dir(bio); 1198 1053 struct drbd_request *req; ··· 1207 1062 bio_endio(bio, -ENOMEM); 1208 1063 return ERR_PTR(-ENOMEM); 1209 1064 } 1210 - req->start_time = start_time; 1065 + req->start_jif = start_jif; 1211 1066 1212 1067 if (!get_ldev(device)) { 1213 1068 bio_put(req->private_bio); ··· 1220 1075 if (rw == WRITE && req->private_bio && req->i.size 1221 1076 && !test_bit(AL_SUSPENDED, &device->flags)) { 1222 1077 if (!drbd_al_begin_io_fastpath(device, &req->i)) { 1078 + atomic_inc(&device->ap_actlog_cnt); 1223 1079 drbd_queue_write(device, req); 1224 1080 return NULL; 1225 1081 } 1226 1082 req->rq_state |= RQ_IN_ACT_LOG; 1083 + req->in_actlog_jif = jiffies; 1227 1084 } 1228 1085 1229 1086 return req; ··· 1233 1086 1234 1087 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) 1235 1088 { 1089 + struct drbd_resource *resource = device->resource; 1236 1090 const int rw = bio_rw(req->master_bio); 1237 1091 struct bio_and_error m = { NULL, }; 1238 1092 bool no_remote = false; 1093 + bool submit_private_bio = false; 1239 1094 1240 - spin_lock_irq(&device->resource->req_lock); 1095 + spin_lock_irq(&resource->req_lock); 1241 1096 if (rw == WRITE) { 1242 1097 /* This may temporarily give up the req_lock, 1243 1098 * but will re-aquire it before it returns here. ··· 1297 1148 no_remote = true; 1298 1149 } 1299 1150 1151 + /* If it took the fast path in drbd_request_prepare, add it here. 1152 + * The slow path has added it already. 
*/ 1153 + if (list_empty(&req->req_pending_master_completion)) 1154 + list_add_tail(&req->req_pending_master_completion, 1155 + &device->pending_master_completion[rw == WRITE]); 1300 1156 if (req->private_bio) { 1301 1157 /* needs to be marked within the same spinlock */ 1158 + list_add_tail(&req->req_pending_local, 1159 + &device->pending_completion[rw == WRITE]); 1302 1160 _req_mod(req, TO_BE_SUBMITTED); 1303 1161 /* but we need to give up the spinlock to submit */ 1304 - spin_unlock_irq(&device->resource->req_lock); 1305 - drbd_submit_req_private_bio(req); 1306 - spin_lock_irq(&device->resource->req_lock); 1162 + submit_private_bio = true; 1307 1163 } else if (no_remote) { 1308 1164 nodata: 1309 1165 if (__ratelimit(&drbd_ratelimit_state)) ··· 1321 1167 out: 1322 1168 if (drbd_req_put_completion_ref(req, &m, 1)) 1323 1169 kref_put(&req->kref, drbd_req_destroy); 1324 - spin_unlock_irq(&device->resource->req_lock); 1170 + spin_unlock_irq(&resource->req_lock); 1325 1171 1172 + /* Even though above is a kref_put(), this is safe. 1173 + * As long as we still need to submit our private bio, 1174 + * we hold a completion ref, and the request cannot disappear. 1175 + * If however this request did not even have a private bio to submit 1176 + * (e.g. remote read), req may already be invalid now. 1177 + * That's why we cannot check on req->private_bio. 
*/ 1178 + if (submit_private_bio) 1179 + drbd_submit_req_private_bio(req); 1326 1180 if (m.bio) 1327 1181 complete_master_bio(device, &m); 1328 1182 } 1329 1183 1330 - void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time) 1184 + void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) 1331 1185 { 1332 - struct drbd_request *req = drbd_request_prepare(device, bio, start_time); 1186 + struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); 1333 1187 if (IS_ERR_OR_NULL(req)) 1334 1188 return; 1335 1189 drbd_send_and_submit(device, req); ··· 1356 1194 continue; 1357 1195 1358 1196 req->rq_state |= RQ_IN_ACT_LOG; 1197 + req->in_actlog_jif = jiffies; 1198 + atomic_dec(&device->ap_actlog_cnt); 1359 1199 } 1360 1200 1361 1201 list_del_init(&req->tl_requests); ··· 1367 1203 1368 1204 static bool prepare_al_transaction_nonblock(struct drbd_device *device, 1369 1205 struct list_head *incoming, 1370 - struct list_head *pending) 1206 + struct list_head *pending, 1207 + struct list_head *later) 1371 1208 { 1372 1209 struct drbd_request *req, *tmp; 1373 1210 int wake = 0; ··· 1377 1212 spin_lock_irq(&device->al_lock); 1378 1213 list_for_each_entry_safe(req, tmp, incoming, tl_requests) { 1379 1214 err = drbd_al_begin_io_nonblock(device, &req->i); 1215 + if (err == -ENOBUFS) 1216 + break; 1380 1217 if (err == -EBUSY) 1381 1218 wake = 1; 1382 1219 if (err) 1383 - continue; 1384 - req->rq_state |= RQ_IN_ACT_LOG; 1385 - list_move_tail(&req->tl_requests, pending); 1220 + list_move_tail(&req->tl_requests, later); 1221 + else 1222 + list_move_tail(&req->tl_requests, pending); 1386 1223 } 1387 1224 spin_unlock_irq(&device->al_lock); 1388 1225 if (wake) 1389 1226 wake_up(&device->al_wait); 1390 - 1391 1227 return !list_empty(pending); 1228 + } 1229 + 1230 + void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) 1231 + { 1232 + struct drbd_request *req, *tmp; 1233 + 
1234 + list_for_each_entry_safe(req, tmp, pending, tl_requests) { 1235 + req->rq_state |= RQ_IN_ACT_LOG; 1236 + req->in_actlog_jif = jiffies; 1237 + atomic_dec(&device->ap_actlog_cnt); 1238 + list_del_init(&req->tl_requests); 1239 + drbd_send_and_submit(device, req); 1240 + } 1392 1241 } 1393 1242 1394 1243 void do_submit(struct work_struct *ws) 1395 1244 { 1396 1245 struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); 1397 - LIST_HEAD(incoming); 1398 - LIST_HEAD(pending); 1399 - struct drbd_request *req, *tmp; 1246 + LIST_HEAD(incoming); /* from drbd_make_request() */ 1247 + LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ 1248 + LIST_HEAD(busy); /* blocked by resync requests */ 1249 + 1250 + /* grab new incoming requests */ 1251 + spin_lock_irq(&device->resource->req_lock); 1252 + list_splice_tail_init(&device->submit.writes, &incoming); 1253 + spin_unlock_irq(&device->resource->req_lock); 1400 1254 1401 1255 for (;;) { 1402 - spin_lock(&device->submit.lock); 1403 - list_splice_tail_init(&device->submit.writes, &incoming); 1404 - spin_unlock(&device->submit.lock); 1256 + DEFINE_WAIT(wait); 1405 1257 1258 + /* move used-to-be-busy back to front of incoming */ 1259 + list_splice_init(&busy, &incoming); 1406 1260 submit_fast_path(device, &incoming); 1407 1261 if (list_empty(&incoming)) 1408 1262 break; 1409 1263 1410 - skip_fast_path: 1411 - wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); 1412 - /* Maybe more was queued, while we prepared the transaction? 1413 - * Try to stuff them into this transaction as well. 1414 - * Be strictly non-blocking here, no wait_event, we already 1415 - * have something to commit. 1416 - * Stop if we don't make any more progres. 
1417 - */ 1418 1264 for (;;) { 1265 + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); 1266 + 1267 + list_splice_init(&busy, &incoming); 1268 + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); 1269 + if (!list_empty(&pending)) 1270 + break; 1271 + 1272 + schedule(); 1273 + 1274 + /* If all currently "hot" activity log extents are kept busy by 1275 + * incoming requests, we still must not totally starve new 1276 + * requests to "cold" extents. 1277 + * Something left on &incoming means there had not been 1278 + * enough update slots available, and the activity log 1279 + * has been marked as "starving". 1280 + * 1281 + * Try again now, without looking for new requests, 1282 + * effectively blocking all new requests until we made 1283 + * at least _some_ progress with what we currently have. 1284 + */ 1285 + if (!list_empty(&incoming)) 1286 + continue; 1287 + 1288 + /* Nothing moved to pending, but nothing left 1289 + * on incoming: all moved to busy! 1290 + * Grab new and iterate. */ 1291 + spin_lock_irq(&device->resource->req_lock); 1292 + list_splice_tail_init(&device->submit.writes, &incoming); 1293 + spin_unlock_irq(&device->resource->req_lock); 1294 + } 1295 + finish_wait(&device->al_wait, &wait); 1296 + 1297 + /* If the transaction was full, before all incoming requests 1298 + * had been processed, skip ahead to commit, and iterate 1299 + * without splicing in more incoming requests from upper layers. 1300 + * 1301 + * Else, if all incoming have been processed, 1302 + * they have become either "pending" (to be submitted after 1303 + * next transaction commit) or "busy" (blocked by resync). 1304 + * 1305 + * Maybe more was queued, while we prepared the transaction? 1306 + * Try to stuff those into this transaction as well. 1307 + * Be strictly non-blocking here, 1308 + * we already have something to commit. 1309 + * 1310 + * Commit if we don't make any more progres. 
1311 + */ 1312 + 1313 + while (list_empty(&incoming)) { 1419 1314 LIST_HEAD(more_pending); 1420 1315 LIST_HEAD(more_incoming); 1421 1316 bool made_progress; ··· 1485 1260 if (list_empty(&device->submit.writes)) 1486 1261 break; 1487 1262 1488 - spin_lock(&device->submit.lock); 1263 + spin_lock_irq(&device->resource->req_lock); 1489 1264 list_splice_tail_init(&device->submit.writes, &more_incoming); 1490 - spin_unlock(&device->submit.lock); 1265 + spin_unlock_irq(&device->resource->req_lock); 1491 1266 1492 1267 if (list_empty(&more_incoming)) 1493 1268 break; 1494 1269 1495 - made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending); 1270 + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); 1496 1271 1497 1272 list_splice_tail_init(&more_pending, &pending); 1498 1273 list_splice_tail_init(&more_incoming, &incoming); 1499 - 1500 1274 if (!made_progress) 1501 1275 break; 1502 1276 } 1503 - drbd_al_begin_io_commit(device, false); 1504 1277 1505 - list_for_each_entry_safe(req, tmp, &pending, tl_requests) { 1506 - list_del_init(&req->tl_requests); 1507 - drbd_send_and_submit(device, req); 1508 - } 1509 - 1510 - /* If all currently hot activity log extents are kept busy by 1511 - * incoming requests, we still must not totally starve new 1512 - * requests to cold extents. In that case, prepare one request 1513 - * in blocking mode. */ 1514 - list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { 1515 - list_del_init(&req->tl_requests); 1516 - req->rq_state |= RQ_IN_ACT_LOG; 1517 - if (!drbd_al_begin_io_prepare(device, &req->i)) { 1518 - /* Corresponding extent was hot after all? */ 1519 - drbd_send_and_submit(device, req); 1520 - } else { 1521 - /* Found a request to a cold extent. 1522 - * Put on "pending" list, 1523 - * and try to cumulate with more. 
*/ 1524 - list_add(&req->tl_requests, &pending); 1525 - goto skip_fast_path; 1526 - } 1527 - } 1278 + drbd_al_begin_io_commit(device); 1279 + send_and_submit_pending(device, &pending); 1528 1280 } 1529 1281 } 1530 1282 1531 1283 void drbd_make_request(struct request_queue *q, struct bio *bio) 1532 1284 { 1533 1285 struct drbd_device *device = (struct drbd_device *) q->queuedata; 1534 - unsigned long start_time; 1286 + unsigned long start_jif; 1535 1287 1536 - start_time = jiffies; 1288 + start_jif = jiffies; 1537 1289 1538 1290 /* 1539 1291 * what we "blindly" assume: ··· 1518 1316 D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); 1519 1317 1520 1318 inc_ap_bio(device); 1521 - __drbd_make_request(device, bio, start_time); 1319 + __drbd_make_request(device, bio, start_jif); 1522 1320 } 1523 1321 1524 1322 /* This is called by bio_add_page(). ··· 1555 1353 return limit; 1556 1354 } 1557 1355 1558 - static void find_oldest_requests( 1559 - struct drbd_connection *connection, 1560 - struct drbd_device *device, 1561 - struct drbd_request **oldest_req_waiting_for_peer, 1562 - struct drbd_request **oldest_req_waiting_for_disk) 1563 - { 1564 - struct drbd_request *r; 1565 - *oldest_req_waiting_for_peer = NULL; 1566 - *oldest_req_waiting_for_disk = NULL; 1567 - list_for_each_entry(r, &connection->transfer_log, tl_requests) { 1568 - const unsigned s = r->rq_state; 1569 - if (!*oldest_req_waiting_for_peer 1570 - && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) 1571 - *oldest_req_waiting_for_peer = r; 1572 - 1573 - if (!*oldest_req_waiting_for_disk 1574 - && (s & RQ_LOCAL_PENDING) && r->device == device) 1575 - *oldest_req_waiting_for_disk = r; 1576 - 1577 - if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) 1578 - break; 1579 - } 1580 - } 1581 - 1582 1356 void request_timer_fn(unsigned long data) 1583 1357 { 1584 1358 struct drbd_device *device = (struct drbd_device *) data; 1585 1359 struct drbd_connection *connection = 
first_peer_device(device)->connection; 1586 - struct drbd_request *req_disk, *req_peer; /* oldest request */ 1360 + struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ 1587 1361 struct net_conf *nc; 1362 + unsigned long oldest_submit_jif; 1588 1363 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 1589 1364 unsigned long now; 1590 1365 ··· 1582 1403 return; /* Recurring timer stopped */ 1583 1404 1584 1405 now = jiffies; 1406 + nt = now + et; 1585 1407 1586 1408 spin_lock_irq(&device->resource->req_lock); 1587 - find_oldest_requests(connection, device, &req_peer, &req_disk); 1588 - if (req_peer == NULL && req_disk == NULL) { 1589 - spin_unlock_irq(&device->resource->req_lock); 1590 - mod_timer(&device->request_timer, now + et); 1591 - return; 1592 - } 1409 + req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); 1410 + req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); 1411 + req_peer = connection->req_not_net_done; 1412 + /* maybe the oldest request waiting for the peer is in fact still 1413 + * blocking in tcp sendmsg */ 1414 + if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) 1415 + req_peer = connection->req_next; 1416 + 1417 + /* evaluate the oldest peer request only in one timer! */ 1418 + if (req_peer && req_peer->device != device) 1419 + req_peer = NULL; 1420 + 1421 + /* do we have something to evaluate? */ 1422 + if (req_peer == NULL && req_write == NULL && req_read == NULL) 1423 + goto out; 1424 + 1425 + oldest_submit_jif = 1426 + (req_write && req_read) 1427 + ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) 1428 + ? req_write->pre_submit_jif : req_read->pre_submit_jif ) 1429 + : req_write ? req_write->pre_submit_jif 1430 + : req_read ? 
req_read->pre_submit_jif : now; 1593 1431 1594 1432 /* The request is considered timed out, if 1595 1433 * - we have some effective timeout from the configuration, ··· 1625 1429 * to expire twice (worst case) to become effective. Good enough. 1626 1430 */ 1627 1431 if (ent && req_peer && 1628 - time_after(now, req_peer->start_time + ent) && 1432 + time_after(now, req_peer->pre_send_jif + ent) && 1629 1433 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { 1630 1434 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); 1631 1435 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); 1632 1436 } 1633 - if (dt && req_disk && 1634 - time_after(now, req_disk->start_time + dt) && 1437 + if (dt && oldest_submit_jif != now && 1438 + time_after(now, oldest_submit_jif + dt) && 1635 1439 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 1636 1440 drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); 1637 1441 __drbd_chk_io_error(device, DRBD_FORCE_DETACH); ··· 1639 1443 1640 1444 /* Reschedule timer for the nearest not already expired timeout. 1641 1445 * Fallback to now + min(effective network timeout, disk timeout). */ 1642 - ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) 1643 - ? req_peer->start_time + ent : now + et; 1644 - dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) 1645 - ? req_disk->start_time + dt : now + et; 1446 + ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) 1447 + ? req_peer->pre_send_jif + ent : now + et; 1448 + dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) 1449 + ? oldest_submit_jif + dt : now + et; 1646 1450 nt = time_before(ent, dt) ? ent : dt; 1451 + out: 1647 1452 spin_unlock_irq(&connection->resource->req_lock); 1648 1453 mod_timer(&device->request_timer, nt); 1649 1454 }
+1
drivers/block/drbd/drbd_req.h
··· 288 288 extern void request_timer_fn(unsigned long data); 289 289 extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); 290 290 extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); 291 + extern void tl_abort_disk_io(struct drbd_device *device); 291 292 292 293 /* this is in drbd_main.c */ 293 294 extern void drbd_restart_request(struct drbd_request *req);
+49 -41
drivers/block/drbd/drbd_state.c
··· 410 410 return rv; 411 411 } 412 412 413 - static void print_st(struct drbd_device *device, char *name, union drbd_state ns) 413 + static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) 414 414 { 415 415 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", 416 416 name, ··· 952 952 __drbd_set_state(struct drbd_device *device, union drbd_state ns, 953 953 enum chg_state_flags flags, struct completion *done) 954 954 { 955 + struct drbd_peer_device *peer_device = first_peer_device(device); 956 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 955 957 union drbd_state os; 956 958 enum drbd_state_rv rv = SS_SUCCESS; 957 959 enum sanitize_state_warnings ssw; 958 960 struct after_state_chg_work *ascw; 959 - bool did_remote, should_do_remote; 960 961 961 962 os = drbd_read_state(device); 962 963 ··· 979 978 this happen...*/ 980 979 981 980 if (is_valid_state(device, os) == rv) 982 - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); 981 + rv = is_valid_soft_transition(os, ns, connection); 983 982 } else 984 - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); 983 + rv = is_valid_soft_transition(os, ns, connection); 985 984 } 986 985 987 986 if (rv < SS_SUCCESS) { ··· 998 997 sanitize_state(). 
Only display it here if we where not called from 999 998 _conn_request_state() */ 1000 999 if (!(flags & CS_DC_SUSP)) 1001 - conn_pr_state_change(first_peer_device(device)->connection, os, ns, 1000 + conn_pr_state_change(connection, os, ns, 1002 1001 (flags & ~CS_DC_MASK) | CS_DC_SUSP); 1003 1002 1004 1003 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference ··· 1009 1008 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) 1010 1009 atomic_inc(&device->local_cnt); 1011 1010 1012 - did_remote = drbd_should_do_remote(device->state); 1011 + if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) 1012 + clear_bit(RS_DONE, &device->flags); 1013 + 1014 + /* changes to local_cnt and device flags should be visible before 1015 + * changes to state, which again should be visible before anything else 1016 + * depending on that change happens. */ 1017 + smp_wmb(); 1013 1018 device->state.i = ns.i; 1014 - should_do_remote = drbd_should_do_remote(device->state); 1015 1019 device->resource->susp = ns.susp; 1016 1020 device->resource->susp_nod = ns.susp_nod; 1017 1021 device->resource->susp_fen = ns.susp_fen; 1022 + smp_wmb(); 1018 1023 1019 1024 /* put replicated vs not-replicated requests in seperate epochs */ 1020 - if (did_remote != should_do_remote) 1021 - start_new_tl_epoch(first_peer_device(device)->connection); 1025 + if (drbd_should_do_remote((union drbd_dev_state)os.i) != 1026 + drbd_should_do_remote((union drbd_dev_state)ns.i)) 1027 + start_new_tl_epoch(connection); 1022 1028 1023 1029 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) 1024 1030 drbd_print_uuids(device, "attached to UUIDs"); 1025 1031 1026 1032 /* Wake up role changes, that were delayed because of connection establishing */ 1027 1033 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && 1028 - no_peer_wf_report_params(first_peer_device(device)->connection)) 1029 - clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags); 1034 + 
no_peer_wf_report_params(connection)) 1035 + clear_bit(STATE_SENT, &connection->flags); 1030 1036 1031 1037 wake_up(&device->misc_wait); 1032 1038 wake_up(&device->state_wait); 1033 - wake_up(&first_peer_device(device)->connection->ping_wait); 1039 + wake_up(&connection->ping_wait); 1034 1040 1035 1041 /* Aborted verify run, or we reached the stop sector. 1036 1042 * Log the last position, unless end-of-device. */ ··· 1126 1118 1127 1119 /* Receiver should clean up itself */ 1128 1120 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) 1129 - drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); 1121 + drbd_thread_stop_nowait(&connection->receiver); 1130 1122 1131 1123 /* Now the receiver finished cleaning up itself, it should die */ 1132 1124 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) 1133 - drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); 1125 + drbd_thread_stop_nowait(&connection->receiver); 1134 1126 1135 1127 /* Upon network failure, we need to restart the receiver. 
*/ 1136 1128 if (os.conn > C_WF_CONNECTION && 1137 1129 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) 1138 - drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver); 1130 + drbd_thread_restart_nowait(&connection->receiver); 1139 1131 1140 1132 /* Resume AL writing if we get a connection */ 1141 1133 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { 1142 1134 drbd_resume_al(device); 1143 - first_peer_device(device)->connection->connect_cnt++; 1135 + connection->connect_cnt++; 1144 1136 } 1145 1137 1146 1138 /* remember last attach time so request_timer_fn() won't ··· 1158 1150 ascw->w.cb = w_after_state_ch; 1159 1151 ascw->device = device; 1160 1152 ascw->done = done; 1161 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 1153 + drbd_queue_work(&connection->sender_work, 1162 1154 &ascw->w); 1163 1155 } else { 1164 1156 drbd_err(device, "Could not kmalloc an ascw\n"); ··· 1230 1222 union drbd_state ns, enum chg_state_flags flags) 1231 1223 { 1232 1224 struct drbd_resource *resource = device->resource; 1225 + struct drbd_peer_device *peer_device = first_peer_device(device); 1226 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1233 1227 struct sib_info sib; 1234 1228 1235 1229 sib.sib_reason = SIB_STATE_CHANGE; 1236 1230 sib.os = os; 1237 1231 sib.ns = ns; 1238 1232 1239 - if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { 1233 + if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) 1234 + && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { 1240 1235 clear_bit(CRASHED_PRIMARY, &device->flags); 1241 1236 if (device->p_uuid) 1242 1237 device->p_uuid[UI_FLAGS] &= ~((u64)2); ··· 1256 1245 state change. 
This function might sleep */ 1257 1246 1258 1247 if (ns.susp_nod) { 1259 - struct drbd_connection *connection = first_peer_device(device)->connection; 1260 1248 enum drbd_req_event what = NOTHING; 1261 1249 1262 1250 spin_lock_irq(&device->resource->req_lock); ··· 1277 1267 } 1278 1268 1279 1269 if (ns.susp_fen) { 1280 - struct drbd_connection *connection = first_peer_device(device)->connection; 1281 - 1282 1270 spin_lock_irq(&device->resource->req_lock); 1283 1271 if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { 1284 1272 /* case2: The connection was established again: */ ··· 1302 1294 * which is unexpected. */ 1303 1295 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && 1304 1296 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && 1305 - first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) { 1306 - drbd_gen_and_send_sync_uuid(first_peer_device(device)); 1297 + connection->agreed_pro_version >= 96 && get_ldev(device)) { 1298 + drbd_gen_and_send_sync_uuid(peer_device); 1307 1299 put_ldev(device); 1308 1300 } 1309 1301 ··· 1317 1309 atomic_set(&device->rs_pending_cnt, 0); 1318 1310 drbd_rs_cancel_all(device); 1319 1311 1320 - drbd_send_uuids(first_peer_device(device)); 1321 - drbd_send_state(first_peer_device(device), ns); 1312 + drbd_send_uuids(peer_device); 1313 + drbd_send_state(peer_device, ns); 1322 1314 } 1323 1315 /* No point in queuing send_bitmap if we don't have a connection 1324 1316 * anymore, so check also the _current_ state, not only the new state ··· 1343 1335 set_bit(NEW_CUR_UUID, &device->flags); 1344 1336 } else { 1345 1337 drbd_uuid_new_current(device); 1346 - drbd_send_uuids(first_peer_device(device)); 1338 + drbd_send_uuids(peer_device); 1347 1339 } 1348 1340 } 1349 1341 put_ldev(device); ··· 1354 1346 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && 1355 1347 device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1356 1348 
drbd_uuid_new_current(device); 1357 - drbd_send_uuids(first_peer_device(device)); 1349 + drbd_send_uuids(peer_device); 1358 1350 } 1359 1351 /* D_DISKLESS Peer becomes secondary */ 1360 1352 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) ··· 1381 1373 /* Last part of the attaching process ... */ 1382 1374 if (ns.conn >= C_CONNECTED && 1383 1375 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { 1384 - drbd_send_sizes(first_peer_device(device), 0, 0); /* to start sync... */ 1385 - drbd_send_uuids(first_peer_device(device)); 1386 - drbd_send_state(first_peer_device(device), ns); 1376 + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ 1377 + drbd_send_uuids(peer_device); 1378 + drbd_send_state(peer_device, ns); 1387 1379 } 1388 1380 1389 1381 /* We want to pause/continue resync, tell peer. */ 1390 1382 if (ns.conn >= C_CONNECTED && 1391 1383 ((os.aftr_isp != ns.aftr_isp) || 1392 1384 (os.user_isp != ns.user_isp))) 1393 - drbd_send_state(first_peer_device(device), ns); 1385 + drbd_send_state(peer_device, ns); 1394 1386 1395 1387 /* In case one of the isp bits got set, suspend other devices. */ 1396 1388 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && ··· 1400 1392 /* Make sure the peer gets informed about eventual state 1401 1393 changes (ISP bits) while we were in WFReportParams. */ 1402 1394 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) 1403 - drbd_send_state(first_peer_device(device), ns); 1395 + drbd_send_state(peer_device, ns); 1404 1396 1405 1397 if (os.conn != C_AHEAD && ns.conn == C_AHEAD) 1406 - drbd_send_state(first_peer_device(device), ns); 1398 + drbd_send_state(peer_device, ns); 1407 1399 1408 1400 /* We are in the progress to start a full sync... 
*/ 1409 1401 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || ··· 1457 1449 drbd_disk_str(device->state.disk)); 1458 1450 1459 1451 if (ns.conn >= C_CONNECTED) 1460 - drbd_send_state(first_peer_device(device), ns); 1452 + drbd_send_state(peer_device, ns); 1461 1453 1462 1454 drbd_rs_cancel_all(device); 1463 1455 ··· 1481 1473 drbd_disk_str(device->state.disk)); 1482 1474 1483 1475 if (ns.conn >= C_CONNECTED) 1484 - drbd_send_state(first_peer_device(device), ns); 1476 + drbd_send_state(peer_device, ns); 1485 1477 /* corresponding get_ldev in __drbd_set_state 1486 1478 * this may finally trigger drbd_ldev_destroy. */ 1487 1479 put_ldev(device); ··· 1489 1481 1490 1482 /* Notify peer that I had a local IO error, and did not detached.. */ 1491 1483 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) 1492 - drbd_send_state(first_peer_device(device), ns); 1484 + drbd_send_state(peer_device, ns); 1493 1485 1494 1486 /* Disks got bigger while they were detached */ 1495 1487 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && ··· 1507 1499 /* sync target done with resync. Explicitly notify peer, even though 1508 1500 * it should (at least for non-empty resyncs) already know itself. */ 1509 1501 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) 1510 - drbd_send_state(first_peer_device(device), ns); 1502 + drbd_send_state(peer_device, ns); 1511 1503 1512 1504 /* Verify finished, or reached stop sector. Peer did not know about 1513 1505 * the stop sector, and we may even have changed the stop sector during 1514 1506 * verify to interrupt/stop early. Send the new state. 
*/ 1515 1507 if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED 1516 1508 && verify_can_do_stop_sector(device)) 1517 - drbd_send_state(first_peer_device(device), ns); 1509 + drbd_send_state(peer_device, ns); 1518 1510 1519 1511 /* This triggers bitmap writeout of potentially still unwritten pages 1520 1512 * if the resync finished cleanly, or aborted because of peer disk ··· 1571 1563 old_conf = connection->net_conf; 1572 1564 connection->my_addr_len = 0; 1573 1565 connection->peer_addr_len = 0; 1574 - rcu_assign_pointer(connection->net_conf, NULL); 1566 + RCU_INIT_POINTER(connection->net_conf, NULL); 1575 1567 conn_free_crypto(connection); 1576 1568 mutex_unlock(&connection->resource->conf_update); 1577 1569 ··· 1607 1599 return 0; 1608 1600 } 1609 1601 1610 - void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) 1602 + static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) 1611 1603 { 1612 1604 enum chg_state_flags flags = ~0; 1613 1605 struct drbd_peer_device *peer_device; ··· 1696 1688 return rv; 1697 1689 } 1698 1690 1699 - void 1691 + static void 1700 1692 conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, 1701 1693 union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) 1702 1694 {
+276 -72
drivers/block/drbd/drbd_worker.c
··· 67 67 */ 68 68 void drbd_md_io_complete(struct bio *bio, int error) 69 69 { 70 - struct drbd_md_io *md_io; 71 70 struct drbd_device *device; 72 71 73 - md_io = (struct drbd_md_io *)bio->bi_private; 74 - device = container_of(md_io, struct drbd_device, md_io); 75 - 76 - md_io->error = error; 72 + device = bio->bi_private; 73 + device->md_io.error = error; 77 74 78 75 /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able 79 76 * to timeout on the lower level device, and eventually detach from it. ··· 84 87 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. 85 88 */ 86 89 drbd_md_put_buffer(device); 87 - md_io->done = 1; 90 + device->md_io.done = 1; 88 91 wake_up(&device->misc_wait); 89 92 bio_put(bio); 90 93 if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ ··· 132 135 i = peer_req->i; 133 136 do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; 134 137 block_id = peer_req->block_id; 138 + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; 135 139 136 140 spin_lock_irqsave(&device->resource->req_lock, flags); 137 141 device->writ_cnt += peer_req->i.size >> 9; ··· 396 398 if (!get_ldev(device)) 397 399 return -EIO; 398 400 399 - if (drbd_rs_should_slow_down(device, sector)) 400 - goto defer; 401 - 402 401 /* GFP_TRY, because if there is no memory available right now, this may 403 402 * be rescheduled for later. It is "only" background resync, after all. 
*/ 404 403 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, ··· 405 410 406 411 peer_req->w.cb = w_e_send_csum; 407 412 spin_lock_irq(&device->resource->req_lock); 408 - list_add(&peer_req->w.list, &device->read_ee); 413 + list_add_tail(&peer_req->w.list, &device->read_ee); 409 414 spin_unlock_irq(&device->resource->req_lock); 410 415 411 416 atomic_add(size >> 9, &device->rs_sect_ev); ··· 447 452 { 448 453 struct drbd_device *device = (struct drbd_device *) data; 449 454 450 - if (list_empty(&device->resync_work.list)) 451 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 452 - &device->resync_work); 455 + drbd_queue_work_if_unqueued( 456 + &first_peer_device(device)->connection->sender_work, 457 + &device->resync_work); 453 458 } 454 459 455 460 static void fifo_set(struct fifo_buffer *fb, int value) ··· 499 504 static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) 500 505 { 501 506 struct disk_conf *dc; 502 - unsigned int want; /* The number of sectors we want in the proxy */ 507 + unsigned int want; /* The number of sectors we want in-flight */ 503 508 int req_sect; /* Number of sectors to request in this turn */ 504 - int correction; /* Number of sectors more we need in the proxy*/ 509 + int correction; /* Number of sectors more we need in-flight */ 505 510 int cps; /* correction per invocation of drbd_rs_controller() */ 506 511 int steps; /* Number of time steps to plan ahead */ 507 512 int curr_corr; ··· 572 577 * potentially causing a distributed deadlock on congestion during 573 578 * online-verify or (checksum-based) resync, if max-buffers, 574 579 * socket buffer sizes and resync rate settings are mis-configured. 
*/ 575 - if (mxb - device->rs_in_flight < number) 576 - number = mxb - device->rs_in_flight; 580 + 581 + /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k), 582 + * mxb (as used here, and in drbd_alloc_pages on the peer) is 583 + * "number of pages" (typically also 4k), 584 + * but "rs_in_flight" is in "sectors" (512 Byte). */ 585 + if (mxb - device->rs_in_flight/8 < number) 586 + number = mxb - device->rs_in_flight/8; 577 587 578 588 return number; 579 589 } 580 590 581 - static int make_resync_request(struct drbd_device *device, int cancel) 591 + static int make_resync_request(struct drbd_device *const device, int cancel) 582 592 { 593 + struct drbd_peer_device *const peer_device = first_peer_device(device); 594 + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 583 595 unsigned long bit; 584 596 sector_t sector; 585 597 const sector_t capacity = drbd_get_capacity(device->this_bdev); 586 598 int max_bio_size; 587 599 int number, rollback_i, size; 588 - int align, queued, sndbuf; 600 + int align, requeue = 0; 589 601 int i = 0; 590 602 591 603 if (unlikely(cancel)) ··· 619 617 goto requeue; 620 618 621 619 for (i = 0; i < number; i++) { 622 - /* Stop generating RS requests, when half of the send buffer is filled */ 623 - mutex_lock(&first_peer_device(device)->connection->data.mutex); 624 - if (first_peer_device(device)->connection->data.socket) { 625 - queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued; 626 - sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf; 627 - } else { 628 - queued = 1; 629 - sndbuf = 0; 630 - } 631 - mutex_unlock(&first_peer_device(device)->connection->data.mutex); 632 - if (queued > sndbuf / 2) 620 + /* Stop generating RS requests when half of the send buffer is filled, 621 + * but notify TCP that we'd like to have more space. 
*/ 622 + mutex_lock(&connection->data.mutex); 623 + if (connection->data.socket) { 624 + struct sock *sk = connection->data.socket->sk; 625 + int queued = sk->sk_wmem_queued; 626 + int sndbuf = sk->sk_sndbuf; 627 + if (queued > sndbuf / 2) { 628 + requeue = 1; 629 + if (sk->sk_socket) 630 + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 631 + } 632 + } else 633 + requeue = 1; 634 + mutex_unlock(&connection->data.mutex); 635 + if (requeue) 633 636 goto requeue; 634 637 635 638 next_sector: ··· 649 642 650 643 sector = BM_BIT_TO_SECT(bit); 651 644 652 - if (drbd_rs_should_slow_down(device, sector) || 653 - drbd_try_rs_begin_io(device, sector)) { 645 + if (drbd_try_rs_begin_io(device, sector)) { 654 646 device->bm_resync_fo = bit; 655 647 goto requeue; 656 648 } ··· 702 696 /* adjust very last sectors, in case we are oddly sized */ 703 697 if (sector + (size>>9) > capacity) 704 698 size = (capacity-sector)<<9; 705 - if (first_peer_device(device)->connection->agreed_pro_version >= 89 && 706 - first_peer_device(device)->connection->csums_tfm) { 707 - switch (read_for_csum(first_peer_device(device), sector, size)) { 699 + 700 + if (device->use_csums) { 701 + switch (read_for_csum(peer_device, sector, size)) { 708 702 case -EIO: /* Disk failure */ 709 703 put_ldev(device); 710 704 return -EIO; ··· 723 717 int err; 724 718 725 719 inc_rs_pending(device); 726 - err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST, 720 + err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, 727 721 sector, size, ID_SYNCER); 728 722 if (err) { 729 723 drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); ··· 780 774 781 775 size = BM_BLOCK_SIZE; 782 776 783 - if (drbd_rs_should_slow_down(device, sector) || 784 - drbd_try_rs_begin_io(device, sector)) { 777 + if (drbd_try_rs_begin_io(device, sector)) { 785 778 device->ov_position = sector; 786 779 goto requeue; 787 780 } ··· 916 911 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) 917 912 khelper_cmd = 
"after-resync-target"; 918 913 919 - if (first_peer_device(device)->connection->csums_tfm && device->rs_total) { 914 + if (device->use_csums && device->rs_total) { 920 915 const unsigned long s = device->rs_same_csum; 921 916 const unsigned long t = device->rs_total; 922 917 const int ratio = ··· 1356 1351 { 1357 1352 struct drbd_request *req = container_of(w, struct drbd_request, w); 1358 1353 struct drbd_device *device = req->device; 1359 - struct drbd_connection *connection = first_peer_device(device)->connection; 1354 + struct drbd_peer_device *const peer_device = first_peer_device(device); 1355 + struct drbd_connection *const connection = peer_device->connection; 1360 1356 int err; 1361 1357 1362 1358 if (unlikely(cancel)) { 1363 1359 req_mod(req, SEND_CANCELED); 1364 1360 return 0; 1365 1361 } 1362 + req->pre_send_jif = jiffies; 1366 1363 1367 1364 /* this time, no connection->send.current_epoch_writes++; 1368 1365 * If it was sent, it was the closing barrier for the last ··· 1372 1365 * No more barriers will be sent, until we leave AHEAD mode again. 
*/ 1373 1366 maybe_send_barrier(connection, req->epoch); 1374 1367 1375 - err = drbd_send_out_of_sync(first_peer_device(device), req); 1368 + err = drbd_send_out_of_sync(peer_device, req); 1376 1369 req_mod(req, OOS_HANDED_TO_NETWORK); 1377 1370 1378 1371 return err; ··· 1387 1380 { 1388 1381 struct drbd_request *req = container_of(w, struct drbd_request, w); 1389 1382 struct drbd_device *device = req->device; 1390 - struct drbd_connection *connection = first_peer_device(device)->connection; 1383 + struct drbd_peer_device *const peer_device = first_peer_device(device); 1384 + struct drbd_connection *connection = peer_device->connection; 1391 1385 int err; 1392 1386 1393 1387 if (unlikely(cancel)) { 1394 1388 req_mod(req, SEND_CANCELED); 1395 1389 return 0; 1396 1390 } 1391 + req->pre_send_jif = jiffies; 1397 1392 1398 1393 re_init_if_first_write(connection, req->epoch); 1399 1394 maybe_send_barrier(connection, req->epoch); 1400 1395 connection->send.current_epoch_writes++; 1401 1396 1402 - err = drbd_send_dblock(first_peer_device(device), req); 1397 + err = drbd_send_dblock(peer_device, req); 1403 1398 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); 1404 1399 1405 1400 return err; ··· 1416 1407 { 1417 1408 struct drbd_request *req = container_of(w, struct drbd_request, w); 1418 1409 struct drbd_device *device = req->device; 1419 - struct drbd_connection *connection = first_peer_device(device)->connection; 1410 + struct drbd_peer_device *const peer_device = first_peer_device(device); 1411 + struct drbd_connection *connection = peer_device->connection; 1420 1412 int err; 1421 1413 1422 1414 if (unlikely(cancel)) { 1423 1415 req_mod(req, SEND_CANCELED); 1424 1416 return 0; 1425 1417 } 1418 + req->pre_send_jif = jiffies; 1426 1419 1427 1420 /* Even read requests may close a write epoch, 1428 1421 * if there was any yet. 
*/ 1429 1422 maybe_send_barrier(connection, req->epoch); 1430 1423 1431 - err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size, 1424 + err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, 1432 1425 (unsigned long)req); 1433 1426 1434 1427 req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); ··· 1444 1433 struct drbd_device *device = req->device; 1445 1434 1446 1435 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) 1447 - drbd_al_begin_io(device, &req->i, false); 1436 + drbd_al_begin_io(device, &req->i); 1448 1437 1449 1438 drbd_req_make_private_bio(req, req->master_bio); 1450 1439 req->private_bio->bi_bdev = device->ldev->backing_bdev; ··· 1612 1601 void start_resync_timer_fn(unsigned long data) 1613 1602 { 1614 1603 struct drbd_device *device = (struct drbd_device *) data; 1615 - 1616 - drbd_queue_work(&first_peer_device(device)->connection->sender_work, 1617 - &device->start_resync_work); 1604 + drbd_device_post_work(device, RS_START); 1618 1605 } 1619 1606 1620 - int w_start_resync(struct drbd_work *w, int cancel) 1607 + static void do_start_resync(struct drbd_device *device) 1621 1608 { 1622 - struct drbd_device *device = 1623 - container_of(w, struct drbd_device, start_resync_work); 1624 - 1625 1609 if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { 1626 - drbd_warn(device, "w_start_resync later...\n"); 1610 + drbd_warn(device, "postponing start_resync ...\n"); 1627 1611 device->start_resync_timer.expires = jiffies + HZ/10; 1628 1612 add_timer(&device->start_resync_timer); 1629 - return 0; 1613 + return; 1630 1614 } 1631 1615 1632 1616 drbd_start_resync(device, C_SYNC_SOURCE); 1633 1617 clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); 1634 - return 0; 1618 + } 1619 + 1620 + static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) 1621 + { 1622 + bool csums_after_crash_only; 1623 + 
rcu_read_lock(); 1624 + csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only; 1625 + rcu_read_unlock(); 1626 + return connection->agreed_pro_version >= 89 && /* supported? */ 1627 + connection->csums_tfm && /* configured? */ 1628 + (csums_after_crash_only == 0 /* use for each resync? */ 1629 + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ 1635 1630 } 1636 1631 1637 1632 /** ··· 1650 1633 */ 1651 1634 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) 1652 1635 { 1636 + struct drbd_peer_device *peer_device = first_peer_device(device); 1637 + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 1653 1638 union drbd_state ns; 1654 1639 int r; 1655 1640 ··· 1670 1651 if (r > 0) { 1671 1652 drbd_info(device, "before-resync-target handler returned %d, " 1672 1653 "dropping connection.\n", r); 1673 - conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD); 1654 + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); 1674 1655 return; 1675 1656 } 1676 1657 } else /* C_SYNC_SOURCE */ { ··· 1683 1664 } else { 1684 1665 drbd_info(device, "before-resync-source handler returned %d, " 1685 1666 "dropping connection.\n", r); 1686 - conn_request_state(first_peer_device(device)->connection, 1667 + conn_request_state(connection, 1687 1668 NS(conn, C_DISCONNECTING), CS_HARD); 1688 1669 return; 1689 1670 } ··· 1691 1672 } 1692 1673 } 1693 1674 1694 - if (current == first_peer_device(device)->connection->worker.task) { 1675 + if (current == connection->worker.task) { 1695 1676 /* The worker should not sleep waiting for state_mutex, 1696 1677 that can take long */ 1697 1678 if (!mutex_trylock(device->state_mutex)) { ··· 1752 1733 device->rs_mark_time[i] = now; 1753 1734 } 1754 1735 _drbd_pause_after(device); 1736 + /* Forget potentially stale cached per resync extent bit-counts. 
1737 + * Open coded drbd_rs_cancel_all(device), we already have IRQs 1738 + * disabled, and know the disk state is ok. */ 1739 + spin_lock(&device->al_lock); 1740 + lc_reset(device->resync); 1741 + device->resync_locked = 0; 1742 + device->resync_wenr = LC_FREE; 1743 + spin_unlock(&device->al_lock); 1755 1744 } 1756 1745 write_unlock(&global_state_lock); 1757 1746 spin_unlock_irq(&device->resource->req_lock); 1758 1747 1759 1748 if (r == SS_SUCCESS) { 1749 + wake_up(&device->al_wait); /* for lc_reset() above */ 1760 1750 /* reset rs_last_bcast when a resync or verify is started, 1761 1751 * to deal with potential jiffies wrap. */ 1762 1752 device->rs_last_bcast = jiffies - HZ; ··· 1774 1746 drbd_conn_str(ns.conn), 1775 1747 (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), 1776 1748 (unsigned long) device->rs_total); 1777 - if (side == C_SYNC_TARGET) 1749 + if (side == C_SYNC_TARGET) { 1778 1750 device->bm_resync_fo = 0; 1751 + device->use_csums = use_checksum_based_resync(connection, device); 1752 + } else { 1753 + device->use_csums = 0; 1754 + } 1779 1755 1780 1756 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid 1781 1757 * with w_send_oos, or the sync target will get confused as to ··· 1788 1756 * drbd_resync_finished from here in that case. 1789 1757 * We drbd_gen_and_send_sync_uuid here for protocol < 96, 1790 1758 * and from after_state_ch otherwise. 
*/ 1791 - if (side == C_SYNC_SOURCE && 1792 - first_peer_device(device)->connection->agreed_pro_version < 96) 1793 - drbd_gen_and_send_sync_uuid(first_peer_device(device)); 1759 + if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96) 1760 + drbd_gen_and_send_sync_uuid(peer_device); 1794 1761 1795 - if (first_peer_device(device)->connection->agreed_pro_version < 95 && 1796 - device->rs_total == 0) { 1762 + if (connection->agreed_pro_version < 95 && device->rs_total == 0) { 1797 1763 /* This still has a race (about when exactly the peers 1798 1764 * detect connection loss) that can lead to a full sync 1799 1765 * on next handshake. In 8.3.9 we fixed this with explicit ··· 1807 1777 int timeo; 1808 1778 1809 1779 rcu_read_lock(); 1810 - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); 1780 + nc = rcu_dereference(connection->net_conf); 1811 1781 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; 1812 1782 rcu_read_unlock(); 1813 1783 schedule_timeout_interruptible(timeo); ··· 1829 1799 mutex_unlock(device->state_mutex); 1830 1800 } 1831 1801 1802 + static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) 1803 + { 1804 + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; 1805 + device->rs_last_bcast = jiffies; 1806 + 1807 + if (!get_ldev(device)) 1808 + return; 1809 + 1810 + drbd_bm_write_lazy(device, 0); 1811 + if (resync_done && is_sync_state(device->state.conn)) 1812 + drbd_resync_finished(device); 1813 + 1814 + drbd_bcast_event(device, &sib); 1815 + /* update timestamp, in case it took a while to write out stuff */ 1816 + device->rs_last_bcast = jiffies; 1817 + put_ldev(device); 1818 + } 1819 + 1820 + static void drbd_ldev_destroy(struct drbd_device *device) 1821 + { 1822 + lc_destroy(device->resync); 1823 + device->resync = NULL; 1824 + lc_destroy(device->act_log); 1825 + device->act_log = NULL; 1826 + __no_warn(local, 1827 + drbd_free_ldev(device->ldev); 1828 + device->ldev = NULL;); 1829 + 
clear_bit(GOING_DISKLESS, &device->flags); 1830 + wake_up(&device->misc_wait); 1831 + } 1832 + 1833 + static void go_diskless(struct drbd_device *device) 1834 + { 1835 + D_ASSERT(device, device->state.disk == D_FAILED); 1836 + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will 1837 + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch 1838 + * the protected members anymore, though, so once put_ldev reaches zero 1839 + * again, it will be safe to free them. */ 1840 + 1841 + /* Try to write changed bitmap pages, read errors may have just 1842 + * set some bits outside the area covered by the activity log. 1843 + * 1844 + * If we have an IO error during the bitmap writeout, 1845 + * we will want a full sync next time, just in case. 1846 + * (Do we want a specific meta data flag for this?) 1847 + * 1848 + * If that does not make it to stable storage either, 1849 + * we cannot do anything about that anymore. 1850 + * 1851 + * We still need to check if both bitmap and ldev are present, we may 1852 + * end up here after a failed attach, before ldev was even assigned. 1853 + */ 1854 + if (device->bitmap && device->ldev) { 1855 + /* An interrupted resync or similar is allowed to recounts bits 1856 + * while we detach. 1857 + * Any modifications would not be expected anymore, though. 1858 + */ 1859 + if (drbd_bitmap_io_from_worker(device, drbd_bm_write, 1860 + "detach", BM_LOCKED_TEST_ALLOWED)) { 1861 + if (test_bit(WAS_READ_ERROR, &device->flags)) { 1862 + drbd_md_set_flag(device, MDF_FULL_SYNC); 1863 + drbd_md_sync(device); 1864 + } 1865 + } 1866 + } 1867 + 1868 + drbd_force_state(device, NS(disk, D_DISKLESS)); 1869 + } 1870 + 1871 + static int do_md_sync(struct drbd_device *device) 1872 + { 1873 + drbd_warn(device, "md_sync_timer expired! 
Worker calls drbd_md_sync().\n"); 1874 + drbd_md_sync(device); 1875 + return 0; 1876 + } 1877 + 1878 + /* only called from drbd_worker thread, no locking */ 1879 + void __update_timing_details( 1880 + struct drbd_thread_timing_details *tdp, 1881 + unsigned int *cb_nr, 1882 + void *cb, 1883 + const char *fn, const unsigned int line) 1884 + { 1885 + unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST; 1886 + struct drbd_thread_timing_details *td = tdp + i; 1887 + 1888 + td->start_jif = jiffies; 1889 + td->cb_addr = cb; 1890 + td->caller_fn = fn; 1891 + td->line = line; 1892 + td->cb_nr = *cb_nr; 1893 + 1894 + i = (i+1) % DRBD_THREAD_DETAILS_HIST; 1895 + td = tdp + i; 1896 + memset(td, 0, sizeof(*td)); 1897 + 1898 + ++(*cb_nr); 1899 + } 1900 + 1901 + #define WORK_PENDING(work_bit, todo) (todo & (1UL << work_bit)) 1902 + static void do_device_work(struct drbd_device *device, const unsigned long todo) 1903 + { 1904 + if (WORK_PENDING(MD_SYNC, todo)) 1905 + do_md_sync(device); 1906 + if (WORK_PENDING(RS_DONE, todo) || 1907 + WORK_PENDING(RS_PROGRESS, todo)) 1908 + update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo)); 1909 + if (WORK_PENDING(GO_DISKLESS, todo)) 1910 + go_diskless(device); 1911 + if (WORK_PENDING(DESTROY_DISK, todo)) 1912 + drbd_ldev_destroy(device); 1913 + if (WORK_PENDING(RS_START, todo)) 1914 + do_start_resync(device); 1915 + } 1916 + 1917 + #define DRBD_DEVICE_WORK_MASK \ 1918 + ((1UL << GO_DISKLESS) \ 1919 + |(1UL << DESTROY_DISK) \ 1920 + |(1UL << MD_SYNC) \ 1921 + |(1UL << RS_START) \ 1922 + |(1UL << RS_PROGRESS) \ 1923 + |(1UL << RS_DONE) \ 1924 + ) 1925 + 1926 + static unsigned long get_work_bits(unsigned long *flags) 1927 + { 1928 + unsigned long old, new; 1929 + do { 1930 + old = *flags; 1931 + new = old & ~DRBD_DEVICE_WORK_MASK; 1932 + } while (cmpxchg(flags, old, new) != old); 1933 + return old & DRBD_DEVICE_WORK_MASK; 1934 + } 1935 + 1936 + static void do_unqueued_work(struct drbd_connection *connection) 1937 + { 1938 + struct 
drbd_peer_device *peer_device; 1939 + int vnr; 1940 + 1941 + rcu_read_lock(); 1942 + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1943 + struct drbd_device *device = peer_device->device; 1944 + unsigned long todo = get_work_bits(&device->flags); 1945 + if (!todo) 1946 + continue; 1947 + 1948 + kref_get(&device->kref); 1949 + rcu_read_unlock(); 1950 + do_device_work(device, todo); 1951 + kref_put(&device->kref, drbd_destroy_device); 1952 + rcu_read_lock(); 1953 + } 1954 + rcu_read_unlock(); 1955 + } 1956 + 1832 1957 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) 1833 1958 { 1834 1959 spin_lock_irq(&queue->q_lock); 1835 - list_splice_init(&queue->q, work_list); 1960 + list_splice_tail_init(&queue->q, work_list); 1836 1961 spin_unlock_irq(&queue->q_lock); 1837 1962 return !list_empty(work_list); 1838 1963 } ··· 2036 1851 /* dequeue single item only, 2037 1852 * we still use drbd_queue_work_front() in some places */ 2038 1853 if (!list_empty(&connection->sender_work.q)) 2039 - list_move(connection->sender_work.q.next, work_list); 1854 + list_splice_tail_init(&connection->sender_work.q, work_list); 2040 1855 spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ 2041 1856 if (!list_empty(work_list) || signal_pending(current)) { 2042 1857 spin_unlock_irq(&connection->resource->req_lock); ··· 2058 1873 if (send_barrier) 2059 1874 maybe_send_barrier(connection, 2060 1875 connection->send.current_epoch_nr + 1); 1876 + 1877 + if (test_bit(DEVICE_WORK_PENDING, &connection->flags)) 1878 + break; 1879 + 1880 + /* drbd_send() may have called flush_signals() */ 1881 + if (get_t_state(&connection->worker) != RUNNING) 1882 + break; 1883 + 2061 1884 schedule(); 2062 1885 /* may be woken up for other things but new work, too, 2063 1886 * e.g. if the current epoch got closed. 
··· 2099 1906 while (get_t_state(thi) == RUNNING) { 2100 1907 drbd_thread_current_set_cpu(thi); 2101 1908 2102 - /* as long as we use drbd_queue_work_front(), 2103 - * we may only dequeue single work items here, not batches. */ 2104 - if (list_empty(&work_list)) 1909 + if (list_empty(&work_list)) { 1910 + update_worker_timing_details(connection, wait_for_work); 2105 1911 wait_for_work(connection, &work_list); 1912 + } 1913 + 1914 + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { 1915 + update_worker_timing_details(connection, do_unqueued_work); 1916 + do_unqueued_work(connection); 1917 + } 2106 1918 2107 1919 if (signal_pending(current)) { 2108 1920 flush_signals(current); ··· 2124 1926 while (!list_empty(&work_list)) { 2125 1927 w = list_first_entry(&work_list, struct drbd_work, list); 2126 1928 list_del_init(&w->list); 1929 + update_worker_timing_details(connection, w->cb); 2127 1930 if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) 2128 1931 continue; 2129 1932 if (connection->cstate >= C_WF_REPORT_PARAMS) ··· 2133 1934 } 2134 1935 2135 1936 do { 1937 + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { 1938 + update_worker_timing_details(connection, do_unqueued_work); 1939 + do_unqueued_work(connection); 1940 + } 2136 1941 while (!list_empty(&work_list)) { 2137 1942 w = list_first_entry(&work_list, struct drbd_work, list); 2138 1943 list_del_init(&w->list); 1944 + update_worker_timing_details(connection, w->cb); 2139 1945 w->cb(w, 1); 2140 1946 } 2141 1947 dequeue_work_batch(&connection->sender_work, &work_list); 2142 - } while (!list_empty(&work_list)); 1948 + } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); 2143 1949 2144 1950 rcu_read_lock(); 2145 1951 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+84 -20
drivers/block/virtio_blk.c
··· 15 15 #include <linux/numa.h> 16 16 17 17 #define PART_BITS 4 18 + #define VQ_NAME_LEN 16 18 19 19 20 static int major; 20 21 static DEFINE_IDA(vd_index_ida); 21 22 22 23 static struct workqueue_struct *virtblk_wq; 23 24 25 + struct virtio_blk_vq { 26 + struct virtqueue *vq; 27 + spinlock_t lock; 28 + char name[VQ_NAME_LEN]; 29 + } ____cacheline_aligned_in_smp; 30 + 24 31 struct virtio_blk 25 32 { 26 33 struct virtio_device *vdev; 27 - struct virtqueue *vq; 28 - spinlock_t vq_lock; 29 34 30 35 /* The disk structure for the kernel. */ 31 36 struct gendisk *disk; ··· 52 47 53 48 /* Ida index - used to track minor number allocations. */ 54 49 int index; 50 + 51 + /* num of vqs */ 52 + int num_vqs; 53 + struct virtio_blk_vq *vqs; 55 54 }; 56 55 57 56 struct virtblk_req ··· 142 133 { 143 134 struct virtio_blk *vblk = vq->vdev->priv; 144 135 bool req_done = false; 136 + int qid = vq->index; 145 137 struct virtblk_req *vbr; 146 138 unsigned long flags; 147 139 unsigned int len; 148 140 149 - spin_lock_irqsave(&vblk->vq_lock, flags); 141 + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); 150 142 do { 151 143 virtqueue_disable_cb(vq); 152 - while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 144 + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { 153 145 blk_mq_complete_request(vbr->req); 154 146 req_done = true; 155 147 } ··· 161 151 /* In case queue is stopped waiting for more buffers. 
*/ 162 152 if (req_done) 163 153 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); 164 - spin_unlock_irqrestore(&vblk->vq_lock, flags); 154 + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); 165 155 } 166 156 167 157 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) ··· 170 160 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); 171 161 unsigned long flags; 172 162 unsigned int num; 163 + int qid = hctx->queue_num; 173 164 const bool last = (req->cmd_flags & REQ_END) != 0; 174 165 int err; 175 166 bool notify = false; ··· 213 202 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 214 203 } 215 204 216 - spin_lock_irqsave(&vblk->vq_lock, flags); 217 - err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); 205 + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); 206 + err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); 218 207 if (err) { 219 - virtqueue_kick(vblk->vq); 208 + virtqueue_kick(vblk->vqs[qid].vq); 220 209 blk_mq_stop_hw_queue(hctx); 221 - spin_unlock_irqrestore(&vblk->vq_lock, flags); 210 + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); 222 211 /* Out of mem doesn't actually happen, since we fall back 223 212 * to direct descriptors */ 224 213 if (err == -ENOMEM || err == -ENOSPC) ··· 226 215 return BLK_MQ_RQ_QUEUE_ERROR; 227 216 } 228 217 229 - if (last && virtqueue_kick_prepare(vblk->vq)) 218 + if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) 230 219 notify = true; 231 - spin_unlock_irqrestore(&vblk->vq_lock, flags); 220 + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); 232 221 233 222 if (notify) 234 - virtqueue_notify(vblk->vq); 223 + virtqueue_notify(vblk->vqs[qid].vq); 235 224 return BLK_MQ_RQ_QUEUE_OK; 236 225 } 237 226 ··· 388 377 static int init_vq(struct virtio_blk *vblk) 389 378 { 390 379 int err = 0; 380 + int i; 381 + vq_callback_t **callbacks; 382 + const char **names; 383 + struct virtqueue **vqs; 384 + unsigned short num_vqs; 385 + struct virtio_device *vdev = vblk->vdev; 391 386 392 - /* We 
expect one virtqueue, for output. */ 393 - vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests"); 394 - if (IS_ERR(vblk->vq)) 395 - err = PTR_ERR(vblk->vq); 387 + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, 388 + struct virtio_blk_config, num_queues, 389 + &num_vqs); 390 + if (err) 391 + num_vqs = 1; 396 392 393 + vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL); 394 + if (!vblk->vqs) { 395 + err = -ENOMEM; 396 + goto out; 397 + } 398 + 399 + names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL); 400 + if (!names) 401 + goto err_names; 402 + 403 + callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL); 404 + if (!callbacks) 405 + goto err_callbacks; 406 + 407 + vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL); 408 + if (!vqs) 409 + goto err_vqs; 410 + 411 + for (i = 0; i < num_vqs; i++) { 412 + callbacks[i] = virtblk_done; 413 + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i); 414 + names[i] = vblk->vqs[i].name; 415 + } 416 + 417 + /* Discover virtqueues and write information to configuration. */ 418 + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); 419 + if (err) 420 + goto err_find_vqs; 421 + 422 + for (i = 0; i < num_vqs; i++) { 423 + spin_lock_init(&vblk->vqs[i].lock); 424 + vblk->vqs[i].vq = vqs[i]; 425 + } 426 + vblk->num_vqs = num_vqs; 427 + 428 + err_find_vqs: 429 + kfree(vqs); 430 + err_vqs: 431 + kfree(callbacks); 432 + err_callbacks: 433 + kfree(names); 434 + err_names: 435 + if (err) 436 + kfree(vblk->vqs); 437 + out: 397 438 return err; 398 439 } 399 440 ··· 614 551 err = init_vq(vblk); 615 552 if (err) 616 553 goto out_free_vblk; 617 - spin_lock_init(&vblk->vq_lock); 618 554 619 555 /* FIXME: How many partitions? How long is a piece of string? */ 620 556 vblk->disk = alloc_disk(1 << PART_BITS); ··· 624 562 625 563 /* Default queue sizing is to fill the ring. 
*/ 626 564 if (!virtblk_queue_depth) { 627 - virtblk_queue_depth = vblk->vq->num_free; 565 + virtblk_queue_depth = vblk->vqs[0].vq->num_free; 628 566 /* ... but without indirect descs, we use 2 descs per req */ 629 567 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) 630 568 virtblk_queue_depth /= 2; ··· 632 570 633 571 memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); 634 572 vblk->tag_set.ops = &virtio_mq_ops; 635 - vblk->tag_set.nr_hw_queues = 1; 636 573 vblk->tag_set.queue_depth = virtblk_queue_depth; 637 574 vblk->tag_set.numa_node = NUMA_NO_NODE; 638 575 vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ··· 639 578 sizeof(struct virtblk_req) + 640 579 sizeof(struct scatterlist) * sg_elems; 641 580 vblk->tag_set.driver_data = vblk; 581 + vblk->tag_set.nr_hw_queues = vblk->num_vqs; 642 582 643 583 err = blk_mq_alloc_tag_set(&vblk->tag_set); 644 584 if (err) ··· 789 727 refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); 790 728 put_disk(vblk->disk); 791 729 vdev->config->del_vqs(vdev); 730 + kfree(vblk->vqs); 792 731 kfree(vblk); 793 732 794 733 /* Only free device id if we don't have any users */ ··· 840 777 static unsigned int features[] = { 841 778 VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, 842 779 VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, 843 - VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE 780 + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, 781 + VIRTIO_BLK_F_MQ, 844 782 }; 845 783 846 784 static struct virtio_driver virtio_blk = {
+1 -1
drivers/md/bcache/alloc.c
··· 331 331 mutex_unlock(&ca->set->bucket_lock); 332 332 blkdev_issue_discard(ca->bdev, 333 333 bucket_to_sector(ca->set, bucket), 334 - ca->sb.block_size, GFP_KERNEL, 0); 334 + ca->sb.bucket_size, GFP_KERNEL, 0); 335 335 mutex_lock(&ca->set->bucket_lock); 336 336 } 337 337
+4
drivers/md/bcache/bcache.h
··· 477 477 * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; 478 478 * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. 479 479 * flushing dirty data). 480 + * 481 + * CACHE_SET_RUNNING means all cache devices have been registered and journal 482 + * replay is complete. 480 483 */ 481 484 #define CACHE_SET_UNREGISTERING 0 482 485 #define CACHE_SET_STOPPING 1 486 + #define CACHE_SET_RUNNING 2 483 487 484 488 struct cache_set { 485 489 struct closure cl;
+1 -1
drivers/md/bcache/bset.c
··· 1182 1182 { 1183 1183 uint64_t start_time; 1184 1184 bool used_mempool = false; 1185 - struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, 1186 1186 order); 1187 1187 if (!out) { 1188 1188 struct page *outp;
+1 -1
drivers/md/bcache/bset.h
··· 453 453 { 454 454 return (KEY_DIRTY(l) == KEY_DIRTY(r) && 455 455 KEY_PTRS(l) == KEY_PTRS(r) && 456 - KEY_CSUM(l) == KEY_CSUM(l)); 456 + KEY_CSUM(l) == KEY_CSUM(r)); 457 457 } 458 458 459 459 /* Keylists */
+31 -19
drivers/md/bcache/btree.c
··· 117 117 ({ \ 118 118 int _r, l = (b)->level - 1; \ 119 119 bool _w = l <= (op)->lock; \ 120 - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ 120 + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ 121 + _w, b); \ 121 122 if (!IS_ERR(_child)) { \ 122 - _child->parent = (b); \ 123 123 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ 124 124 rw_unlock(_w, _child); \ 125 125 } else \ ··· 142 142 rw_lock(_w, _b, _b->level); \ 143 143 if (_b == (c)->root && \ 144 144 _w == insert_lock(op, _b)) { \ 145 - _b->parent = NULL; \ 146 145 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 147 146 } \ 148 147 rw_unlock(_w, _b); \ ··· 201 202 struct bset *i = btree_bset_first(b); 202 203 struct btree_iter *iter; 203 204 204 - iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 205 + iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); 205 206 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 206 207 iter->used = 0; 207 208 ··· 420 421 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 421 422 bset_sector_offset(&b->keys, i)); 422 423 423 - if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 424 + if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 424 425 int j; 425 426 struct bio_vec *bv; 426 427 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); ··· 966 967 * level and op->lock. 
967 968 */ 968 969 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, 969 - struct bkey *k, int level, bool write) 970 + struct bkey *k, int level, bool write, 971 + struct btree *parent) 970 972 { 971 973 int i = 0; 972 974 struct btree *b; ··· 1002 1002 BUG_ON(b->level != level); 1003 1003 } 1004 1004 1005 + b->parent = parent; 1005 1006 b->accessed = 1; 1006 1007 1007 1008 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { ··· 1023 1022 return b; 1024 1023 } 1025 1024 1026 - static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) 1025 + static void btree_node_prefetch(struct btree *parent, struct bkey *k) 1027 1026 { 1028 1027 struct btree *b; 1029 1028 1030 - mutex_lock(&c->bucket_lock); 1031 - b = mca_alloc(c, NULL, k, level); 1032 - mutex_unlock(&c->bucket_lock); 1029 + mutex_lock(&parent->c->bucket_lock); 1030 + b = mca_alloc(parent->c, NULL, k, parent->level - 1); 1031 + mutex_unlock(&parent->c->bucket_lock); 1033 1032 1034 1033 if (!IS_ERR_OR_NULL(b)) { 1034 + b->parent = parent; 1035 1035 bch_btree_node_read(b); 1036 1036 rw_unlock(true, b); 1037 1037 } ··· 1062 1060 mutex_unlock(&b->c->bucket_lock); 1063 1061 } 1064 1062 1065 - struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, 1066 - int level) 1063 + struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, 1064 + int level, bool wait, 1065 + struct btree *parent) 1067 1066 { 1068 1067 BKEY_PADDED(key) k; 1069 1068 struct btree *b = ERR_PTR(-EAGAIN); 1070 1069 1071 1070 mutex_lock(&c->bucket_lock); 1072 1071 retry: 1073 - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) 1072 + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) 1074 1073 goto err; 1075 1074 1076 1075 bkey_put(c, &k.key); ··· 1088 1085 } 1089 1086 1090 1087 b->accessed = 1; 1088 + b->parent = parent; 1091 1089 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); 1092 1090 1093 1091 
mutex_unlock(&c->bucket_lock); ··· 1100 1096 err: 1101 1097 mutex_unlock(&c->bucket_lock); 1102 1098 1103 - trace_bcache_btree_node_alloc_fail(b); 1099 + trace_bcache_btree_node_alloc_fail(c); 1104 1100 return b; 1101 + } 1102 + 1103 + static struct btree *bch_btree_node_alloc(struct cache_set *c, 1104 + struct btree_op *op, int level, 1105 + struct btree *parent) 1106 + { 1107 + return __bch_btree_node_alloc(c, op, level, op != NULL, parent); 1105 1108 } 1106 1109 1107 1110 static struct btree *btree_node_alloc_replacement(struct btree *b, 1108 1111 struct btree_op *op) 1109 1112 { 1110 - struct btree *n = bch_btree_node_alloc(b->c, op, b->level); 1113 + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); 1111 1114 if (!IS_ERR_OR_NULL(n)) { 1112 1115 mutex_lock(&n->write_lock); 1113 1116 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); ··· 1414 1403 BUG_ON(btree_bset_first(new_nodes[0])->keys); 1415 1404 btree_node_free(new_nodes[0]); 1416 1405 rw_unlock(true, new_nodes[0]); 1406 + new_nodes[0] = NULL; 1417 1407 1418 1408 for (i = 0; i < nodes; i++) { 1419 1409 if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) ··· 1528 1516 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1529 1517 if (k) { 1530 1518 r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 1531 - true); 1519 + true, b); 1532 1520 if (IS_ERR(r->b)) { 1533 1521 ret = PTR_ERR(r->b); 1534 1522 break; ··· 1823 1811 k = bch_btree_iter_next_filter(&iter, &b->keys, 1824 1812 bch_ptr_bad); 1825 1813 if (k) 1826 - btree_node_prefetch(b->c, k, b->level - 1); 1814 + btree_node_prefetch(b, k); 1827 1815 1828 1816 if (p) 1829 1817 ret = btree(check_recurse, p, b, op); ··· 1988 1976 1989 1977 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 1990 1978 1991 - n2 = bch_btree_node_alloc(b->c, op, b->level); 1979 + n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); 1992 1980 if (IS_ERR(n2)) 1993 1981 goto err_free1; 1994 1982 1995 1983 if (!b->parent) { 
1996 - n3 = bch_btree_node_alloc(b->c, op, b->level + 1); 1984 + n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); 1997 1985 if (IS_ERR(n3)) 1998 1986 goto err_free2; 1999 1987 }
+3 -2
drivers/md/bcache/btree.h
··· 242 242 void bch_btree_node_write(struct btree *, struct closure *); 243 243 244 244 void bch_btree_set_root(struct btree *); 245 - struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); 245 + struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, 246 + int, bool, struct btree *); 246 247 struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, 247 - struct bkey *, int, bool); 248 + struct bkey *, int, bool, struct btree *); 248 249 249 250 int bch_btree_insert_check_key(struct btree *, struct btree_op *, 250 251 struct bkey *);
+9 -4
drivers/md/bcache/extents.c
··· 474 474 return false; 475 475 } 476 476 477 - static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) 477 + bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k) 478 478 { 479 - struct btree *b = container_of(bk, struct btree, keys); 480 479 char buf[80]; 481 480 482 481 if (!KEY_SIZE(k)) ··· 484 485 if (KEY_SIZE(k) > KEY_OFFSET(k)) 485 486 goto bad; 486 487 487 - if (__ptr_invalid(b->c, k)) 488 + if (__ptr_invalid(c, k)) 488 489 goto bad; 489 490 490 491 return false; 491 492 bad: 492 493 bch_extent_to_text(buf, sizeof(buf), k); 493 - cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); 494 + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); 494 495 return true; 496 + } 497 + 498 + static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) 499 + { 500 + struct btree *b = container_of(bk, struct btree, keys); 501 + return __bch_extent_invalid(b->c, k); 495 502 } 496 503 497 504 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
+1
drivers/md/bcache/extents.h
··· 9 9 10 10 void bch_extent_to_text(char *, size_t, const struct bkey *); 11 11 bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); 12 + bool __bch_extent_invalid(struct cache_set *, const struct bkey *); 12 13 13 14 #endif /* _BCACHE_EXTENTS_H */
+15 -9
drivers/md/bcache/journal.c
··· 7 7 #include "bcache.h" 8 8 #include "btree.h" 9 9 #include "debug.h" 10 + #include "extents.h" 10 11 11 12 #include <trace/events/bcache.h> 12 13 ··· 190 189 if (read_bucket(l)) 191 190 goto bsearch; 192 191 193 - if (list_empty(list)) 192 + /* no journal entries on this device? */ 193 + if (l == ca->sb.njournal_buckets) 194 194 continue; 195 195 bsearch: 196 + BUG_ON(list_empty(list)); 197 + 196 198 /* Binary search */ 197 - m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); 199 + m = l; 200 + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); 198 201 pr_debug("starting binary search, l %u r %u", l, r); 199 202 200 203 while (l + 1 < r) { ··· 296 291 297 292 for (k = i->j.start; 298 293 k < bset_bkey_last(&i->j); 299 - k = bkey_next(k)) { 300 - unsigned j; 294 + k = bkey_next(k)) 295 + if (!__bch_extent_invalid(c, k)) { 296 + unsigned j; 301 297 302 - for (j = 0; j < KEY_PTRS(k); j++) 303 - if (ptr_available(c, k, j)) 304 - atomic_inc(&PTR_BUCKET(c, k, j)->pin); 298 + for (j = 0; j < KEY_PTRS(k); j++) 299 + if (ptr_available(c, k, j)) 300 + atomic_inc(&PTR_BUCKET(c, k, j)->pin); 305 301 306 - bch_initial_mark_key(c, 0, k); 307 - } 302 + bch_initial_mark_key(c, 0, k); 303 + } 308 304 } 309 305 } 310 306
+2 -1
drivers/md/bcache/request.c
··· 311 311 { 312 312 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 313 313 314 - trace_bcache_write(op->bio, op->writeback, op->bypass); 314 + trace_bcache_write(op->c, op->inode, op->bio, 315 + op->writeback, op->bypass); 315 316 316 317 bch_keylist_init(&op->insert_keys); 317 318 bio_get(op->bio);
+37 -20
drivers/md/bcache/super.c
··· 733 733 static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, 734 734 unsigned id) 735 735 { 736 - BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); 737 - 738 736 d->id = id; 739 737 d->c = c; 740 738 c->devices[id] = d; ··· 925 927 list_move(&dc->list, &uncached_devices); 926 928 927 929 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); 930 + clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); 928 931 929 932 mutex_unlock(&bch_register_lock); 930 933 ··· 1040 1041 */ 1041 1042 atomic_set(&dc->count, 1); 1042 1043 1044 + if (bch_cached_dev_writeback_start(dc)) 1045 + return -ENOMEM; 1046 + 1043 1047 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { 1044 1048 bch_sectors_dirty_init(dc); 1045 1049 atomic_set(&dc->has_dirty, 1); ··· 1072 1070 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1073 1071 1074 1072 cancel_delayed_work_sync(&dc->writeback_rate_update); 1075 - kthread_stop(dc->writeback_thread); 1073 + if (!IS_ERR_OR_NULL(dc->writeback_thread)) 1074 + kthread_stop(dc->writeback_thread); 1076 1075 1077 1076 mutex_lock(&bch_register_lock); 1078 1077 ··· 1084 1081 1085 1082 mutex_unlock(&bch_register_lock); 1086 1083 1087 - if (!IS_ERR_OR_NULL(dc->bdev)) { 1088 - if (dc->bdev->bd_disk) 1089 - blk_sync_queue(bdev_get_queue(dc->bdev)); 1090 - 1084 + if (!IS_ERR_OR_NULL(dc->bdev)) 1091 1085 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1092 - } 1093 1086 1094 1087 wake_up(&unregister_wait); 1095 1088 ··· 1212 1213 static void flash_dev_free(struct closure *cl) 1213 1214 { 1214 1215 struct bcache_device *d = container_of(cl, struct bcache_device, cl); 1216 + mutex_lock(&bch_register_lock); 1215 1217 bcache_device_free(d); 1218 + mutex_unlock(&bch_register_lock); 1216 1219 kobject_put(&d->kobj); 1217 1220 } 1218 1221 ··· 1222 1221 { 1223 1222 struct bcache_device *d = container_of(cl, struct bcache_device, cl); 1224 1223 1224 + mutex_lock(&bch_register_lock); 1225 1225 bcache_device_unlink(d); 1226 + 
mutex_unlock(&bch_register_lock); 1226 1227 kobject_del(&d->kobj); 1227 1228 continue_at(cl, flash_dev_free, system_wq); 1228 1229 } ··· 1279 1276 1280 1277 if (test_bit(CACHE_SET_STOPPING, &c->flags)) 1281 1278 return -EINTR; 1279 + 1280 + if (!test_bit(CACHE_SET_RUNNING, &c->flags)) 1281 + return -EPERM; 1282 1282 1283 1283 u = uuid_find_empty(c); 1284 1284 if (!u) { ··· 1352 1346 bch_journal_free(c); 1353 1347 1354 1348 for_each_cache(ca, c, i) 1355 - if (ca) 1349 + if (ca) { 1350 + ca->set = NULL; 1351 + c->cache[ca->sb.nr_this_dev] = NULL; 1356 1352 kobject_put(&ca->kobj); 1353 + } 1357 1354 1358 1355 bch_bset_sort_state_free(&c->sort); 1359 1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); ··· 1414 1405 if (ca->alloc_thread) 1415 1406 kthread_stop(ca->alloc_thread); 1416 1407 1417 - cancel_delayed_work_sync(&c->journal.work); 1418 - /* flush last journal entry if needed */ 1419 - c->journal.work.work.func(&c->journal.work.work); 1408 + if (c->journal.cur) { 1409 + cancel_delayed_work_sync(&c->journal.work); 1410 + /* flush last journal entry if needed */ 1411 + c->journal.work.work.func(&c->journal.work.work); 1412 + } 1420 1413 1421 1414 closure_return(cl); 1422 1415 } ··· 1597 1586 goto err; 1598 1587 1599 1588 err = "error reading btree root"; 1600 - c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); 1589 + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); 1601 1590 if (IS_ERR_OR_NULL(c->root)) 1602 1591 goto err; 1603 1592 ··· 1672 1661 goto err; 1673 1662 1674 1663 err = "cannot allocate new btree root"; 1675 - c->root = bch_btree_node_alloc(c, NULL, 0); 1664 + c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); 1676 1665 if (IS_ERR_OR_NULL(c->root)) 1677 1666 goto err; 1678 1667 ··· 1708 1697 1709 1698 flash_devs_run(c); 1710 1699 1700 + set_bit(CACHE_SET_RUNNING, &c->flags); 1711 1701 return; 1712 1702 err: 1713 1703 closure_sync(&cl); ··· 1772 1760 pr_debug("set version = %llu", c->sb.version); 
1773 1761 } 1774 1762 1763 + kobject_get(&ca->kobj); 1775 1764 ca->set = c; 1776 1765 ca->set->cache[ca->sb.nr_this_dev] = ca; 1777 1766 c->cache_by_alloc[c->caches_loaded++] = ca; ··· 1793 1780 struct cache *ca = container_of(kobj, struct cache, kobj); 1794 1781 unsigned i; 1795 1782 1796 - if (ca->set) 1783 + if (ca->set) { 1784 + BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); 1797 1785 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1786 + } 1798 1787 1799 1788 bio_split_pool_free(&ca->bio_split_hook); 1800 1789 ··· 1813 1798 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1814 1799 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1815 1800 1816 - if (!IS_ERR_OR_NULL(ca->bdev)) { 1817 - blk_sync_queue(bdev_get_queue(ca->bdev)); 1801 + if (!IS_ERR_OR_NULL(ca->bdev)) 1818 1802 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1819 - } 1820 1803 1821 1804 kfree(ca); 1822 1805 module_put(THIS_MODULE); ··· 1857 1844 } 1858 1845 1859 1846 static void register_cache(struct cache_sb *sb, struct page *sb_page, 1860 - struct block_device *bdev, struct cache *ca) 1847 + struct block_device *bdev, struct cache *ca) 1861 1848 { 1862 1849 char name[BDEVNAME_SIZE]; 1863 1850 const char *err = "cannot allocate memory"; ··· 1890 1877 goto err; 1891 1878 1892 1879 pr_info("registered cache device %s", bdevname(bdev, name)); 1880 + out: 1881 + kobject_put(&ca->kobj); 1893 1882 return; 1894 1883 err: 1895 1884 pr_notice("error opening %s: %s", bdevname(bdev, name), err); 1896 - kobject_put(&ca->kobj); 1885 + goto out; 1897 1886 } 1898 1887 1899 1888 /* Global interfaces/init */ ··· 1960 1945 if (IS_ERR(bdev)) { 1961 1946 if (bdev == ERR_PTR(-EBUSY)) { 1962 1947 bdev = lookup_bdev(strim(path)); 1948 + mutex_lock(&bch_register_lock); 1963 1949 if (!IS_ERR(bdev) && bch_is_open(bdev)) 1964 1950 err = "device already registered"; 1965 1951 else 1966 1952 err = "device busy"; 1953 + mutex_unlock(&bch_register_lock); 1967 1954 } 1968 1955 goto err; 1969 1956 }
+2 -2
drivers/md/bcache/util.h
··· 416 416 average_frequency, frequency_units); \ 417 417 __print_time_stat(stats, name, \ 418 418 average_duration, duration_units); \ 419 - __print_time_stat(stats, name, \ 420 - max_duration, duration_units); \ 419 + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ 420 + div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ 421 421 \ 422 422 sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ 423 423 ? div_s64(local_clock() - (stats)->last, \
+10 -4
drivers/md/bcache/writeback.c
··· 239 239 if (KEY_START(&w->key) != dc->last_read || 240 240 jiffies_to_msecs(delay) > 50) 241 241 while (!kthread_should_stop() && delay) 242 - delay = schedule_timeout_uninterruptible(delay); 242 + delay = schedule_timeout_interruptible(delay); 243 243 244 244 dc->last_read = KEY_OFFSET(&w->key); 245 245 ··· 436 436 while (delay && 437 437 !kthread_should_stop() && 438 438 !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) 439 - delay = schedule_timeout_uninterruptible(delay); 439 + delay = schedule_timeout_interruptible(delay); 440 440 } 441 441 } 442 442 ··· 478 478 dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); 479 479 } 480 480 481 - int bch_cached_dev_writeback_init(struct cached_dev *dc) 481 + void bch_cached_dev_writeback_init(struct cached_dev *dc) 482 482 { 483 483 sema_init(&dc->in_flight, 64); 484 484 init_rwsem(&dc->writeback_lock); ··· 494 494 dc->writeback_rate_d_term = 30; 495 495 dc->writeback_rate_p_term_inverse = 6000; 496 496 497 + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 498 + } 499 + 500 + int bch_cached_dev_writeback_start(struct cached_dev *dc) 501 + { 497 502 dc->writeback_thread = kthread_create(bch_writeback_thread, dc, 498 503 "bcache_writeback"); 499 504 if (IS_ERR(dc->writeback_thread)) 500 505 return PTR_ERR(dc->writeback_thread); 501 506 502 - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 503 507 schedule_delayed_work(&dc->writeback_rate_update, 504 508 dc->writeback_rate_update_seconds * HZ); 509 + 510 + bch_writeback_queue(dc); 505 511 506 512 return 0; 507 513 }
+2 -1
drivers/md/bcache/writeback.h
··· 85 85 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); 86 86 87 87 void bch_sectors_dirty_init(struct cached_dev *dc); 88 - int bch_cached_dev_writeback_init(struct cached_dev *); 88 + void bch_cached_dev_writeback_init(struct cached_dev *); 89 + int bch_cached_dev_writeback_start(struct cached_dev *); 89 90 90 91 #endif
+2 -2
include/linux/drbd.h
··· 52 52 #endif 53 53 54 54 extern const char *drbd_buildtag(void); 55 - #define REL_VERSION "8.4.3" 55 + #define REL_VERSION "8.4.5" 56 56 #define API_VERSION 1 57 57 #define PRO_VERSION_MIN 86 58 58 #define PRO_VERSION_MAX 101 ··· 245 245 D_DISKLESS, 246 246 D_ATTACHING, /* In the process of reading the meta-data */ 247 247 D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */ 248 - /* when >= D_FAILED it is legal to access mdev->bc */ 248 + /* when >= D_FAILED it is legal to access mdev->ldev */ 249 249 D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ 250 250 D_INCONSISTENT, 251 251 D_OUTDATED,
+4
include/linux/drbd_genl.h
··· 171 171 __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) 172 172 __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) 173 173 /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ 174 + /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ 175 + /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ 176 + __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) 177 + __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF) 174 178 ) 175 179 176 180 GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
+6
include/linux/drbd_limits.h
··· 214 214 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 215 215 #define DRBD_ALWAYS_ASBP_DEF 0 216 216 #define DRBD_USE_RLE_DEF 1 217 + #define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0 217 218 218 219 #define DRBD_AL_STRIPES_MIN 1 219 220 #define DRBD_AL_STRIPES_MAX 1024 ··· 225 224 #define DRBD_AL_STRIPE_SIZE_MAX 16777216 226 225 #define DRBD_AL_STRIPE_SIZE_DEF 32 227 226 #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ 227 + 228 + #define DRBD_SOCKET_CHECK_TIMEO_MIN 0 229 + #define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX 230 + #define DRBD_SOCKET_CHECK_TIMEO_DEF 0 231 + #define DRBD_SOCKET_CHECK_TIMEO_SCALE '1' 228 232 #endif
+3 -1
include/scsi/sg.h
··· 86 86 #define SG_FLAG_MMAP_IO 4 /* request memory mapped IO */ 87 87 #define SG_FLAG_NO_DXFER 0x10000 /* no transfer of kernel buffers to/from */ 88 88 /* user space (debug indirect IO) */ 89 - #define SG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */ 89 + /* defaults:: for sg driver: Q_AT_HEAD; for block layer: Q_AT_TAIL */ 90 + #define SG_FLAG_Q_AT_TAIL 0x10 91 + #define SG_FLAG_Q_AT_HEAD 0x20 90 92 91 93 /* following 'info' values are "or"-ed together */ 92 94 #define SG_INFO_OK_MASK 0x1
+12 -9
include/trace/events/bcache.h
··· 148 148 ); 149 149 150 150 TRACE_EVENT(bcache_write, 151 - TP_PROTO(struct bio *bio, bool writeback, bool bypass), 152 - TP_ARGS(bio, writeback, bypass), 151 + TP_PROTO(struct cache_set *c, u64 inode, struct bio *bio, 152 + bool writeback, bool bypass), 153 + TP_ARGS(c, inode, bio, writeback, bypass), 153 154 154 155 TP_STRUCT__entry( 155 - __field(dev_t, dev ) 156 + __array(char, uuid, 16 ) 157 + __field(u64, inode ) 156 158 __field(sector_t, sector ) 157 159 __field(unsigned int, nr_sector ) 158 160 __array(char, rwbs, 6 ) ··· 163 161 ), 164 162 165 163 TP_fast_assign( 166 - __entry->dev = bio->bi_bdev->bd_dev; 164 + memcpy(__entry->uuid, c->sb.set_uuid, 16); 165 + __entry->inode = inode; 167 166 __entry->sector = bio->bi_iter.bi_sector; 168 167 __entry->nr_sector = bio->bi_iter.bi_size >> 9; 169 168 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_iter.bi_size); ··· 172 169 __entry->bypass = bypass; 173 170 ), 174 171 175 - TP_printk("%d,%d %s %llu + %u hit %u bypass %u", 176 - MAJOR(__entry->dev), MINOR(__entry->dev), 172 + TP_printk("%pU inode %llu %s %llu + %u hit %u bypass %u", 173 + __entry->uuid, __entry->inode, 177 174 __entry->rwbs, (unsigned long long)__entry->sector, 178 175 __entry->nr_sector, __entry->writeback, __entry->bypass) 179 176 ); ··· 261 258 TP_ARGS(b) 262 259 ); 263 260 264 - DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, 265 - TP_PROTO(struct btree *b), 266 - TP_ARGS(b) 261 + DEFINE_EVENT(cache_set, bcache_btree_node_alloc_fail, 262 + TP_PROTO(struct cache_set *c), 263 + TP_ARGS(c) 267 264 ); 268 265 269 266 DEFINE_EVENT(btree_node, bcache_btree_node_free,
+6 -5
include/uapi/linux/bsg.h
··· 10 10 #define BSG_SUB_PROTOCOL_SCSI_TRANSPORT 2 11 11 12 12 /* 13 - * For flags member below 14 - * sg.h sg_io_hdr also has bits defined for it's flags member. However 15 - * none of these bits are implemented/used by bsg. The bits below are 16 - * allocated to not conflict with sg.h ones anyway. 13 + * For flag constants below: 14 + * sg.h sg_io_hdr also has bits defined for it's flags member. These 15 + * two flag values (0x10 and 0x20) have the same meaning in sg.h . For 16 + * bsg the BSG_FLAG_Q_AT_HEAD flag is ignored since it is the deafult. 17 17 */ 18 - #define BSG_FLAG_Q_AT_TAIL 0x10 /* default, == 0 at this bit, is Q_AT_HEAD */ 18 + #define BSG_FLAG_Q_AT_TAIL 0x10 /* default is Q_AT_HEAD */ 19 + #define BSG_FLAG_Q_AT_HEAD 0x20 19 20 20 21 struct sg_io_v4 { 21 22 __s32 guard; /* [i] 'Q' to differentiate from v3 */
+5
include/uapi/linux/virtio_blk.h
··· 40 40 #define VIRTIO_BLK_F_WCE 9 /* Writeback mode enabled after reset */ 41 41 #define VIRTIO_BLK_F_TOPOLOGY 10 /* Topology information is available */ 42 42 #define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ 43 + #define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ 43 44 44 45 #ifndef __KERNEL__ 45 46 /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */ ··· 78 77 79 78 /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ 80 79 __u8 wce; 80 + __u8 unused; 81 + 82 + /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ 83 + __u16 num_queues; 81 84 } __attribute__((packed)); 82 85 83 86 /*
+13 -10
lib/lru_cache.c
··· 169 169 return NULL; 170 170 } 171 171 172 - void lc_free_by_index(struct lru_cache *lc, unsigned i) 172 + static void lc_free_by_index(struct lru_cache *lc, unsigned i) 173 173 { 174 174 void *p = lc->lc_element[i]; 175 175 WARN_ON(!p); ··· 643 643 * lc_dump - Dump a complete LRU cache to seq in textual form. 644 644 * @lc: the lru cache to operate on 645 645 * @seq: the &struct seq_file pointer to seq_printf into 646 - * @utext: user supplied "heading" or other info 646 + * @utext: user supplied additional "heading" or other info 647 647 * @detail: function pointer the user may provide to dump further details 648 - * of the object the lc_element is embedded in. 648 + * of the object the lc_element is embedded in. May be NULL. 649 + * Note: a leading space ' ' and trailing newline '\n' is implied. 649 650 */ 650 651 void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, 651 652 void (*detail) (struct seq_file *, struct lc_element *)) ··· 655 654 struct lc_element *e; 656 655 int i; 657 656 658 - seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext); 657 + seq_printf(seq, "\tnn: lc_number (new nr) refcnt %s\n ", utext); 659 658 for (i = 0; i < nr_elements; i++) { 660 659 e = lc_element_by_index(lc, i); 661 - if (e->lc_number == LC_FREE) { 662 - seq_printf(seq, "\t%2d: FREE\n", i); 663 - } else { 664 - seq_printf(seq, "\t%2d: %4u %4u ", i, 665 - e->lc_number, e->refcnt); 660 + if (e->lc_number != e->lc_new_number) 661 + seq_printf(seq, "\t%5d: %6d %8d %6d ", 662 + i, e->lc_number, e->lc_new_number, e->refcnt); 663 + else 664 + seq_printf(seq, "\t%5d: %6d %-8s %6d ", 665 + i, e->lc_number, "-\"-", e->refcnt); 666 + if (detail) 666 667 detail(seq, e); 667 - } 668 + seq_putc(seq, '\n'); 668 669 } 669 670 } 670 671