Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (27 commits)
block: make blk_init_free_list and elevator_init idempotent
block: avoid unconditionally freeing previously allocated request_queue
pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface
pipe: change the privilege required for growing a pipe beyond system max
pipe: adjust minimum pipe size to 1 page
block: disable preemption before using sched_clock()
cciss: call BUG() earlier
Preparing 8.3.8rc2
drbd: Reduce verbosity
drbd: use drbd specific ratelimit instead of global printk_ratelimit
drbd: fix hang on local read errors while disconnected
drbd: Removed the now empty w_io_error() function
drbd: removed duplicated #includes
drbd: improve usage of MSG_MORE
drbd: need to set socket bufsize early to take effect
drbd: improve network latency, TCP_QUICKACK
drbd: Revert "drbd: Create new current UUID as late as possible"
brd: support discard
Revert "writeback: fix WB_SYNC_NONE writeback from umount"
Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync"
...

+312 -265
+14 -6
block/blk-core.c
··· 467 467 { 468 468 struct request_list *rl = &q->rq; 469 469 470 + if (unlikely(rl->rq_pool)) 471 + return 0; 472 + 470 473 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 471 474 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 472 475 rl->elvpriv = 0; ··· 573 570 struct request_queue * 574 571 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 575 572 { 576 - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 573 + struct request_queue *uninit_q, *q; 577 574 578 - return blk_init_allocated_queue_node(q, rfn, lock, node_id); 575 + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); 576 + if (!uninit_q) 577 + return NULL; 578 + 579 + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); 580 + if (!q) 581 + blk_cleanup_queue(uninit_q); 582 + 583 + return q; 579 584 } 580 585 EXPORT_SYMBOL(blk_init_queue_node); 581 586 ··· 603 592 return NULL; 604 593 605 594 q->node = node_id; 606 - if (blk_init_free_list(q)) { 607 - kmem_cache_free(blk_requestq_cachep, q); 595 + if (blk_init_free_list(q)) 608 596 return NULL; 609 - } 610 597 611 598 q->request_fn = rfn; 612 599 q->prep_rq_fn = NULL; ··· 627 618 return q; 628 619 } 629 620 630 - blk_put_queue(q); 631 621 return NULL; 632 622 } 633 623 EXPORT_SYMBOL(blk_init_allocated_queue_node);
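
The blk_init_queue_node() change above follows the usual two-step allocate-then-initialize pattern: the half-built object stays in its own variable, the init step never frees what it did not allocate, and only the caller's failure path cleans up. A minimal userspace sketch of that pattern, with invented (non-kernel) names:

    #include <stdlib.h>

    struct queue { int initialized; };

    static struct queue *queue_alloc(void) { return calloc(1, sizeof(struct queue)); }
    static void queue_free(struct queue *q) { free(q); }

    /* Initialization step that can fail; it never frees its argument. */
    static struct queue *queue_init(struct queue *q)
    {
        q->initialized = 1;
        return q;               /* return NULL here to model an init failure */
    }

    struct queue *queue_create(void)
    {
        struct queue *uninit_q, *q;

        uninit_q = queue_alloc();
        if (!uninit_q)
            return NULL;

        q = queue_init(uninit_q);
        if (!q)
            queue_free(uninit_q);   /* failure path owns the cleanup */

        return q;
    }
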
+79 -22
block/cfq-iosched.c
··· 64 64 static struct completion *ioc_gone; 65 65 static DEFINE_SPINLOCK(ioc_gone_lock); 66 66 67 + static DEFINE_SPINLOCK(cic_index_lock); 68 + static DEFINE_IDA(cic_index_ida); 69 + 67 70 #define CFQ_PRIO_LISTS IOPRIO_BE_NR 68 71 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 69 72 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) ··· 274 271 unsigned int cfq_latency; 275 272 unsigned int cfq_group_isolation; 276 273 274 + unsigned int cic_index; 277 275 struct list_head cic_list; 278 276 279 277 /* ··· 432 428 struct cfq_queue *cfqq, bool is_sync) 433 429 { 434 430 cic->cfqq[is_sync] = cfqq; 431 + } 432 + 433 + #define CIC_DEAD_KEY 1ul 434 + #define CIC_DEAD_INDEX_SHIFT 1 435 + 436 + static inline void *cfqd_dead_key(struct cfq_data *cfqd) 437 + { 438 + return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 439 + } 440 + 441 + static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 442 + { 443 + struct cfq_data *cfqd = cic->key; 444 + 445 + if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 446 + return NULL; 447 + 448 + return cfqd; 435 449 } 436 450 437 451 /* ··· 2532 2510 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2533 2511 { 2534 2512 unsigned long flags; 2513 + unsigned long dead_key = (unsigned long) cic->key; 2535 2514 2536 - BUG_ON(!cic->dead_key); 2515 + BUG_ON(!(dead_key & CIC_DEAD_KEY)); 2537 2516 2538 2517 spin_lock_irqsave(&ioc->lock, flags); 2539 - radix_tree_delete(&ioc->radix_root, cic->dead_key); 2518 + radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); 2540 2519 hlist_del_rcu(&cic->cic_list); 2541 2520 spin_unlock_irqrestore(&ioc->lock, flags); 2542 2521 ··· 2560 2537 __call_for_each_cic(ioc, cic_free_func); 2561 2538 } 2562 2539 2563 - static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2540 + static void cfq_put_cooperator(struct cfq_queue *cfqq) 2564 2541 { 2565 2542 struct cfq_queue *__cfqq, *next; 2566 - 2567 - if (unlikely(cfqq == cfqd->active_queue)) { 2568 - __cfq_slice_expired(cfqd, cfqq, 0); 2569 - cfq_schedule_dispatch(cfqd); 2570 - } 2571 2543 2572 2544 /* 2573 2545 * If this queue was scheduled to merge with another queue, be ··· 2579 2561 cfq_put_queue(__cfqq); 2580 2562 __cfqq = next; 2581 2563 } 2564 + } 2565 + 2566 + static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2567 + { 2568 + if (unlikely(cfqq == cfqd->active_queue)) { 2569 + __cfq_slice_expired(cfqd, cfqq, 0); 2570 + cfq_schedule_dispatch(cfqd); 2571 + } 2572 + 2573 + cfq_put_cooperator(cfqq); 2582 2574 2583 2575 cfq_put_queue(cfqq); 2584 2576 } ··· 2601 2573 list_del_init(&cic->queue_list); 2602 2574 2603 2575 /* 2604 - * Make sure key == NULL is seen for dead queues 2576 + * Make sure dead mark is seen for dead queues 2605 2577 */ 2606 2578 smp_wmb(); 2607 - cic->dead_key = (unsigned long) cic->key; 2608 - cic->key = NULL; 2579 + cic->key = cfqd_dead_key(cfqd); 2609 2580 2610 2581 if (ioc->ioc_data == cic) 2611 2582 rcu_assign_pointer(ioc->ioc_data, NULL); ··· 2623 2596 static void cfq_exit_single_io_context(struct io_context *ioc, 2624 2597 struct cfq_io_context *cic) 2625 2598 { 2626 - struct cfq_data *cfqd = cic->key; 2599 + struct cfq_data *cfqd = cic_to_cfqd(cic); 2627 2600 2628 2601 if (cfqd) { 2629 2602 struct request_queue *q = cfqd->queue; ··· 2636 2609 * race between exiting task and queue 2637 2610 */ 2638 2611 smp_read_barrier_depends(); 2639 - if (cic->key) 2612 + if (cic->key == cfqd) 2640 2613 __cfq_exit_single_io_context(cfqd, cic); 2641 2614 2642 2615 spin_unlock_irqrestore(q->queue_lock, flags); ··· 2716 2689 2717 2690 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2718 2691 { 2719 - struct cfq_data *cfqd = cic->key; 2692 + struct cfq_data *cfqd = cic_to_cfqd(cic); 2720 2693 struct cfq_queue *cfqq; 2721 2694 unsigned long flags; 2722 2695 ··· 2773 2746 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2774 2747 { 2775 2748 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2776 - struct cfq_data *cfqd = cic->key; 2749 + struct cfq_data *cfqd = cic_to_cfqd(cic); 2777 2750 unsigned long flags; 2778 2751 struct request_queue *q; 2779 2752 ··· 2910 2883 unsigned long flags; 2911 2884 2912 2885 WARN_ON(!list_empty(&cic->queue_list)); 2886 + BUG_ON(cic->key != cfqd_dead_key(cfqd)); 2913 2887 2914 2888 spin_lock_irqsave(&ioc->lock, flags); 2915 2889 2916 2890 BUG_ON(ioc->ioc_data == cic); 2917 2891 2918 - radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 2892 + radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 2919 2893 hlist_del_rcu(&cic->cic_list); 2920 2894 spin_unlock_irqrestore(&ioc->lock, flags); 2921 2895 ··· 2928 2900 { 2929 2901 struct cfq_io_context *cic; 2930 2902 unsigned long flags; 2931 - void *k; 2932 2903 2933 2904 if (unlikely(!ioc)) 2934 2905 return NULL; ··· 2944 2917 } 2945 2918 2946 2919 do { 2947 - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 2920 + cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); 2948 2921 rcu_read_unlock(); 2949 2922 if (!cic) 2950 2923 break; 2951 - /* ->key must be copied to avoid race with cfq_exit_queue() */ 2952 - k = cic->key; 2953 - if (unlikely(!k)) { 2924 + if (unlikely(cic->key != cfqd)) { 2954 2925 cfq_drop_dead_cic(cfqd, ioc, cic); 2955 2926 rcu_read_lock(); 2956 2927 continue; ··· 2981 2956 2982 2957 spin_lock_irqsave(&ioc->lock, flags); 2983 2958 ret = radix_tree_insert(&ioc->radix_root, 2984 - (unsigned long) cfqd, cic); 2959 + cfqd->cic_index, cic); 2985 2960 if (!ret) 2986 2961 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2987 2962 spin_unlock_irqrestore(&ioc->lock, flags); ··· 3541 3516 } 3542 3517 3543 3518 cic_set_cfqq(cic, NULL, 1); 3519 + 3520 + cfq_put_cooperator(cfqq); 3521 + 3544 3522 cfq_put_queue(cfqq); 3545 3523 return NULL; 3546 3524 } ··· 3736 3708 3737 3709 cfq_shutdown_timer_wq(cfqd); 3738 3710 3711 + spin_lock(&cic_index_lock); 3712 + ida_remove(&cic_index_ida, cfqd->cic_index); 3713 + spin_unlock(&cic_index_lock); 3714 + 3739 3715 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3740 3716 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3717 + } 3718 + 3719 + static int cfq_alloc_cic_index(void) 3720 + { 3721 + int index, error; 3722 + 3723 + do { 3724 + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3725 + return -ENOMEM; 3726 + 3727 + spin_lock(&cic_index_lock); 3728 + error = ida_get_new(&cic_index_ida, &index); 3729 + spin_unlock(&cic_index_lock); 3730 + if (error && error != -EAGAIN) 3731 + return error; 3732 + } while (error); 3733 + 3734 + return index; 3741 3735 } 3742 3736 3743 3737 static void *cfq_init_queue(struct request_queue *q) ··· 3769 3719 struct cfq_group *cfqg; 3770 3720 struct cfq_rb_root *st; 3771 3721 3722 + i = cfq_alloc_cic_index(); 3723 + if (i < 0) 3724 + return NULL; 3725 + 3772 3726 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3773 3727 if (!cfqd) 3774 3728 return NULL; 3729 + 3730 + cfqd->cic_index = i; 3775 3731 3776 3732 /* Init root service tree */ 3777 3733 cfqd->grp_service_tree = CFQ_RB_ROOT; ··· 4040 3984 */ 4041 3985 if (elv_ioc_count_read(cfq_ioc_count)) 4042 3986 wait_for_completion(&all_gone); 3987 + ida_destroy(&cic_index_ida); 4043 3988 cfq_slab_kill(); 4044 3989 4045 3990
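
The CIC_DEAD_KEY scheme above replaces the separate dead_key field: when a queue dies, cic->key itself is overwritten with the cic_index shifted up by one and the low bit set, so a single word says both "this key is dead" and which radix-tree slot to delete. A small userspace sketch of that low-bit tagging idea (the names are made up for illustration; it assumes real object pointers are at least 2-byte aligned, so bit 0 of a live pointer is always clear):

    #include <assert.h>
    #include <stdio.h>

    #define DEAD_KEY         1ul
    #define DEAD_INDEX_SHIFT 1

    /* Encode a small integer index as a "dead" key with the low bit set. */
    static void *make_dead_key(unsigned long index)
    {
        return (void *)((index << DEAD_INDEX_SHIFT) | DEAD_KEY);
    }

    /* Live keys are ordinary (aligned) pointers, so their low bit is clear. */
    static int key_is_dead(const void *key)
    {
        return ((unsigned long)key & DEAD_KEY) != 0;
    }

    static unsigned long dead_key_to_index(const void *key)
    {
        return (unsigned long)key >> DEAD_INDEX_SHIFT;
    }

    int main(void)
    {
        int object = 42;
        void *key = &object;              /* live: an ordinary pointer */

        assert(!key_is_dead(key));

        key = make_dead_key(7);           /* mark dead, remember index 7 */
        assert(key_is_dead(key));
        printf("dead key encodes index %lu\n", dead_key_to_index(key));
        return 0;
    }
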
+5 -3
block/elevator.c
··· 242 242 { 243 243 struct elevator_type *e = NULL; 244 244 struct elevator_queue *eq; 245 - int ret = 0; 246 245 void *data; 246 + 247 + if (unlikely(q->elevator)) 248 + return 0; 247 249 248 250 INIT_LIST_HEAD(&q->queue_head); 249 251 q->last_merge = NULL; ··· 286 284 } 287 285 288 286 elevator_attach(q, eq, data); 289 - return ret; 287 + return 0; 290 288 } 291 289 EXPORT_SYMBOL(elevator_init); 292 290 ··· 1099 1097 struct elevator_type *__e; 1100 1098 int len = 0; 1101 1099 1102 - if (!q->elevator) 1100 + if (!q->elevator || !blk_queue_stackable(q)) 1103 1101 return sprintf(name, "none\n"); 1104 1102 1105 1103 elv = e->elevator_type;
+52 -1
drivers/block/brd.c
··· 133 133 return page; 134 134 } 135 135 136 + static void brd_free_page(struct brd_device *brd, sector_t sector) 137 + { 138 + struct page *page; 139 + pgoff_t idx; 140 + 141 + spin_lock(&brd->brd_lock); 142 + idx = sector >> PAGE_SECTORS_SHIFT; 143 + page = radix_tree_delete(&brd->brd_pages, idx); 144 + spin_unlock(&brd->brd_lock); 145 + if (page) 146 + __free_page(page); 147 + } 148 + 149 + static void brd_zero_page(struct brd_device *brd, sector_t sector) 150 + { 151 + struct page *page; 152 + 153 + page = brd_lookup_page(brd, sector); 154 + if (page) 155 + clear_highpage(page); 156 + } 157 + 136 158 /* 137 159 * Free all backing store pages and radix tree. This must only be called when 138 160 * there are no other users of the device. ··· 209 187 return -ENOMEM; 210 188 } 211 189 return 0; 190 + } 191 + 192 + static void discard_from_brd(struct brd_device *brd, 193 + sector_t sector, size_t n) 194 + { 195 + while (n >= PAGE_SIZE) { 196 + /* 197 + * Don't want to actually discard pages here because 198 + * re-allocating the pages can result in writeback 199 + * deadlocks under heavy load. 200 + */ 201 + if (0) 202 + brd_free_page(brd, sector); 203 + else 204 + brd_zero_page(brd, sector); 205 + sector += PAGE_SIZE >> SECTOR_SHIFT; 206 + n -= PAGE_SIZE; 207 + } 212 208 } 213 209 214 210 /* ··· 340 300 get_capacity(bdev->bd_disk)) 341 301 goto out; 342 302 303 + if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { 304 + err = 0; 305 + discard_from_brd(brd, sector, bio->bi_size); 306 + goto out; 307 + } 308 + 343 309 rw = bio_rw(bio); 344 310 if (rw == READA) 345 311 rw = READ; ··· 366 320 } 367 321 368 322 #ifdef CONFIG_BLK_DEV_XIP 369 - static int brd_direct_access (struct block_device *bdev, sector_t sector, 323 + static int brd_direct_access(struct block_device *bdev, sector_t sector, 370 324 void **kaddr, unsigned long *pfn) 371 325 { 372 326 struct brd_device *brd = bdev->bd_disk->private_data; ··· 482 436 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG, NULL); 483 437 blk_queue_max_hw_sectors(brd->brd_queue, 1024); 484 438 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 439 + 440 + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; 441 + brd->brd_queue->limits.max_discard_sectors = UINT_MAX; 442 + brd->brd_queue->limits.discard_zeroes_data = 1; 443 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); 485 444 486 445 disk = brd->brd_disk = alloc_disk(1 << part_shift); 487 446 if (!disk)
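
discard_from_brd() above walks the discard request in whole pages and, because actually freeing pages can deadlock writeback under heavy load, zeroes them instead of dropping them. A rough userspace analogue of that page-by-page walk over a flat buffer (PAGE_SIZE and the buffer are stand-ins, not the driver's data structures):

    #include <stddef.h>
    #include <string.h>

    #define PAGE_SIZE 4096u

    /* Zero the range in PAGE_SIZE chunks; anything smaller than a full page
     * at the tail is left untouched, as in the brd discard path. */
    static void discard_range(unsigned char *store, size_t offset, size_t n)
    {
        while (n >= PAGE_SIZE) {
            memset(store + offset, 0, PAGE_SIZE);   /* zero instead of freeing */
            offset += PAGE_SIZE;
            n -= PAGE_SIZE;
        }
    }
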
+1 -1
drivers/block/cciss_scsi.c
··· 188 188 189 189 sa = h->scsi_ctlr; 190 190 stk = &sa->cmd_stack; 191 + stk->top++; 191 192 if (stk->top >= CMD_STACK_SIZE) { 192 193 printk("cciss: scsi_cmd_free called too many times.\n"); 193 194 BUG(); 194 195 } 195 - stk->top++; 196 196 stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd; 197 197 } 198 198
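
The cciss_scsi.c hunk is the "call BUG() earlier" fix: the stack index is now advanced before the bounds check, so an overflow trips BUG() before stk->elem[stk->top] is written one slot past the end of the array. The same ordering in a plain-C push helper (types and sizes here are illustrative):

    #include <assert.h>

    #define STACK_SIZE 16

    struct cmd_stack {
        void *elem[STACK_SIZE];
        int   top;              /* index of last pushed element, -1 when empty */
    };

    static void cmd_push(struct cmd_stack *stk, void *cmd)
    {
        stk->top++;
        assert(stk->top < STACK_SIZE);   /* check the new index before using it */
        stk->elem[stk->top] = cmd;
    }
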
+3 -11
drivers/block/drbd/drbd_int.h
··· 943 943 struct drbd_work resync_work, 944 944 unplug_work, 945 945 md_sync_work, 946 - delay_probe_work, 947 - uuid_work; 946 + delay_probe_work; 948 947 struct timer_list resync_timer; 949 948 struct timer_list md_sync_timer; 950 949 struct timer_list delay_probe_timer; ··· 1068 1069 struct timeval dps_time; /* delay-probes-start-time */ 1069 1070 unsigned int dp_volume_last; /* send_cnt of last delay probe */ 1070 1071 int c_sync_rate; /* current resync rate after delay_probe magic */ 1071 - atomic_t new_c_uuid; 1072 1072 }; 1073 1073 1074 1074 static inline struct drbd_conf *minor_to_mdev(unsigned int minor) ··· 1474 1476 extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); 1475 1477 extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); 1476 1478 extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); 1477 - extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); 1478 1479 extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); 1479 1480 extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); 1480 1481 extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); ··· 1539 1542 1540 1543 static inline void drbd_tcp_quickack(struct socket *sock) 1541 1544 { 1542 - int __user val = 1; 1545 + int __user val = 2; 1543 1546 (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, 1544 1547 (char __user *)&val, sizeof(val)); 1545 1548 } ··· 1725 1728 switch (mdev->ldev->dc.on_io_error) { 1726 1729 case EP_PASS_ON: 1727 1730 if (!forcedetach) { 1728 - if (printk_ratelimit()) 1731 + if (__ratelimit(&drbd_ratelimit_state)) 1729 1732 dev_err(DEV, "Local IO failed in %s." 1730 1733 "Passing error on...\n", where); 1731 1734 break; ··· 2216 2219 return 0; 2217 2220 if (test_bit(BITMAP_IO, &mdev->flags)) 2218 2221 return 0; 2219 - if (atomic_read(&mdev->new_c_uuid)) 2220 - return 0; 2221 2222 return 1; 2222 2223 } 2223 2224 ··· 2235 2240 * 2236 2241 * to avoid races with the reconnect code, 2237 2242 * we need to atomic_inc within the spinlock. */ 2238 - 2239 - if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1)) 2240 - drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work); 2241 2243 2242 2244 spin_lock_irq(&mdev->req_lock); 2243 2245 while (!__inc_ap_bio_cond(mdev)) {
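
drbd_tcp_quickack() above is the in-kernel counterpart of setting the Linux-specific TCP_QUICKACK option to cut ACK latency on the data socket. For reference, roughly the same request from userspace looks like the sketch below (error handling trimmed; TCP_QUICKACK is advisory and the kernel may fall back to delayed ACKs, so latency-sensitive code re-arms it around the reads that matter):

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* Ask for immediate ACKs on 'fd', an already-connected TCP socket. */
    static int enable_quickack(int fd)
    {
        int val = 1;

        return setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &val, sizeof(val));
    }
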
+26 -42
drivers/block/drbd/drbd_main.c
··· 1215 1215 ns.pdsk == D_OUTDATED)) { 1216 1216 if (get_ldev(mdev)) { 1217 1217 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && 1218 - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && 1219 - !atomic_read(&mdev->new_c_uuid)) 1220 - atomic_set(&mdev->new_c_uuid, 2); 1218 + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { 1219 + drbd_uuid_new_current(mdev); 1220 + drbd_send_uuids(mdev); 1221 + } 1221 1222 put_ldev(mdev); 1222 1223 } 1223 1224 } 1224 1225 1225 1226 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { 1226 - /* Diskless peer becomes primary or got connected do diskless, primary peer. */ 1227 - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 && 1228 - !atomic_read(&mdev->new_c_uuid)) 1229 - atomic_set(&mdev->new_c_uuid, 2); 1227 + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) 1228 + drbd_uuid_new_current(mdev); 1230 1229 1231 1230 /* D_DISKLESS Peer becomes secondary */ 1232 1231 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) ··· 1349 1350 drbd_md_sync(mdev); 1350 1351 } 1351 1352 1352 - static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 1353 - { 1354 - if (get_ldev(mdev)) { 1355 - if (mdev->ldev->md.uuid[UI_BITMAP] == 0) { 1356 - drbd_uuid_new_current(mdev); 1357 - if (get_net_conf(mdev)) { 1358 - drbd_send_uuids(mdev); 1359 - put_net_conf(mdev); 1360 - } 1361 - drbd_md_sync(mdev); 1362 - } 1363 - put_ldev(mdev); 1364 - } 1365 - atomic_dec(&mdev->new_c_uuid); 1366 - wake_up(&mdev->misc_wait); 1367 - 1368 - return 1; 1369 - } 1370 1353 1371 1354 static int drbd_thread_setup(void *arg) 1372 1355 { ··· 2272 2291 * with page_count == 0 or PageSlab. 2273 2292 */ 2274 2293 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, 2275 - int offset, size_t size) 2294 + int offset, size_t size, unsigned msg_flags) 2276 2295 { 2277 - int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); 2296 + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); 2278 2297 kunmap(page); 2279 2298 if (sent == size) 2280 2299 mdev->send_cnt += size>>9; ··· 2282 2301 } 2283 2302 2284 2303 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, 2285 - int offset, size_t size) 2304 + int offset, size_t size, unsigned msg_flags) 2286 2305 { 2287 2306 mm_segment_t oldfs = get_fs(); 2288 2307 int sent, ok; ··· 2295 2314 * __page_cache_release a page that would actually still be referenced 2296 2315 * by someone, leading to some obscure delayed Oops somewhere else. */ 2297 2316 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) 2298 - return _drbd_no_send_page(mdev, page, offset, size); 2317 + return _drbd_no_send_page(mdev, page, offset, size, msg_flags); 2299 2318 2319 + msg_flags |= MSG_NOSIGNAL; 2300 2320 drbd_update_congested(mdev); 2301 2321 set_fs(KERNEL_DS); 2302 2322 do { 2303 2323 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, 2304 2324 offset, len, 2305 - MSG_NOSIGNAL); 2325 + msg_flags); 2306 2326 if (sent == -EAGAIN) { 2307 2327 if (we_should_drop_the_connection(mdev, 2308 2328 mdev->data.socket)) ··· 2332 2350 { 2333 2351 struct bio_vec *bvec; 2334 2352 int i; 2353 + /* hint all but last page with MSG_MORE */ 2335 2354 __bio_for_each_segment(bvec, bio, i, 0) { 2336 2355 if (!_drbd_no_send_page(mdev, bvec->bv_page, 2337 - bvec->bv_offset, bvec->bv_len)) 2356 + bvec->bv_offset, bvec->bv_len, 2357 + i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) 2338 2358 return 0; 2339 2359 } 2340 2360 return 1; ··· 2346 2362 { 2347 2363 struct bio_vec *bvec; 2348 2364 int i; 2365 + /* hint all but last page with MSG_MORE */ 2349 2366 __bio_for_each_segment(bvec, bio, i, 0) { 2350 2367 if (!_drbd_send_page(mdev, bvec->bv_page, 2351 - bvec->bv_offset, bvec->bv_len)) 2368 + bvec->bv_offset, bvec->bv_len, 2369 + i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) 2352 2370 return 0; 2353 2371 } 2354 - 2355 2372 return 1; 2356 2373 } 2357 2374 ··· 2360 2375 { 2361 2376 struct page *page = e->pages; 2362 2377 unsigned len = e->size; 2378 + /* hint all but last page with MSG_MORE */ 2363 2379 page_chain_for_each(page) { 2364 2380 unsigned l = min_t(unsigned, len, PAGE_SIZE); 2365 - if (!_drbd_send_page(mdev, page, 0, l)) 2381 + if (!_drbd_send_page(mdev, page, 0, l, 2382 + page_chain_next(page) ? MSG_MORE : 0)) 2366 2383 return 0; 2367 2384 len -= l; 2368 2385 } ··· 2444 2457 p.dp_flags = cpu_to_be32(dp_flags); 2445 2458 set_bit(UNPLUG_REMOTE, &mdev->flags); 2446 2459 ok = (sizeof(p) == 2447 - drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); 2460 + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); 2448 2461 if (ok && dgs) { 2449 2462 dgb = mdev->int_dig_out; 2450 2463 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); 2451 - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2464 + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2452 2465 } 2453 2466 if (ok) { 2454 2467 if (mdev->net_conf->wire_protocol == DRBD_PROT_A) ··· 2497 2510 return 0; 2498 2511 2499 2512 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, 2500 - sizeof(p), MSG_MORE); 2513 + sizeof(p), dgs ? MSG_MORE : 0); 2501 2514 if (ok && dgs) { 2502 2515 dgb = mdev->int_dig_out; 2503 2516 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); 2504 - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); 2517 + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); 2505 2518 } 2506 2519 if (ok) 2507 2520 ok = _drbd_send_zc_ee(mdev, e); ··· 2695 2708 atomic_set(&mdev->net_cnt, 0); 2696 2709 atomic_set(&mdev->packet_seq, 0); 2697 2710 atomic_set(&mdev->pp_in_use, 0); 2698 - atomic_set(&mdev->new_c_uuid, 0); 2699 2711 2700 2712 mutex_init(&mdev->md_io_mutex); 2701 2713 mutex_init(&mdev->data.mutex); ··· 2725 2739 INIT_LIST_HEAD(&mdev->bm_io_work.w.list); 2726 2740 INIT_LIST_HEAD(&mdev->delay_probes); 2727 2741 INIT_LIST_HEAD(&mdev->delay_probe_work.list); 2728 - INIT_LIST_HEAD(&mdev->uuid_work.list); 2729 2742 2730 2743 mdev->resync_work.cb = w_resync_inactive; 2731 2744 mdev->unplug_work.cb = w_send_write_hint; 2732 2745 mdev->md_sync_work.cb = w_md_sync; 2733 2746 mdev->bm_io_work.w.cb = w_bitmap_io; 2734 2747 mdev->delay_probe_work.cb = w_delay_probes; 2735 - mdev->uuid_work.cb = w_new_current_uuid; 2736 2748 init_timer(&mdev->resync_timer); 2737 2749 init_timer(&mdev->md_sync_timer); 2738 2750 init_timer(&mdev->delay_probe_timer); ··· 3783 3799 if (ret) { 3784 3800 fault_count++; 3785 3801 3786 - if (printk_ratelimit()) 3802 + if (__ratelimit(&drbd_ratelimit_state)) 3787 3803 dev_warn(DEV, "***Simulating %s failure\n", 3788 3804 _drbd_fault_str(type)); 3789 3805 }
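
The MSG_MORE changes follow one rule: every chunk of a logical message except the last is sent with MSG_MORE, so the network stack can coalesce them rather than pushing a small segment per call. A userspace sketch of that pattern over a connected socket (the chunk struct is invented for the example, and partial sends are not retried to keep it short):

    #include <sys/socket.h>
    #include <sys/types.h>

    struct chunk {
        const void *buf;
        size_t      len;
    };

    /* Send n chunks as one logical message, flagging all but the last
     * with MSG_MORE so the kernel may coalesce them into fewer segments. */
    static ssize_t send_chunks(int fd, const struct chunk *c, size_t n)
    {
        ssize_t total = 0;

        for (size_t i = 0; i < n; i++) {
            int flags = (i == n - 1) ? 0 : MSG_MORE;
            ssize_t sent = send(fd, c[i].buf, c[i].len, flags);

            if (sent < 0)
                return -1;
            total += sent;
        }
        return total;
    }
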
+23 -22
drivers/block/drbd/drbd_receiver.c
··· 42 42 #include <linux/unistd.h> 43 43 #include <linux/vmalloc.h> 44 44 #include <linux/random.h> 45 - #include <linux/mm.h> 46 45 #include <linux/string.h> 47 46 #include <linux/scatterlist.h> 48 47 #include "drbd_int.h" ··· 570 571 return rv; 571 572 } 572 573 574 + /* quoting tcp(7): 575 + * On individual connections, the socket buffer size must be set prior to the 576 + * listen(2) or connect(2) calls in order to have it take effect. 577 + * This is our wrapper to do so. 578 + */ 579 + static void drbd_setbufsize(struct socket *sock, unsigned int snd, 580 + unsigned int rcv) 581 + { 582 + /* open coded SO_SNDBUF, SO_RCVBUF */ 583 + if (snd) { 584 + sock->sk->sk_sndbuf = snd; 585 + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 586 + } 587 + if (rcv) { 588 + sock->sk->sk_rcvbuf = rcv; 589 + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 590 + } 591 + } 592 + 573 593 static struct socket *drbd_try_connect(struct drbd_conf *mdev) 574 594 { 575 595 const char *what; ··· 610 592 611 593 sock->sk->sk_rcvtimeo = 612 594 sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; 595 + drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, 596 + mdev->net_conf->rcvbuf_size); 613 597 614 598 /* explicitly bind to the configured IP as source IP 615 599 * for the outgoing connections. ··· 690 670 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ 691 671 s_listen->sk->sk_rcvtimeo = timeo; 692 672 s_listen->sk->sk_sndtimeo = timeo; 673 + drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, 674 + mdev->net_conf->rcvbuf_size); 693 675 694 676 what = "bind before listen"; 695 677 err = s_listen->ops->bind(s_listen, ··· 877 855 878 856 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; 879 857 msock->sk->sk_priority = TC_PRIO_INTERACTIVE; 880 - 881 - if (mdev->net_conf->sndbuf_size) { 882 - sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; 883 - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 884 - } 885 - 886 - if (mdev->net_conf->rcvbuf_size) { 887 - sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; 888 - sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 889 - } 890 858 891 859 /* NOT YET ... 892 860 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; ··· 1165 1153 unsigned ds = e->size; 1166 1154 unsigned n_bios = 0; 1167 1155 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; 1168 - 1169 - if (atomic_read(&mdev->new_c_uuid)) { 1170 - if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) { 1171 - drbd_uuid_new_current(mdev); 1172 - drbd_md_sync(mdev); 1173 - 1174 - atomic_dec(&mdev->new_c_uuid); 1175 - wake_up(&mdev->misc_wait); 1176 - } 1177 - wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid)); 1178 - } 1179 1156 1180 1157 /* In most cases, we will only need one bio. But in case the lower 1181 1158 * level restrictions happen to be different at this offset on this
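
drbd_setbufsize() exists because, as tcp(7) notes, SO_SNDBUF/SO_RCVBUF only affect a TCP connection if they are set before connect() or listen(); the patch therefore applies them right after socket creation on both the connecting and the listening side. The userspace shape of the same rule, as a hedged sketch with error handling omitted:

    #include <sys/socket.h>

    /* Apply buffer sizes before connect()/listen(), or they will not take
     * effect for that connection (see tcp(7)). A value of 0 means "leave
     * the kernel default alone". */
    static void set_bufsizes(int fd, int sndbuf, int rcvbuf)
    {
        if (sndbuf)
            setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
        if (rcvbuf)
            setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
    }
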
+14 -40
drivers/block/drbd/drbd_req.c
··· 102 102 } 103 103 } 104 104 105 - /* if it was a local io error, we want to notify our 106 - * peer about that, and see if we need to 107 - * detach the disk and stuff. 108 - * to avoid allocating some special work 109 - * struct, reuse the request. */ 110 - 111 - /* THINK 112 - * why do we do this not when we detect the error, 113 - * but delay it until it is "done", i.e. possibly 114 - * until the next barrier ack? */ 115 - 116 - if (rw == WRITE && 117 - ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { 118 - if (!(req->w.list.next == LIST_POISON1 || 119 - list_empty(&req->w.list))) { 120 - /* DEBUG ASSERT only; if this triggers, we 121 - * probably corrupt the worker list here */ 122 - dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); 123 - dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); 124 - } 125 - req->w.cb = w_io_error; 126 - drbd_queue_work(&mdev->data.work, &req->w); 127 - /* drbd_req_free() is done in w_io_error */ 128 - } else { 129 - drbd_req_free(req); 130 - } 105 + drbd_req_free(req); 131 106 } 132 107 133 108 static void queue_barrier(struct drbd_conf *mdev) ··· 428 453 req->rq_state |= RQ_LOCAL_COMPLETED; 429 454 req->rq_state &= ~RQ_LOCAL_PENDING; 430 455 431 - dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", 432 - (unsigned long long)req->sector, req->size); 433 - /* and now: check how to handle local io error. */ 434 456 __drbd_chk_io_error(mdev, FALSE); 435 457 _req_may_be_done(req, m); 436 458 put_ldev(mdev); ··· 447 475 req->rq_state |= RQ_LOCAL_COMPLETED; 448 476 req->rq_state &= ~RQ_LOCAL_PENDING; 449 477 450 - dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", 451 - (unsigned long long)req->sector, req->size); 452 - /* _req_mod(req,to_be_send); oops, recursion... */ 453 478 D_ASSERT(!(req->rq_state & RQ_NET_MASK)); 454 - req->rq_state |= RQ_NET_PENDING; 455 - inc_ap_pending(mdev); 456 479 457 480 __drbd_chk_io_error(mdev, FALSE); 458 481 put_ldev(mdev); 459 - /* NOTE: if we have no connection, 460 - * or know the peer has no good data either, 461 - * then we don't actually need to "queue_for_net_read", 462 - * but we do so anyways, since the drbd_io_error() 463 - * and the potential state change to "Diskless" 464 - * needs to be done from process context */ 465 482 483 + /* no point in retrying if there is no good remote data, 484 + * or we have no connection. */ 485 + if (mdev->state.pdsk != D_UP_TO_DATE) { 486 + _req_may_be_done(req, m); 487 + break; 488 + } 489 + 490 + /* _req_mod(req,to_be_send); oops, recursion... */ 491 + req->rq_state |= RQ_NET_PENDING; 492 + inc_ap_pending(mdev); 466 493 /* fall through: _req_mod(req,queue_for_net_read); */ 467 494 468 495 case queue_for_net_read: ··· 571 600 _req_may_be_done(req, m); 572 601 break; 573 602 603 + case read_retry_remote_canceled: 604 + req->rq_state &= ~RQ_NET_QUEUED; 605 + /* fall through, in case we raced with drbd_disconnect */ 574 606 case connection_lost_while_pending: 575 607 /* transfer log cleanup after connection loss */ 576 608 /* assert something? */
+1
drivers/block/drbd/drbd_req.h
··· 91 91 send_failed, 92 92 handed_over_to_network, 93 93 connection_lost_while_pending, 94 + read_retry_remote_canceled, 94 95 recv_acked_by_peer, 95 96 write_acked_by_peer, 96 97 write_acked_by_peer_and_sis, /* and set_in_sync */
+2 -22
drivers/block/drbd/drbd_worker.c
··· 224 224 enum drbd_req_event what; 225 225 int uptodate = bio_flagged(bio, BIO_UPTODATE); 226 226 227 - if (error) 228 - dev_warn(DEV, "p %s: error=%d\n", 229 - bio_data_dir(bio) == WRITE ? "write" : "read", error); 230 227 if (!error && !uptodate) { 231 228 dev_warn(DEV, "p %s: setting error to -EIO\n", 232 229 bio_data_dir(bio) == WRITE ? "write" : "read"); ··· 254 257 complete_master_bio(mdev, &m); 255 258 } 256 259 257 - int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 258 - { 259 - struct drbd_request *req = container_of(w, struct drbd_request, w); 260 - 261 - /* NOTE: mdev->ldev can be NULL by the time we get here! */ 262 - /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ 263 - 264 - /* the only way this callback is scheduled is from _req_may_be_done, 265 - * when it is done and had a local write error, see comments there */ 266 - drbd_req_free(req); 267 - 268 - return TRUE; 269 - } 270 - 271 260 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) 272 261 { 273 262 struct drbd_request *req = container_of(w, struct drbd_request, w); ··· 263 280 * to give the disk the chance to relocate that block */ 264 281 265 282 spin_lock_irq(&mdev->req_lock); 266 - if (cancel || 267 - mdev->state.conn < C_CONNECTED || 268 - mdev->state.pdsk <= D_INCONSISTENT) { 269 - _req_mod(req, send_canceled); 283 + if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { 284 + _req_mod(req, read_retry_remote_canceled); 270 285 spin_unlock_irq(&mdev->req_lock); 271 - dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); 272 286 return 1; 273 287 } 274 288 spin_unlock_irq(&mdev->req_lock);
+16 -48
fs/fs-writeback.c
··· 45 45 unsigned int for_kupdate:1; 46 46 unsigned int range_cyclic:1; 47 47 unsigned int for_background:1; 48 - unsigned int sb_pinned:1; 49 48 }; 50 49 51 50 /* ··· 192 193 } 193 194 194 195 static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 195 - struct wb_writeback_args *args, 196 - int wait) 196 + struct wb_writeback_args *args) 197 197 { 198 198 struct bdi_work *work; 199 199 ··· 204 206 if (work) { 205 207 bdi_work_init(work, args); 206 208 bdi_queue_work(bdi, work); 207 - if (wait) 208 - bdi_wait_on_work_clear(work); 209 209 } else { 210 210 struct bdi_writeback *wb = &bdi->wb; 211 211 ··· 230 234 .sync_mode = WB_SYNC_ALL, 231 235 .nr_pages = LONG_MAX, 232 236 .range_cyclic = 0, 233 - /* 234 - * Setting sb_pinned is not necessary for WB_SYNC_ALL, but 235 - * lets make it explicitly clear. 236 - */ 237 - .sb_pinned = 1, 238 237 }; 239 238 struct bdi_work work; 240 239 ··· 245 254 * @bdi: the backing device to write from 246 255 * @sb: write inodes from this super_block 247 256 * @nr_pages: the number of pages to write 248 - * @sb_locked: caller already holds sb umount sem. 249 257 * 250 258 * Description: 251 259 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 252 260 * started when this function returns, we make no guarentees on 253 - * completion. Caller specifies whether sb umount sem is held already or not. 261 + * completion. Caller need not hold sb s_umount semaphore. 254 262 * 255 263 */ 256 264 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 257 - long nr_pages, int sb_locked) 265 + long nr_pages) 258 266 { 259 267 struct wb_writeback_args args = { 260 268 .sb = sb, 261 269 .sync_mode = WB_SYNC_NONE, 262 270 .nr_pages = nr_pages, 263 271 .range_cyclic = 1, 264 - .sb_pinned = sb_locked, 265 272 }; 266 273 267 274 /* ··· 271 282 args.for_background = 1; 272 283 } 273 284 274 - bdi_alloc_queue_work(bdi, &args, sb_locked); 285 + bdi_alloc_queue_work(bdi, &args); 275 286 } 276 287 277 288 /* ··· 584 595 /* 585 596 * Caller must already hold the ref for this 586 597 */ 587 - if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { 598 + if (wbc->sync_mode == WB_SYNC_ALL) { 588 599 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 589 600 return SB_NOT_PINNED; 590 601 } ··· 758 769 .for_kupdate = args->for_kupdate, 759 770 .for_background = args->for_background, 760 771 .range_cyclic = args->range_cyclic, 761 - .sb_pinned = args->sb_pinned, 762 772 }; 763 773 unsigned long oldest_jif; 764 774 long wrote = 0; ··· 900 912 901 913 while ((work = get_next_work_item(bdi, wb)) != NULL) { 902 914 struct wb_writeback_args args = work->args; 903 - int post_clear; 904 915 905 916 /* 906 917 * Override sync mode, in case we must wait for completion ··· 907 920 if (force_wait) 908 921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 909 922 910 - post_clear = WB_SYNC_ALL || args.sb_pinned; 911 - 912 923 /* 913 924 * If this isn't a data integrity operation, just notify 914 925 * that we have seen this work and we are now starting it. 915 926 */ 916 - if (!post_clear) 927 + if (args.sync_mode == WB_SYNC_NONE) 917 928 wb_clear_pending(wb, work); 918 929 919 930 wrote += wb_writeback(wb, &args); ··· 920 935 * This is a data integrity writeback, so only do the 921 936 * notification when we have completed the work. 922 937 */ 923 - if (post_clear) 938 + if (args.sync_mode == WB_SYNC_ALL) 924 939 wb_clear_pending(wb, work); 925 940 } 926 941 ··· 996 1011 if (!bdi_has_dirty_io(bdi)) 997 1012 continue; 998 1013 999 - bdi_alloc_queue_work(bdi, &args, 0); 1014 + bdi_alloc_queue_work(bdi, &args); 1000 1015 } 1001 1016 1002 1017 rcu_read_unlock(); ··· 1205 1220 iput(old_inode); 1206 1221 } 1207 1222 1208 - static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) 1209 - { 1210 - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1211 - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1212 - long nr_to_write; 1213 - 1214 - nr_to_write = nr_dirty + nr_unstable + 1215 - (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1216 - 1217 - bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); 1218 - } 1219 - 1220 1223 /** 1221 1224 * writeback_inodes_sb - writeback dirty inodes from given super_block 1222 1225 * @sb: the superblock ··· 1216 1243 */ 1217 1244 void writeback_inodes_sb(struct super_block *sb) 1218 1245 { 1219 - __writeback_inodes_sb(sb, 0); 1246 + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1247 + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1248 + long nr_to_write; 1249 + 1250 + nr_to_write = nr_dirty + nr_unstable + 1251 + (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1252 + 1253 + bdi_start_writeback(sb->s_bdi, sb, nr_to_write); 1220 1254 } 1221 1255 EXPORT_SYMBOL(writeback_inodes_sb); 1222 - 1223 - /** 1224 - * writeback_inodes_sb_locked - writeback dirty inodes from given super_block 1225 - * @sb: the superblock 1226 - * 1227 - * Like writeback_inodes_sb(), except the caller already holds the 1228 - * sb umount sem. 1229 - */ 1230 - void writeback_inodes_sb_locked(struct super_block *sb) 1231 - { 1232 - __writeback_inodes_sb(sb, 1); 1233 - } 1234 1256 1235 1257 /** 1236 1258 * writeback_inodes_sb_if_idle - start writeback if none underway
+54 -25
fs/pipe.c
··· 26 26 27 27 /* 28 28 * The max size that a non-root user is allowed to grow the pipe. Can 29 - * be set by root in /proc/sys/fs/pipe-max-pages 29 + * be set by root in /proc/sys/fs/pipe-max-size 30 30 */ 31 - unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; 31 + unsigned int pipe_max_size = 1048576; 32 + 33 + /* 34 + * Minimum pipe size, as required by POSIX 35 + */ 36 + unsigned int pipe_min_size = PAGE_SIZE; 32 37 33 38 /* 34 39 * We use a start+len construction, which provides full use of the ··· 1123 1118 * Allocate a new array of pipe buffers and copy the info over. Returns the 1124 1119 * pipe size if successful, or return -ERROR on error. 1125 1120 */ 1126 - static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) 1121 + static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) 1127 1122 { 1128 1123 struct pipe_buffer *bufs; 1129 - 1130 - /* 1131 - * Must be a power-of-2 currently 1132 - */ 1133 - if (!is_power_of_2(arg)) 1134 - return -EINVAL; 1135 1124 1136 1125 /* 1137 1126 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't ··· 1133 1134 * again like we would do for growing. If the pipe currently 1134 1135 * contains more buffers than arg, then return busy. 1135 1136 */ 1136 - if (arg < pipe->nrbufs) 1137 + if (nr_pages < pipe->nrbufs) 1137 1138 return -EBUSY; 1138 1139 1139 - bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); 1140 + bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); 1140 1141 if (unlikely(!bufs)) 1141 1142 return -ENOMEM; 1142 1143 ··· 1157 1158 pipe->curbuf = 0; 1158 1159 kfree(pipe->bufs); 1159 1160 pipe->bufs = bufs; 1160 - pipe->buffers = arg; 1161 - return arg; 1161 + pipe->buffers = nr_pages; 1162 + return nr_pages * PAGE_SIZE; 1163 + } 1164 + 1165 + /* 1166 + * Currently we rely on the pipe array holding a power-of-2 number 1167 + * of pages. 1168 + */ 1169 + static inline unsigned int round_pipe_size(unsigned int size) 1170 + { 1171 + unsigned long nr_pages; 1172 + 1173 + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 1174 + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; 1175 + } 1176 + 1177 + /* 1178 + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax 1179 + * will return an error. 1180 + */ 1181 + int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, 1182 + size_t *lenp, loff_t *ppos) 1183 + { 1184 + int ret; 1185 + 1186 + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); 1187 + if (ret < 0 || !write) 1188 + return ret; 1189 + 1190 + pipe_max_size = round_pipe_size(pipe_max_size); 1191 + return ret; 1162 1192 } 1163 1193 1164 1194 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) ··· 1202 1174 mutex_lock(&pipe->inode->i_mutex); 1203 1175 1204 1176 switch (cmd) { 1205 - case F_SETPIPE_SZ: 1206 - if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) { 1177 + case F_SETPIPE_SZ: { 1178 + unsigned int size, nr_pages; 1179 + 1180 + size = round_pipe_size(arg); 1181 + nr_pages = size >> PAGE_SHIFT; 1182 + 1183 + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { 1184 + ret = -EPERM; 1185 + goto out; 1186 + } else if (nr_pages < PAGE_SIZE) { 1207 1187 ret = -EINVAL; 1208 1188 goto out; 1209 1189 } 1210 - /* 1211 - * The pipe needs to be at least 2 pages large to 1212 - * guarantee POSIX behaviour. 1213 - */ 1214 - if (arg < 2) { 1215 - ret = -EINVAL; 1216 - goto out; 1217 - } 1218 - ret = pipe_set_size(pipe, arg); 1190 + ret = pipe_set_size(pipe, nr_pages); 1219 1191 break; 1192 + } 1220 1193 case F_GETPIPE_SZ: 1221 - ret = pipe->buffers; 1194 + ret = pipe->buffers * PAGE_SIZE; 1222 1195 break; 1223 1196 default: 1224 1197 ret = -EINVAL;
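
With this series the F_SETPIPE_SZ/F_GETPIPE_SZ interface becomes byte-based: the requested size is rounded up to a power-of-two number of pages and the resulting capacity in bytes is returned. A small usage sketch, assuming a kernel with these patches and a libc that already exposes F_SETPIPE_SZ (older glibc may need the constant from <linux/fcntl.h>); error checking is kept minimal:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fds[2];

        if (pipe(fds) < 0)
            return 1;

        /* Ask for 1 MiB; the kernel rounds up to a power-of-two number of
         * pages and returns the new capacity in bytes, or fails with EPERM
         * if an unprivileged caller exceeds /proc/sys/fs/pipe-max-size. */
        long set = fcntl(fds[1], F_SETPIPE_SZ, 1024 * 1024);
        long cur = fcntl(fds[1], F_GETPIPE_SZ);

        printf("F_SETPIPE_SZ returned %ld, F_GETPIPE_SZ reports %ld bytes\n",
               set, cur);

        close(fds[0]);
        close(fds[1]);
        return 0;
    }
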
+1 -1
fs/splice.c
··· 354 354 break; 355 355 356 356 error = add_to_page_cache_lru(page, mapping, index, 357 - mapping_gfp_mask(mapping)); 357 + GFP_KERNEL); 358 358 if (unlikely(error)) { 359 359 page_cache_release(page); 360 360 if (error == -EEXIST)
+1 -1
fs/sync.c
··· 42 42 if (wait) 43 43 sync_inodes_sb(sb); 44 44 else 45 - writeback_inodes_sb_locked(sb); 45 + writeback_inodes_sb(sb); 46 46 47 47 if (sb->s_op->sync_fs) 48 48 sb->s_op->sync_fs(sb, wait);
+1 -1
include/linux/backing-dev.h
··· 106 106 void bdi_unregister(struct backing_dev_info *bdi); 107 107 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); 108 108 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 109 - long nr_pages, int sb_locked); 109 + long nr_pages); 110 110 int bdi_writeback_task(struct bdi_writeback *wb); 111 111 int bdi_has_dirty_io(struct backing_dev_info *bdi); 112 112 void bdi_arm_supers_timer(void);
+9
include/linux/blkdev.h
··· 1211 1211 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1212 1212 1213 1213 #ifdef CONFIG_BLK_CGROUP 1214 + /* 1215 + * This should not be using sched_clock(). A real patch is in progress 1216 + * to fix this up, until that is in place we need to disable preemption 1217 + * around sched_clock() in this function and set_io_start_time_ns(). 1218 + */ 1214 1219 static inline void set_start_time_ns(struct request *req) 1215 1220 { 1221 + preempt_disable(); 1216 1222 req->start_time_ns = sched_clock(); 1223 + preempt_enable(); 1217 1224 } 1218 1225 1219 1226 static inline void set_io_start_time_ns(struct request *req) 1220 1227 { 1228 + preempt_disable(); 1221 1229 req->io_start_time_ns = sched_clock(); 1230 + preempt_enable(); 1222 1231 } 1223 1232 1224 1233 static inline uint64_t rq_start_time_ns(struct request *req)
+1 -1
include/linux/drbd.h
··· 53 53 54 54 55 55 extern const char *drbd_buildtag(void); 56 - #define REL_VERSION "8.3.8rc1" 56 + #define REL_VERSION "8.3.8rc2" 57 57 #define API_VERSION 88 58 58 #define PRO_VERSION_MIN 86 59 59 #define PRO_VERSION_MAX 94
-1
include/linux/iocontext.h
··· 7 7 struct cfq_queue; 8 8 struct cfq_io_context { 9 9 void *key; 10 - unsigned long dead_key; 11 10 12 11 struct cfq_queue *cfqq[2]; 13 12
+3 -1
include/linux/pipe_fs_i.h
··· 139 139 void pipe_unlock(struct pipe_inode_info *); 140 140 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); 141 141 142 - extern unsigned int pipe_max_pages; 142 + extern unsigned int pipe_max_size, pipe_min_size; 143 + int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); 144 + 143 145 144 146 /* Drop the inode semaphore and wait for a pipe event, atomically */ 145 147 void pipe_wait(struct pipe_inode_info *pipe);
-10
include/linux/writeback.h
··· 65 65 * so we use a single control to update them 66 66 */ 67 67 unsigned no_nrwrite_index_update:1; 68 - 69 - /* 70 - * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE, 71 - * the writeback code will pin the sb for the caller. However, 72 - * for eg umount, the caller does WB_SYNC_NONE but already has 73 - * the sb pinned. If the below is set, caller already has the 74 - * sb pinned. 75 - */ 76 - unsigned sb_pinned:1; 77 68 }; 78 69 79 70 /* ··· 73 82 struct bdi_writeback; 74 83 int inode_wait(void *); 75 84 void writeback_inodes_sb(struct super_block *); 76 - void writeback_inodes_sb_locked(struct super_block *); 77 85 int writeback_inodes_sb_if_idle(struct super_block *); 78 86 void sync_inodes_sb(struct super_block *); 79 87 void writeback_inodes_wbc(struct writeback_control *wbc);
+4 -4
kernel/sysctl.c
··· 1471 1471 }, 1472 1472 #endif 1473 1473 { 1474 - .procname = "pipe-max-pages", 1475 - .data = &pipe_max_pages, 1474 + .procname = "pipe-max-size", 1475 + .data = &pipe_max_size, 1476 1476 .maxlen = sizeof(int), 1477 1477 .mode = 0644, 1478 - .proc_handler = &proc_dointvec_minmax, 1479 - .extra1 = &two, 1478 + .proc_handler = &pipe_proc_fn, 1479 + .extra1 = &pipe_min_size, 1480 1480 }, 1481 1481 /* 1482 1482 * NOTE: do not add new entries to this table unless you have read
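
The sysctl itself moves from pipe-max-pages to a byte-valued pipe-max-size, clamped below by pipe_min_size and rounded through pipe_proc_fn() on writes. Reading the current limit from userspace is just a matter of parsing the proc file, as in this sketch (writing it requires root):

    #include <stdio.h>

    int main(void)
    {
        unsigned int max_bytes;
        FILE *f = fopen("/proc/sys/fs/pipe-max-size", "r");

        if (!f || fscanf(f, "%u", &max_bytes) != 1)
            return 1;

        printf("pipe-max-size = %u bytes\n", max_bytes);
        fclose(f);
        return 0;
    }
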
+2 -2
mm/page-writeback.c
··· 597 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 598 598 + global_page_state(NR_UNSTABLE_NFS)) 599 599 > background_thresh))) 600 - bdi_start_writeback(bdi, NULL, 0, 0); 600 + bdi_start_writeback(bdi, NULL, 0); 601 601 } 602 602 603 603 void set_page_dirty_balance(struct page *page, int page_mkwrite) ··· 707 707 */ 708 708 709 709 if (bdi_has_dirty_io(&q->backing_dev_info)) 710 - bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0); 710 + bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); 711 711 } 712 712 713 713 /*