Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

* 'for-linus' of git://git.kernel.dk/linux-2.6-block:
  cfq-iosched: fix RCU problem in cfq_cic_lookup()
  block: make blktrace use per-cpu buffers for message notes
  Added in elevator switch message to blktrace stream
  Added in MESSAGE notes for blktraces
  block: reorder cfq_queue to save space on 64bit builds
  block: Move the second call to get_request to the end of the loop
  splice: handle try_to_release_page() failure
  splice: fix sendfile() issue with relay

 7 files changed, 110 insertions(+), 33 deletions(-)
block/blk-core.c (+17 -20)
before:
···
	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
				TASK_UNINTERRUPTIBLE);

-		rq = get_request(q, rw_flags, bio, GFP_NOIO);

-		if (!rq) {
-			struct io_context *ioc;

-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

-			__generic_unplug_device(q);
-			spin_unlock_irq(q->queue_lock);
-			io_schedule();
-
-			/*
-			 * After sleeping, we become a "batching" process and
-			 * will be able to allocate at least one request, and
-			 * up to a big batch of them for a small period time.
-			 * See ioc_batching, ioc_set_batching
-			 */
-			ioc = current_io_context(GFP_NOIO, q->node);
-			ioc_set_batching(q, ioc);
-
-			spin_lock_irq(q->queue_lock);
-		}
		finish_wait(&rl->wait[rw], &wait);
-	}

	return rq;
}
after:
···
	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
+		struct io_context *ioc;
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
				TASK_UNINTERRUPTIBLE);

+		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

+		__generic_unplug_device(q);
+		spin_unlock_irq(q->queue_lock);
+		io_schedule();

+		/*
+		 * After sleeping, we become a "batching" process and
+		 * will be able to allocate at least one request, and
+		 * up to a big batch of them for a small period time.
+		 * See ioc_batching, ioc_set_batching
+		 */
+		ioc = current_io_context(GFP_NOIO, q->node);
+		ioc_set_batching(q, ioc);

+		spin_lock_irq(q->queue_lock);
		finish_wait(&rl->wait[rw], &wait);
+
+		rq = get_request(q, rw_flags, bio, GFP_NOIO);
+	};

	return rq;
}
block/blktrace.c (+23)
before:
···
	local_irq_restore(flags);
}

static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
···
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
	kfree(bt);
}

···
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts->name);
	if (!dir)
···
	if (bt->dropped_file)
		debugfs_remove(bt->dropped_file);
	free_percpu(bt->sequence);
	if (bt->rchan)
		relay_close(bt->rchan);
	kfree(bt);
after:
···
	local_irq_restore(flags);
}

+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+	int n;
+	va_list args;
+	char *buf;
+
+	preempt_disable();
+	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+	va_start(args, fmt);
+	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+	va_end(args);
+
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
···
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
+	free_percpu(bt->msg_data);
	kfree(bt);
}

···
	if (!bt->sequence)
		goto err;

+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	if (!bt->msg_data)
+		goto err;
+
	ret = -ENOENT;
	dir = blk_create_tree(buts->name);
	if (!dir)
···
	if (bt->dropped_file)
		debugfs_remove(bt->dropped_file);
	free_percpu(bt->sequence);
+	free_percpu(bt->msg_data);
	if (bt->rchan)
		relay_close(bt->rchan);
	kfree(bt);
block/cfq-iosched.c (+30 -6)
before:
···
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
···
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
-	/* pending metadata requests */
-	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	unsigned long slice_end;
	long slice_resid;

	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

···
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

-	/* various state flags, see below */
-	unsigned int flags;
};

enum cfqq_state_flags {
···
	kmem_cache_free(cfq_pool, cfqq);
}

static void
__call_for_each_cic(struct io_context *ioc,
		    void (*func)(struct io_context *, struct cfq_io_context *))
···
	cfq_cic_free(cic);
}

static void cfq_free_io_context(struct io_context *ioc)
{
	/*
···
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;
	void *k;

	if (unlikely(!ioc))
		return NULL;

	/*
	 * we maintain a last-hit cache, to avoid browsing over the tree
	 */
	cic = rcu_dereference(ioc->ioc_data);
-	if (cic && cic->key == cfqd)
		return cic;

	do {
-		rcu_read_lock();
		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
		rcu_read_unlock();
		if (!cic)
···
		k = cic->key;
		if (unlikely(!k)) {
			cfq_drop_dead_cic(cfqd, ioc, cic);
			continue;
		}

		rcu_assign_pointer(ioc->ioc_data, cic);
		break;
	} while (1);

···

static void cfq_slab_kill(void)
{
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
···
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(ioc_gone);
	cfq_slab_kill();
after:
···
struct cfq_queue {
	/* reference count */
	atomic_t ref;
+	/* various state flags, see below */
+	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
···
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	unsigned long slice_end;
	long slice_resid;

+	/* pending metadata requests */
+	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

···
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

};

enum cfqq_state_flags {
···
	kmem_cache_free(cfq_pool, cfqq);
}

+/*
+ * Must always be called with the rcu_read_lock() held
+ */
static void
__call_for_each_cic(struct io_context *ioc,
		    void (*func)(struct io_context *, struct cfq_io_context *))
···
	cfq_cic_free(cic);
}

+/*
+ * Must be called with rcu_read_lock() held or preemption otherwise disabled.
+ * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
+ * and ->trim() which is called with the task lock held
+ */
static void cfq_free_io_context(struct io_context *ioc)
{
	/*
···
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;
+	unsigned long flags;
	void *k;

	if (unlikely(!ioc))
		return NULL;

+	rcu_read_lock();
+
	/*
	 * we maintain a last-hit cache, to avoid browsing over the tree
	 */
	cic = rcu_dereference(ioc->ioc_data);
+	if (cic && cic->key == cfqd) {
+		rcu_read_unlock();
		return cic;
+	}

	do {
		cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
		rcu_read_unlock();
		if (!cic)
···
		k = cic->key;
		if (unlikely(!k)) {
			cfq_drop_dead_cic(cfqd, ioc, cic);
+			rcu_read_lock();
			continue;
		}

+		spin_lock_irqsave(&ioc->lock, flags);
		rcu_assign_pointer(ioc->ioc_data, cic);
+		spin_unlock_irqrestore(&ioc->lock, flags);
		break;
	} while (1);

···

static void cfq_slab_kill(void)
{
+	/*
+	 * Caller already ensured that pending RCU callbacks are completed,
+	 * so we should have no busy allocations at this point.
+	 */
	if (cfq_pool)
		kmem_cache_destroy(cfq_pool);
	if (cfq_ioc_pool)
···
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
+
+	/*
+	 * this also protects us from entering cfq_slab_kill() with
+	 * pending RCU callbacks
+	 */
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(ioc_gone);
	cfq_slab_kill();
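The cfq_cic_lookup() change above restores the basic RCU contract: rcu_dereference() and every use of the pointer it returns must sit inside an rcu_read_lock()/rcu_read_unlock() section, and the read lock has to be re-taken before looping back for another lookup. A minimal generic sketch of that rule follows; the shared_item pointer and struct item are hypothetical, not cfq code.

#include <linux/rcupdate.h>

struct item {
	int value;
};

static struct item *shared_item;	/* published elsewhere via rcu_assign_pointer() */

static int read_shared_value(void)
{
	struct item *p;
	int val = -1;

	rcu_read_lock();			/* object cannot be freed while we hold this */
	p = rcu_dereference(shared_item);	/* only legal inside the read-side section */
	if (p)
		val = p->value;			/* dereference only while still locked */
	rcu_read_unlock();			/* p must not be touched after this point */

	return val;
}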
block/elevator.c (+2)
before:
···
	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
	spin_unlock_irq(q->queue_lock);

	return 1;

fail_register:
after:
···
	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
	spin_unlock_irq(q->queue_lock);

+	blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
+
	return 1;

fail_register:
fs/splice.c (+11 -6)
before:
···
	 */
	wait_on_page_writeback(page);

-	if (PagePrivate(page))
-		try_to_release_page(page, GFP_KERNEL);

	/*
	 * If we succeeded in removing the mapping, set LRU flag
···
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}
···

	while (len) {
		size_t read_len;
-		loff_t pos = sd->pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
···
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
-		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;
		sd->pos = pos;

-		if (ret < read_len)
			goto out_release;
	}

done:
···

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
-		*ppos += ret;

	return ret;
}
after:
···
	 */
	wait_on_page_writeback(page);

+	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+		goto out_unlock;

	/*
	 * If we succeeded in removing the mapping, set LRU flag
···
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
+out_unlock:
	unlock_page(page);
	return 1;
}
···

	while (len) {
		size_t read_len;
+		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
···
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
+		if (unlikely(ret <= 0)) {
+			sd->pos = prev_pos;
			goto out_release;
+		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

+		if (ret < read_len) {
+			sd->pos = prev_pos + ret;
			goto out_release;
+		}
	}

done:
···

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
+		*ppos = sd.pos;

	return ret;
}
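The user-visible effect of the two splice fixes is on sendfile()'s offset accounting: after the change, the offset handed back advances by exactly the number of bytes reported as sent, even when the output side accepted less than was read into the internal pipe. A small userspace sketch, assuming a hypothetical input file in_fd and connected socket sock_fd:

#include <sys/types.h>
#include <sys/sendfile.h>
#include <assert.h>

static void send_chunk(int in_fd, int sock_fd, size_t count)
{
	off_t off = 0;
	ssize_t sent = sendfile(sock_fd, in_fd, &off, count);

	if (sent >= 0)
		assert(off == sent);	/* offset tracks only what was actually sent */
}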
include/linux/blktrace_api.h (+26)
before:
···
enum blktrace_notify {
	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
	__BLK_TN_TIMESTAMP,		/* include system clock */
};

···
#define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
#define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))

#define BLK_IO_TRACE_MAGIC	0x65617400
#define BLK_IO_TRACE_VERSION	0x07
···
	int trace_state;
	struct rchan *rchan;
	unsigned long *sequence;
	u16 act_mask;
	u64 start_lba;
	u64 end_lba;
···
extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
extern int do_blk_trace_setup(struct request_queue *q,
	char *name, dev_t dev, struct blk_user_trace_setup *buts);


/**
 * blk_add_trace_rq - Add a trace for a request oriented action
···
#define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
#define blk_trace_startstop(q, start)		(-ENOTTY)
#define blk_trace_remove(q)			(-ENOTTY)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#endif /* __KERNEL__ */
#endif
after:
···
enum blktrace_notify {
	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
	__BLK_TN_TIMESTAMP,		/* include system clock */
+	__BLK_TN_MESSAGE,		/* Character string message */
};

···
#define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
#define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
+#define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))

#define BLK_IO_TRACE_MAGIC	0x65617400
#define BLK_IO_TRACE_VERSION	0x07
···
	int trace_state;
	struct rchan *rchan;
	unsigned long *sequence;
+	unsigned char *msg_data;
	u16 act_mask;
	u64 start_lba;
	u64 end_lba;
···
extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
extern int do_blk_trace_setup(struct request_queue *q,
	char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);

+/**
+ * blk_add_trace_msg - Add a (simple) message to the blktrace stream
+ * @q:		queue the io is for
+ * @fmt:	format to print message in
+ * args...	Variable argument list for format
+ *
+ * Description:
+ *     Records a (simple) message onto the blktrace stream.
+ *
+ *     NOTE: BLK_TN_MAX_MSG characters are output at most.
+ *     NOTE: Can not use 'static inline' due to presence of var args...
+ *
+ **/
+#define blk_add_trace_msg(q, fmt, ...)					\
+	do {								\
+		struct blk_trace *bt = (q)->blk_trace;			\
+		if (unlikely(bt))					\
+			__trace_note_message(bt, fmt, ##__VA_ARGS__);	\
+	} while (0)
+#define BLK_TN_MAX_MSG		128

/**
 * blk_add_trace_rq - Add a trace for a request oriented action
···
#define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
#define blk_trace_startstop(q, start)		(-ENOTTY)
#define blk_trace_remove(q)			(-ENOTTY)
+#define blk_add_trace_msg(q, fmt, ...)		do { } while (0)
+
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#endif /* __KERNEL__ */
#endif
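With these header additions, any code that can see a struct request_queue can drop a printf-style note into the trace stream; anything beyond BLK_TN_MAX_MSG (128) characters is truncated by vscnprintf(). A hedged call-site sketch (the queue pointer q, the depth argument and the message text are illustrative, not part of the patch):

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

static void note_depth_change(struct request_queue *q, int depth)
{
	/* Expands to a no-op when blktrace is not configured or not running. */
	blk_add_trace_msg(q, "queue depth changed to %d", depth);
}

The elevator-switch message added in block/elevator.c above is the first in-tree user of the macro.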
kernel/relay.c (+1 -1)
before:
···
	ret = 0;
	spliced = 0;

-	while (len) {
		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
		if (ret < 0)
			break;
after:
···
	ret = 0;
	spliced = 0;

+	while (len && !spliced) {
		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
		if (ret < 0)
			break;