Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm delay: for short delays, use kthread instead of timers and wq

DM delay's current design of using timers and wq to realize the delays
is insufficient for delays below ~50ms.

This commit enhances the design to use a kthread to flush the expired
delays, trading some CPU time (in some cases) for better delay
accuracy and delays closer to what the user requested for smaller
delays. The new design is chosen as long as all the delays are below
50ms.

Since bios can't be completed in interrupt context, using a kthread
is probably the most reasonable way to approach this.

Testing with
echo "0 2097152 zero" | dmsetup create dm-zeros
for i in $(seq 0 20);
do
echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms;
done

Some performance numbers for comparison, on beaglebone black (single
core) CONFIG_HZ_1000=y:

fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based \
--filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 250 IOPS
Kthread: 500 IOPS

fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based \
--filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 45 IOPS
Kthread: 50 IOPS

fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \
--time_based --filename=/dev/mapper/dm-delay-1ms
Theoretical maximum: 1000 IOPS
Previous: 498 IOPS
Kthread: 1000 IOPS

fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \
--time_based --filename=/dev/mapper/dm-delay-10ms
Theoretical maximum: 100 IOPS
Previous: 90 IOPS
Kthread: 100 IOPS

(This one is just to prove the new design isn't impacting throughput,
not really about delays):
fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k \
--runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms \
--numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting
Previous: 13.3k IOPS
Kthread: 13.3k IOPS

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
[Harshit: kthread_create error handling fix in delay_ctr]
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

Authored by Christian Loehle; committed by Mike Snitzer.
70bbeb29 8388cba9

+88 -15
+88 -15
drivers/md/dm-delay.c
··· 13 13 #include <linux/blkdev.h> 14 14 #include <linux/bio.h> 15 15 #include <linux/slab.h> 16 + #include <linux/kthread.h> 16 17 17 18 #include <linux/device-mapper.h> 18 19 ··· 32 31 struct workqueue_struct *kdelayd_wq; 33 32 struct work_struct flush_expired_bios; 34 33 struct list_head delayed_bios; 34 + struct task_struct *worker; 35 35 atomic_t may_delay; 36 36 37 37 struct delay_class read; ··· 68 66 mutex_unlock(&dc->timer_lock); 69 67 } 70 68 69 + static inline bool delay_is_fast(struct delay_c *dc) 70 + { 71 + return !!dc->worker; 72 + } 73 + 74 + static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) 75 + { 76 + struct dm_delay_info *delayed, *next; 77 + 78 + mutex_lock(&delayed_bios_lock); 79 + list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { 80 + if (flush_all || time_after_eq(jiffies, delayed->expires)) { 81 + struct bio *bio = dm_bio_from_per_bio_data(delayed, 82 + sizeof(struct dm_delay_info)); 83 + list_del(&delayed->list); 84 + dm_submit_bio_remap(bio, NULL); 85 + delayed->class->ops--; 86 + } 87 + } 88 + mutex_unlock(&delayed_bios_lock); 89 + } 90 + 91 + static int flush_worker_fn(void *data) 92 + { 93 + struct delay_c *dc = data; 94 + 95 + while (1) { 96 + flush_delayed_bios_fast(dc, false); 97 + if (unlikely(list_empty(&dc->delayed_bios))) { 98 + set_current_state(TASK_INTERRUPTIBLE); 99 + schedule(); 100 + } else 101 + cond_resched(); 102 + } 103 + 104 + return 0; 105 + } 106 + 71 107 static void flush_bios(struct bio *bio) 72 108 { 73 109 struct bio *n; ··· 118 78 } 119 79 } 120 80 121 - static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) 81 + static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) 122 82 { 123 83 struct dm_delay_info *delayed, *next; 124 84 unsigned long next_expires = 0; ··· 155 115 struct delay_c *dc; 156 116 157 117 dc = container_of(work, struct delay_c, flush_expired_bios); 158 - flush_bios(flush_delayed_bios(dc, 0)); 118 + if 
(delay_is_fast(dc)) 119 + flush_delayed_bios_fast(dc, false); 120 + else 121 + flush_bios(flush_delayed_bios(dc, false)); 159 122 } 160 123 161 124 static void delay_dtr(struct dm_target *ti) ··· 174 131 dm_put_device(ti, dc->write.dev); 175 132 if (dc->flush.dev) 176 133 dm_put_device(ti, dc->flush.dev); 134 + if (dc->worker) 135 + kthread_stop(dc->worker); 177 136 178 - mutex_destroy(&dc->timer_lock); 137 + if (!delay_is_fast(dc)) 138 + mutex_destroy(&dc->timer_lock); 179 139 180 140 kfree(dc); 181 141 } ··· 221 175 { 222 176 struct delay_c *dc; 223 177 int ret; 178 + unsigned int max_delay; 224 179 225 180 if (argc != 3 && argc != 6 && argc != 9) { 226 181 ti->error = "Requires exactly 3, 6 or 9 arguments"; ··· 235 188 } 236 189 237 190 ti->private = dc; 238 - timer_setup(&dc->delay_timer, handle_delayed_timer, 0); 239 - INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); 240 191 INIT_LIST_HEAD(&dc->delayed_bios); 241 - mutex_init(&dc->timer_lock); 242 192 atomic_set(&dc->may_delay, 1); 243 193 dc->argc = argc; 244 194 245 195 ret = delay_class_ctr(ti, &dc->read, argv); 246 196 if (ret) 247 197 goto bad; 198 + max_delay = dc->read.delay; 248 199 249 200 if (argc == 3) { 250 201 ret = delay_class_ctr(ti, &dc->write, argv); ··· 251 206 ret = delay_class_ctr(ti, &dc->flush, argv); 252 207 if (ret) 253 208 goto bad; 209 + max_delay = max(max_delay, dc->write.delay); 210 + max_delay = max(max_delay, dc->flush.delay); 254 211 goto out; 255 212 } 256 213 ··· 263 216 ret = delay_class_ctr(ti, &dc->flush, argv + 3); 264 217 if (ret) 265 218 goto bad; 219 + max_delay = max(max_delay, dc->flush.delay); 266 220 goto out; 267 221 } 268 222 269 223 ret = delay_class_ctr(ti, &dc->flush, argv + 6); 270 224 if (ret) 271 225 goto bad; 226 + max_delay = max(max_delay, dc->flush.delay); 272 227 273 228 out: 274 - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); 275 - if (!dc->kdelayd_wq) { 276 - ret = -EINVAL; 277 - DMERR("Couldn't start kdelayd"); 278 - goto 
bad; 229 + if (max_delay < 50) { 230 + /* 231 + * In case of small requested delays, use kthread instead of 232 + * timers and workqueue to achieve better latency. 233 + */ 234 + dc->worker = kthread_create(&flush_worker_fn, dc, 235 + "dm-delay-flush-worker"); 236 + if (IS_ERR(dc->worker)) { 237 + ret = PTR_ERR(dc->worker); 238 + goto bad; 239 + } 240 + } else { 241 + timer_setup(&dc->delay_timer, handle_delayed_timer, 0); 242 + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); 243 + mutex_init(&dc->timer_lock); 244 + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); 245 + if (!dc->kdelayd_wq) { 246 + ret = -EINVAL; 247 + DMERR("Couldn't start kdelayd"); 248 + goto bad; 249 + } 279 250 } 280 251 281 252 ti->num_flush_bios = 1; ··· 325 260 list_add_tail(&delayed->list, &dc->delayed_bios); 326 261 mutex_unlock(&delayed_bios_lock); 327 262 328 - queue_timeout(dc, expires); 263 + if (delay_is_fast(dc)) 264 + wake_up_process(dc->worker); 265 + else 266 + queue_timeout(dc, expires); 329 267 330 268 return DM_MAPIO_SUBMITTED; 331 269 } ··· 338 270 struct delay_c *dc = ti->private; 339 271 340 272 atomic_set(&dc->may_delay, 0); 341 - del_timer_sync(&dc->delay_timer); 342 - flush_bios(flush_delayed_bios(dc, 1)); 273 + 274 + if (delay_is_fast(dc)) 275 + flush_delayed_bios_fast(dc, true); 276 + else { 277 + del_timer_sync(&dc->delay_timer); 278 + flush_bios(flush_delayed_bios(dc, true)); 279 + } 343 280 } 344 281 345 282 static void delay_resume(struct dm_target *ti) ··· 429 356 430 357 static struct target_type delay_target = { 431 358 .name = "delay", 432 - .version = {1, 3, 0}, 359 + .version = {1, 4, 0}, 433 360 .features = DM_TARGET_PASSES_INTEGRITY, 434 361 .module = THIS_MODULE, 435 362 .ctr = delay_ctr,