Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2025, Christoph Hellwig.
 * Copyright (c) 2025, Western Digital Corporation or its affiliates.
 *
 * Zoned Loop Device driver - exports a zoned block device using one file per
 * zone as backing storage.
 */
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11#include <linux/module.h>
12#include <linux/blk-mq.h>
13#include <linux/blkzoned.h>
14#include <linux/pagemap.h>
15#include <linux/miscdevice.h>
16#include <linux/falloc.h>
17#include <linux/mutex.h>
18#include <linux/parser.h>
19#include <linux/seq_file.h>
20
/*
 * Option flags for the "add" and "remove" control operations.
 *
 * Each recognized option is a distinct bit so that the set of options seen
 * while parsing can be accumulated in zloop_options.mask. ZLOOP_OPT_ERR (0)
 * doubles as the "no match" token of the option parser table.
 */
enum {
	ZLOOP_OPT_ERR		= 0,
	ZLOOP_OPT_ID		= (1 << 0),
	ZLOOP_OPT_CAPACITY	= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE	= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY	= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES	= (1 << 4),
	ZLOOP_OPT_BASE_DIR	= (1 << 5),
	ZLOOP_OPT_NR_QUEUES	= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH	= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO	= (1 << 8),
};
36
37static const match_table_t zloop_opt_tokens = {
38 { ZLOOP_OPT_ID, "id=%d" },
39 { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
40 { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
41 { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
42 { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
43 { ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
44 { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
45 { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
46 { ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
47 { ZLOOP_OPT_ERR, NULL }
48};
49
/*
 * Default values for the "add" operation. ZLOOP_DEF_ID is negative so that
 * zloop_ctl_add() allocates any free ID. ZLOOP_DEF_ZONE_SIZE is expressed in
 * 512 B sectors (256 MiB).
 */
#define ZLOOP_DEF_ID			-1
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false

/* Arbitrary upper limit on the zone size, in MiB (16 GiB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384
62
63struct zloop_options {
64 unsigned int mask;
65 int id;
66 sector_t capacity;
67 sector_t zone_size;
68 sector_t zone_capacity;
69 unsigned int nr_conv_zones;
70 char *base_dir;
71 unsigned int nr_queues;
72 unsigned int queue_depth;
73 bool buffered_io;
74};
75
/*
 * Device life cycle states, stored in zloop_device.state. A device starts in
 * Zlo_creating, becomes Zlo_live at the end of zloop_ctl_add() and is marked
 * Zlo_deleting by zloop_ctl_remove().
 */
enum {
	Zlo_creating = 0,
	Zlo_live,
	Zlo_deleting,
};
84
/* Bit numbers used with the bitops on zloop_zone.flags. */
enum zloop_zone_flags {
	/* The zone is a conventional zone (no write pointer). */
	ZLOOP_ZONE_CONV = 0,
	/* The zone state must be re-read from its backing file size. */
	ZLOOP_ZONE_SEQ_ERROR,
};
89
90struct zloop_zone {
91 struct file *file;
92
93 unsigned long flags;
94 struct mutex lock;
95 enum blk_zone_cond cond;
96 sector_t start;
97 sector_t wp;
98
99 gfp_t old_gfp_mask;
100};
101
102struct zloop_device {
103 unsigned int id;
104 unsigned int state;
105
106 struct blk_mq_tag_set tag_set;
107 struct gendisk *disk;
108
109 struct workqueue_struct *workqueue;
110 bool buffered_io;
111
112 const char *base_dir;
113 struct file *data_dir;
114
115 unsigned int zone_shift;
116 sector_t zone_size;
117 sector_t zone_capacity;
118 unsigned int nr_zones;
119 unsigned int nr_conv_zones;
120 unsigned int block_size;
121
122 struct zloop_zone zones[] __counted_by(nr_zones);
123};
124
125struct zloop_cmd {
126 struct work_struct work;
127 atomic_t ref;
128 sector_t sector;
129 sector_t nr_sectors;
130 long ret;
131 struct kiocb iocb;
132 struct bio_vec *bvec;
133};
134
/* Maps device IDs to devices; accessed with zloop_ctl_mutex held. */
static DEFINE_IDR(zloop_index_idr);
/* Serializes ID allocation/removal and device state changes. */
static DEFINE_MUTEX(zloop_ctl_mutex);
137
138static unsigned int rq_zone_no(struct request *rq)
139{
140 struct zloop_device *zlo = rq->q->queuedata;
141
142 return blk_rq_pos(rq) >> zlo->zone_shift;
143}
144
145static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
146{
147 struct zloop_zone *zone = &zlo->zones[zone_no];
148 struct kstat stat;
149 sector_t file_sectors;
150 int ret;
151
152 lockdep_assert_held(&zone->lock);
153
154 ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
155 if (ret < 0) {
156 pr_err("Failed to get zone %u file stat (err=%d)\n",
157 zone_no, ret);
158 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
159 return ret;
160 }
161
162 file_sectors = stat.size >> SECTOR_SHIFT;
163 if (file_sectors > zlo->zone_capacity) {
164 pr_err("Zone %u file too large (%llu sectors > %llu)\n",
165 zone_no, file_sectors, zlo->zone_capacity);
166 return -EINVAL;
167 }
168
169 if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
170 pr_err("Zone %u file size not aligned to block size %u\n",
171 zone_no, zlo->block_size);
172 return -EINVAL;
173 }
174
175 if (!file_sectors) {
176 zone->cond = BLK_ZONE_COND_EMPTY;
177 zone->wp = zone->start;
178 } else if (file_sectors == zlo->zone_capacity) {
179 zone->cond = BLK_ZONE_COND_FULL;
180 zone->wp = zone->start + zlo->zone_size;
181 } else {
182 zone->cond = BLK_ZONE_COND_CLOSED;
183 zone->wp = zone->start + file_sectors;
184 }
185
186 return 0;
187}
188
189static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
190{
191 struct zloop_zone *zone = &zlo->zones[zone_no];
192 int ret = 0;
193
194 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
195 return -EIO;
196
197 mutex_lock(&zone->lock);
198
199 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
200 ret = zloop_update_seq_zone(zlo, zone_no);
201 if (ret)
202 goto unlock;
203 }
204
205 switch (zone->cond) {
206 case BLK_ZONE_COND_EXP_OPEN:
207 break;
208 case BLK_ZONE_COND_EMPTY:
209 case BLK_ZONE_COND_CLOSED:
210 case BLK_ZONE_COND_IMP_OPEN:
211 zone->cond = BLK_ZONE_COND_EXP_OPEN;
212 break;
213 case BLK_ZONE_COND_FULL:
214 default:
215 ret = -EIO;
216 break;
217 }
218
219unlock:
220 mutex_unlock(&zone->lock);
221
222 return ret;
223}
224
225static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
226{
227 struct zloop_zone *zone = &zlo->zones[zone_no];
228 int ret = 0;
229
230 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
231 return -EIO;
232
233 mutex_lock(&zone->lock);
234
235 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
236 ret = zloop_update_seq_zone(zlo, zone_no);
237 if (ret)
238 goto unlock;
239 }
240
241 switch (zone->cond) {
242 case BLK_ZONE_COND_CLOSED:
243 break;
244 case BLK_ZONE_COND_IMP_OPEN:
245 case BLK_ZONE_COND_EXP_OPEN:
246 if (zone->wp == zone->start)
247 zone->cond = BLK_ZONE_COND_EMPTY;
248 else
249 zone->cond = BLK_ZONE_COND_CLOSED;
250 break;
251 case BLK_ZONE_COND_EMPTY:
252 case BLK_ZONE_COND_FULL:
253 default:
254 ret = -EIO;
255 break;
256 }
257
258unlock:
259 mutex_unlock(&zone->lock);
260
261 return ret;
262}
263
264static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
265{
266 struct zloop_zone *zone = &zlo->zones[zone_no];
267 int ret = 0;
268
269 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
270 return -EIO;
271
272 mutex_lock(&zone->lock);
273
274 if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
275 zone->cond == BLK_ZONE_COND_EMPTY)
276 goto unlock;
277
278 if (vfs_truncate(&zone->file->f_path, 0)) {
279 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
280 ret = -EIO;
281 goto unlock;
282 }
283
284 zone->cond = BLK_ZONE_COND_EMPTY;
285 zone->wp = zone->start;
286 clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
287
288unlock:
289 mutex_unlock(&zone->lock);
290
291 return ret;
292}
293
294static int zloop_reset_all_zones(struct zloop_device *zlo)
295{
296 unsigned int i;
297 int ret;
298
299 for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
300 ret = zloop_reset_zone(zlo, i);
301 if (ret)
302 return ret;
303 }
304
305 return 0;
306}
307
308static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
309{
310 struct zloop_zone *zone = &zlo->zones[zone_no];
311 int ret = 0;
312
313 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
314 return -EIO;
315
316 mutex_lock(&zone->lock);
317
318 if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
319 zone->cond == BLK_ZONE_COND_FULL)
320 goto unlock;
321
322 if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
323 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
324 ret = -EIO;
325 goto unlock;
326 }
327
328 zone->cond = BLK_ZONE_COND_FULL;
329 zone->wp = zone->start + zlo->zone_size;
330 clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
331
332 unlock:
333 mutex_unlock(&zone->lock);
334
335 return ret;
336}
337
338static void zloop_put_cmd(struct zloop_cmd *cmd)
339{
340 struct request *rq = blk_mq_rq_from_pdu(cmd);
341
342 if (!atomic_dec_and_test(&cmd->ref))
343 return;
344 kfree(cmd->bvec);
345 cmd->bvec = NULL;
346 if (likely(!blk_should_fake_timeout(rq->q)))
347 blk_mq_complete_request(rq);
348}
349
350static void zloop_rw_complete(struct kiocb *iocb, long ret)
351{
352 struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);
353
354 cmd->ret = ret;
355 zloop_put_cmd(cmd);
356}
357
358static void zloop_rw(struct zloop_cmd *cmd)
359{
360 struct request *rq = blk_mq_rq_from_pdu(cmd);
361 struct zloop_device *zlo = rq->q->queuedata;
362 unsigned int zone_no = rq_zone_no(rq);
363 sector_t sector = blk_rq_pos(rq);
364 sector_t nr_sectors = blk_rq_sectors(rq);
365 bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
366 bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
367 int rw = is_write ? ITER_SOURCE : ITER_DEST;
368 struct req_iterator rq_iter;
369 struct zloop_zone *zone;
370 struct iov_iter iter;
371 struct bio_vec tmp;
372 sector_t zone_end;
373 int nr_bvec = 0;
374 int ret;
375
376 atomic_set(&cmd->ref, 2);
377 cmd->sector = sector;
378 cmd->nr_sectors = nr_sectors;
379 cmd->ret = 0;
380
381 /* We should never get an I/O beyond the device capacity. */
382 if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
383 ret = -EIO;
384 goto out;
385 }
386 zone = &zlo->zones[zone_no];
387 zone_end = zone->start + zlo->zone_capacity;
388
389 /*
390 * The block layer should never send requests that are not fully
391 * contained within the zone.
392 */
393 if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
394 ret = -EIO;
395 goto out;
396 }
397
398 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
399 mutex_lock(&zone->lock);
400 ret = zloop_update_seq_zone(zlo, zone_no);
401 mutex_unlock(&zone->lock);
402 if (ret)
403 goto out;
404 }
405
406 if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
407 mutex_lock(&zone->lock);
408
409 if (is_append) {
410 sector = zone->wp;
411 cmd->sector = sector;
412 }
413
414 /*
415 * Write operations must be aligned to the write pointer and
416 * fully contained within the zone capacity.
417 */
418 if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
419 pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
420 zone_no, sector, zone->wp);
421 ret = -EIO;
422 goto unlock;
423 }
424
425 /* Implicitly open the target zone. */
426 if (zone->cond == BLK_ZONE_COND_CLOSED ||
427 zone->cond == BLK_ZONE_COND_EMPTY)
428 zone->cond = BLK_ZONE_COND_IMP_OPEN;
429
430 /*
431 * Advance the write pointer of sequential zones. If the write
432 * fails, the wp position will be corrected when the next I/O
433 * copmpletes.
434 */
435 zone->wp += nr_sectors;
436 if (zone->wp == zone_end)
437 zone->cond = BLK_ZONE_COND_FULL;
438 }
439
440 rq_for_each_bvec(tmp, rq, rq_iter)
441 nr_bvec++;
442
443 if (rq->bio != rq->biotail) {
444 struct bio_vec *bvec;
445
446 cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
447 if (!cmd->bvec) {
448 ret = -EIO;
449 goto unlock;
450 }
451
452 /*
453 * The bios of the request may be started from the middle of
454 * the 'bvec' because of bio splitting, so we can't directly
455 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
456 * API will take care of all details for us.
457 */
458 bvec = cmd->bvec;
459 rq_for_each_bvec(tmp, rq, rq_iter) {
460 *bvec = tmp;
461 bvec++;
462 }
463 iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
464 } else {
465 /*
466 * Same here, this bio may be started from the middle of the
467 * 'bvec' because of bio splitting, so offset from the bvec
468 * must be passed to iov iterator
469 */
470 iov_iter_bvec(&iter, rw,
471 __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
472 nr_bvec, blk_rq_bytes(rq));
473 iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
474 }
475
476 cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
477 cmd->iocb.ki_filp = zone->file;
478 cmd->iocb.ki_complete = zloop_rw_complete;
479 if (!zlo->buffered_io)
480 cmd->iocb.ki_flags = IOCB_DIRECT;
481 cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
482
483 if (rw == ITER_SOURCE)
484 ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
485 else
486 ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
487unlock:
488 if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
489 mutex_unlock(&zone->lock);
490out:
491 if (ret != -EIOCBQUEUED)
492 zloop_rw_complete(&cmd->iocb, ret);
493 zloop_put_cmd(cmd);
494}
495
496static void zloop_handle_cmd(struct zloop_cmd *cmd)
497{
498 struct request *rq = blk_mq_rq_from_pdu(cmd);
499 struct zloop_device *zlo = rq->q->queuedata;
500
501 switch (req_op(rq)) {
502 case REQ_OP_READ:
503 case REQ_OP_WRITE:
504 case REQ_OP_ZONE_APPEND:
505 /*
506 * zloop_rw() always executes asynchronously or completes
507 * directly.
508 */
509 zloop_rw(cmd);
510 return;
511 case REQ_OP_FLUSH:
512 /*
513 * Sync the entire FS containing the zone files instead of
514 * walking all files
515 */
516 cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
517 break;
518 case REQ_OP_ZONE_RESET:
519 cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
520 break;
521 case REQ_OP_ZONE_RESET_ALL:
522 cmd->ret = zloop_reset_all_zones(zlo);
523 break;
524 case REQ_OP_ZONE_FINISH:
525 cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
526 break;
527 case REQ_OP_ZONE_OPEN:
528 cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
529 break;
530 case REQ_OP_ZONE_CLOSE:
531 cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
532 break;
533 default:
534 WARN_ON_ONCE(1);
535 pr_err("Unsupported operation %d\n", req_op(rq));
536 cmd->ret = -EOPNOTSUPP;
537 break;
538 }
539
540 blk_mq_complete_request(rq);
541}
542
543static void zloop_cmd_workfn(struct work_struct *work)
544{
545 struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
546 int orig_flags = current->flags;
547
548 current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
549 zloop_handle_cmd(cmd);
550 current->flags = orig_flags;
551}
552
553static void zloop_complete_rq(struct request *rq)
554{
555 struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
556 struct zloop_device *zlo = rq->q->queuedata;
557 unsigned int zone_no = cmd->sector >> zlo->zone_shift;
558 struct zloop_zone *zone = &zlo->zones[zone_no];
559 blk_status_t sts = BLK_STS_OK;
560
561 switch (req_op(rq)) {
562 case REQ_OP_READ:
563 if (cmd->ret < 0)
564 pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
565 zone_no, cmd->sector, cmd->nr_sectors);
566
567 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
568 /* short read */
569 struct bio *bio;
570
571 __rq_for_each_bio(bio, rq)
572 zero_fill_bio(bio);
573 }
574 break;
575 case REQ_OP_WRITE:
576 case REQ_OP_ZONE_APPEND:
577 if (cmd->ret < 0)
578 pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
579 zone_no,
580 req_op(rq) == REQ_OP_WRITE ? "" : "append ",
581 cmd->sector, cmd->nr_sectors);
582
583 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
584 pr_err("Zone %u: partial write %ld/%u B\n",
585 zone_no, cmd->ret, blk_rq_bytes(rq));
586 cmd->ret = -EIO;
587 }
588
589 if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
590 /*
591 * A write to a sequential zone file failed: mark the
592 * zone as having an error. This will be corrected and
593 * cleared when the next IO is submitted.
594 */
595 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
596 break;
597 }
598 if (req_op(rq) == REQ_OP_ZONE_APPEND)
599 rq->__sector = cmd->sector;
600
601 break;
602 default:
603 break;
604 }
605
606 if (cmd->ret < 0)
607 sts = errno_to_blk_status(cmd->ret);
608 blk_mq_end_request(rq, sts);
609}
610
611static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
612 const struct blk_mq_queue_data *bd)
613{
614 struct request *rq = bd->rq;
615 struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
616 struct zloop_device *zlo = rq->q->queuedata;
617
618 if (zlo->state == Zlo_deleting)
619 return BLK_STS_IOERR;
620
621 blk_mq_start_request(rq);
622
623 INIT_WORK(&cmd->work, zloop_cmd_workfn);
624 queue_work(zlo->workqueue, &cmd->work);
625
626 return BLK_STS_OK;
627}
628
629static const struct blk_mq_ops zloop_mq_ops = {
630 .queue_rq = zloop_queue_rq,
631 .complete = zloop_complete_rq,
632};
633
634static int zloop_open(struct gendisk *disk, blk_mode_t mode)
635{
636 struct zloop_device *zlo = disk->private_data;
637 int ret;
638
639 ret = mutex_lock_killable(&zloop_ctl_mutex);
640 if (ret)
641 return ret;
642
643 if (zlo->state != Zlo_live)
644 ret = -ENXIO;
645 mutex_unlock(&zloop_ctl_mutex);
646 return ret;
647}
648
649static int zloop_report_zones(struct gendisk *disk, sector_t sector,
650 unsigned int nr_zones, report_zones_cb cb, void *data)
651{
652 struct zloop_device *zlo = disk->private_data;
653 struct blk_zone blkz = {};
654 unsigned int first, i;
655 int ret;
656
657 first = disk_zone_no(disk, sector);
658 if (first >= zlo->nr_zones)
659 return 0;
660 nr_zones = min(nr_zones, zlo->nr_zones - first);
661
662 for (i = 0; i < nr_zones; i++) {
663 unsigned int zone_no = first + i;
664 struct zloop_zone *zone = &zlo->zones[zone_no];
665
666 mutex_lock(&zone->lock);
667
668 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
669 ret = zloop_update_seq_zone(zlo, zone_no);
670 if (ret) {
671 mutex_unlock(&zone->lock);
672 return ret;
673 }
674 }
675
676 blkz.start = zone->start;
677 blkz.len = zlo->zone_size;
678 blkz.wp = zone->wp;
679 blkz.cond = zone->cond;
680 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
681 blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
682 blkz.capacity = zlo->zone_size;
683 } else {
684 blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
685 blkz.capacity = zlo->zone_capacity;
686 }
687
688 mutex_unlock(&zone->lock);
689
690 ret = cb(&blkz, i, data);
691 if (ret)
692 return ret;
693 }
694
695 return nr_zones;
696}
697
698static void zloop_free_disk(struct gendisk *disk)
699{
700 struct zloop_device *zlo = disk->private_data;
701 unsigned int i;
702
703 for (i = 0; i < zlo->nr_zones; i++) {
704 struct zloop_zone *zone = &zlo->zones[i];
705
706 mapping_set_gfp_mask(zone->file->f_mapping,
707 zone->old_gfp_mask);
708 fput(zone->file);
709 }
710
711 fput(zlo->data_dir);
712 destroy_workqueue(zlo->workqueue);
713 kfree(zlo->base_dir);
714 kvfree(zlo);
715}
716
717static const struct block_device_operations zloop_fops = {
718 .owner = THIS_MODULE,
719 .open = zloop_open,
720 .report_zones = zloop_report_zones,
721 .free_disk = zloop_free_disk,
722};
723
724__printf(3, 4)
725static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
726 const char *fmt, ...)
727{
728 struct file *file;
729 va_list ap;
730 char *p;
731
732 va_start(ap, fmt);
733 p = kvasprintf(GFP_KERNEL, fmt, ap);
734 va_end(ap);
735
736 if (!p)
737 return ERR_PTR(-ENOMEM);
738 file = filp_open(p, oflags, mode);
739 kfree(p);
740 return file;
741}
742
743static int zloop_get_block_size(struct zloop_device *zlo,
744 struct zloop_zone *zone)
745{
746 struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
747 struct kstat st;
748
749 /*
750 * If the FS block size is lower than or equal to 4K, use that as the
751 * device block size. Otherwise, fallback to the FS direct IO alignment
752 * constraint if that is provided, and to the FS underlying device
753 * physical block size if the direct IO alignment is unknown.
754 */
755 if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
756 zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
757 else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
758 (st.result_mask & STATX_DIOALIGN))
759 zlo->block_size = st.dio_offset_align;
760 else if (sb_bdev)
761 zlo->block_size = bdev_physical_block_size(sb_bdev);
762 else
763 zlo->block_size = SECTOR_SIZE;
764
765 if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
766 pr_err("Zone capacity is not aligned to block size %u\n",
767 zlo->block_size);
768 return -EINVAL;
769 }
770
771 return 0;
772}
773
774static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
775 unsigned int zone_no, bool restore)
776{
777 struct zloop_zone *zone = &zlo->zones[zone_no];
778 int oflags = O_RDWR;
779 struct kstat stat;
780 sector_t file_sectors;
781 int ret;
782
783 mutex_init(&zone->lock);
784 zone->start = (sector_t)zone_no << zlo->zone_shift;
785
786 if (!restore)
787 oflags |= O_CREAT;
788
789 if (!opts->buffered_io)
790 oflags |= O_DIRECT;
791
792 if (zone_no < zlo->nr_conv_zones) {
793 /* Conventional zone file. */
794 set_bit(ZLOOP_ZONE_CONV, &zone->flags);
795 zone->cond = BLK_ZONE_COND_NOT_WP;
796 zone->wp = U64_MAX;
797
798 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
799 zlo->base_dir, zlo->id, zone_no);
800 if (IS_ERR(zone->file)) {
801 pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
802 zone_no, zlo->base_dir, zlo->id, zone_no,
803 PTR_ERR(zone->file));
804 return PTR_ERR(zone->file);
805 }
806
807 if (!zlo->block_size) {
808 ret = zloop_get_block_size(zlo, zone);
809 if (ret)
810 return ret;
811 }
812
813 ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
814 if (ret < 0) {
815 pr_err("Failed to get zone %u file stat\n", zone_no);
816 return ret;
817 }
818 file_sectors = stat.size >> SECTOR_SHIFT;
819
820 if (restore && file_sectors != zlo->zone_size) {
821 pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
822 zone_no, file_sectors, zlo->zone_capacity);
823 return ret;
824 }
825
826 ret = vfs_truncate(&zone->file->f_path,
827 zlo->zone_size << SECTOR_SHIFT);
828 if (ret < 0) {
829 pr_err("Failed to truncate zone %u file (err=%d)\n",
830 zone_no, ret);
831 return ret;
832 }
833
834 return 0;
835 }
836
837 /* Sequential zone file. */
838 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
839 zlo->base_dir, zlo->id, zone_no);
840 if (IS_ERR(zone->file)) {
841 pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
842 zone_no, zlo->base_dir, zlo->id, zone_no,
843 PTR_ERR(zone->file));
844 return PTR_ERR(zone->file);
845 }
846
847 if (!zlo->block_size) {
848 ret = zloop_get_block_size(zlo, zone);
849 if (ret)
850 return ret;
851 }
852
853 zloop_get_block_size(zlo, zone);
854
855 mutex_lock(&zone->lock);
856 ret = zloop_update_seq_zone(zlo, zone_no);
857 mutex_unlock(&zone->lock);
858
859 return ret;
860}
861
862static bool zloop_dev_exists(struct zloop_device *zlo)
863{
864 struct file *cnv, *seq;
865 bool exists;
866
867 cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
868 zlo->base_dir, zlo->id, 0);
869 seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
870 zlo->base_dir, zlo->id, 0);
871 exists = !IS_ERR(cnv) || !IS_ERR(seq);
872
873 if (!IS_ERR(cnv))
874 fput(cnv);
875 if (!IS_ERR(seq))
876 fput(seq);
877
878 return exists;
879}
880
881static int zloop_ctl_add(struct zloop_options *opts)
882{
883 struct queue_limits lim = {
884 .max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
885 .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
886 .chunk_sectors = opts->zone_size,
887 .features = BLK_FEAT_ZONED,
888 };
889 unsigned int nr_zones, i, j;
890 struct zloop_device *zlo;
891 int ret = -EINVAL;
892 bool restore;
893
894 __module_get(THIS_MODULE);
895
896 nr_zones = opts->capacity >> ilog2(opts->zone_size);
897 if (opts->nr_conv_zones >= nr_zones) {
898 pr_err("Invalid number of conventional zones %u\n",
899 opts->nr_conv_zones);
900 goto out;
901 }
902
903 zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
904 if (!zlo) {
905 ret = -ENOMEM;
906 goto out;
907 }
908 zlo->state = Zlo_creating;
909
910 ret = mutex_lock_killable(&zloop_ctl_mutex);
911 if (ret)
912 goto out_free_dev;
913
914 /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
915 if (opts->id >= 0) {
916 ret = idr_alloc(&zloop_index_idr, zlo,
917 opts->id, opts->id + 1, GFP_KERNEL);
918 if (ret == -ENOSPC)
919 ret = -EEXIST;
920 } else {
921 ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
922 }
923 mutex_unlock(&zloop_ctl_mutex);
924 if (ret < 0)
925 goto out_free_dev;
926
927 zlo->id = ret;
928 zlo->zone_shift = ilog2(opts->zone_size);
929 zlo->zone_size = opts->zone_size;
930 if (opts->zone_capacity)
931 zlo->zone_capacity = opts->zone_capacity;
932 else
933 zlo->zone_capacity = zlo->zone_size;
934 zlo->nr_zones = nr_zones;
935 zlo->nr_conv_zones = opts->nr_conv_zones;
936 zlo->buffered_io = opts->buffered_io;
937
938 zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
939 opts->nr_queues * opts->queue_depth, zlo->id);
940 if (!zlo->workqueue) {
941 ret = -ENOMEM;
942 goto out_free_idr;
943 }
944
945 if (opts->base_dir)
946 zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
947 else
948 zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
949 if (!zlo->base_dir) {
950 ret = -ENOMEM;
951 goto out_destroy_workqueue;
952 }
953
954 zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
955 zlo->base_dir, zlo->id);
956 if (IS_ERR(zlo->data_dir)) {
957 ret = PTR_ERR(zlo->data_dir);
958 pr_warn("Failed to open directory %s/%u (err=%d)\n",
959 zlo->base_dir, zlo->id, ret);
960 goto out_free_base_dir;
961 }
962
963 /*
964 * If we already have zone files, we are restoring a device created by a
965 * previous add operation. In this case, zloop_init_zone() will check
966 * that the zone files are consistent with the zone configuration given.
967 */
968 restore = zloop_dev_exists(zlo);
969 for (i = 0; i < nr_zones; i++) {
970 ret = zloop_init_zone(zlo, opts, i, restore);
971 if (ret)
972 goto out_close_files;
973 }
974
975 lim.physical_block_size = zlo->block_size;
976 lim.logical_block_size = zlo->block_size;
977
978 zlo->tag_set.ops = &zloop_mq_ops;
979 zlo->tag_set.nr_hw_queues = opts->nr_queues;
980 zlo->tag_set.queue_depth = opts->queue_depth;
981 zlo->tag_set.numa_node = NUMA_NO_NODE;
982 zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
983 zlo->tag_set.driver_data = zlo;
984
985 ret = blk_mq_alloc_tag_set(&zlo->tag_set);
986 if (ret) {
987 pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
988 goto out_close_files;
989 }
990
991 zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
992 if (IS_ERR(zlo->disk)) {
993 pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
994 ret = PTR_ERR(zlo->disk);
995 goto out_cleanup_tags;
996 }
997 zlo->disk->flags = GENHD_FL_NO_PART;
998 zlo->disk->fops = &zloop_fops;
999 zlo->disk->private_data = zlo;
1000 sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
1001 set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
1002
1003 ret = blk_revalidate_disk_zones(zlo->disk);
1004 if (ret)
1005 goto out_cleanup_disk;
1006
1007 ret = add_disk(zlo->disk);
1008 if (ret) {
1009 pr_err("add_disk failed (err=%d)\n", ret);
1010 goto out_cleanup_disk;
1011 }
1012
1013 mutex_lock(&zloop_ctl_mutex);
1014 zlo->state = Zlo_live;
1015 mutex_unlock(&zloop_ctl_mutex);
1016
1017 pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
1018 zlo->id, zlo->nr_zones,
1019 ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
1020 zlo->block_size);
1021
1022 return 0;
1023
1024out_cleanup_disk:
1025 put_disk(zlo->disk);
1026out_cleanup_tags:
1027 blk_mq_free_tag_set(&zlo->tag_set);
1028out_close_files:
1029 for (j = 0; j < i; j++) {
1030 struct zloop_zone *zone = &zlo->zones[j];
1031
1032 if (!IS_ERR_OR_NULL(zone->file))
1033 fput(zone->file);
1034 }
1035 fput(zlo->data_dir);
1036out_free_base_dir:
1037 kfree(zlo->base_dir);
1038out_destroy_workqueue:
1039 destroy_workqueue(zlo->workqueue);
1040out_free_idr:
1041 mutex_lock(&zloop_ctl_mutex);
1042 idr_remove(&zloop_index_idr, zlo->id);
1043 mutex_unlock(&zloop_ctl_mutex);
1044out_free_dev:
1045 kvfree(zlo);
1046out:
1047 module_put(THIS_MODULE);
1048 if (ret == -ENOENT)
1049 ret = -EINVAL;
1050 return ret;
1051}
1052
1053static int zloop_ctl_remove(struct zloop_options *opts)
1054{
1055 struct zloop_device *zlo;
1056 int ret;
1057
1058 if (!(opts->mask & ZLOOP_OPT_ID)) {
1059 pr_err("No ID specified\n");
1060 return -EINVAL;
1061 }
1062
1063 ret = mutex_lock_killable(&zloop_ctl_mutex);
1064 if (ret)
1065 return ret;
1066
1067 zlo = idr_find(&zloop_index_idr, opts->id);
1068 if (!zlo || zlo->state == Zlo_creating) {
1069 ret = -ENODEV;
1070 } else if (zlo->state == Zlo_deleting) {
1071 ret = -EINVAL;
1072 } else {
1073 idr_remove(&zloop_index_idr, zlo->id);
1074 zlo->state = Zlo_deleting;
1075 }
1076
1077 mutex_unlock(&zloop_ctl_mutex);
1078 if (ret)
1079 return ret;
1080
1081 del_gendisk(zlo->disk);
1082 put_disk(zlo->disk);
1083 blk_mq_free_tag_set(&zlo->tag_set);
1084
1085 pr_info("Removed device %d\n", opts->id);
1086
1087 module_put(THIS_MODULE);
1088
1089 return 0;
1090}
1091
1092static int zloop_parse_options(struct zloop_options *opts, const char *buf)
1093{
1094 substring_t args[MAX_OPT_ARGS];
1095 char *options, *o, *p;
1096 unsigned int token;
1097 int ret = 0;
1098
1099 /* Set defaults. */
1100 opts->mask = 0;
1101 opts->id = ZLOOP_DEF_ID;
1102 opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
1103 opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
1104 opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
1105 opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
1106 opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
1107 opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
1108
1109 if (!buf)
1110 return 0;
1111
1112 /* Skip leading spaces before the options. */
1113 while (isspace(*buf))
1114 buf++;
1115
1116 options = o = kstrdup(buf, GFP_KERNEL);
1117 if (!options)
1118 return -ENOMEM;
1119
1120 /* Parse the options, doing only some light invalid value checks. */
1121 while ((p = strsep(&o, ",\n")) != NULL) {
1122 if (!*p)
1123 continue;
1124
1125 token = match_token(p, zloop_opt_tokens, args);
1126 opts->mask |= token;
1127 switch (token) {
1128 case ZLOOP_OPT_ID:
1129 if (match_int(args, &opts->id)) {
1130 ret = -EINVAL;
1131 goto out;
1132 }
1133 break;
1134 case ZLOOP_OPT_CAPACITY:
1135 if (match_uint(args, &token)) {
1136 ret = -EINVAL;
1137 goto out;
1138 }
1139 if (!token) {
1140 pr_err("Invalid capacity\n");
1141 ret = -EINVAL;
1142 goto out;
1143 }
1144 opts->capacity =
1145 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1146 break;
1147 case ZLOOP_OPT_ZONE_SIZE:
1148 if (match_uint(args, &token)) {
1149 ret = -EINVAL;
1150 goto out;
1151 }
1152 if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
1153 !is_power_of_2(token)) {
1154 pr_err("Invalid zone size %u\n", token);
1155 ret = -EINVAL;
1156 goto out;
1157 }
1158 opts->zone_size =
1159 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1160 break;
1161 case ZLOOP_OPT_ZONE_CAPACITY:
1162 if (match_uint(args, &token)) {
1163 ret = -EINVAL;
1164 goto out;
1165 }
1166 if (!token) {
1167 pr_err("Invalid zone capacity\n");
1168 ret = -EINVAL;
1169 goto out;
1170 }
1171 opts->zone_capacity =
1172 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1173 break;
1174 case ZLOOP_OPT_NR_CONV_ZONES:
1175 if (match_uint(args, &token)) {
1176 ret = -EINVAL;
1177 goto out;
1178 }
1179 opts->nr_conv_zones = token;
1180 break;
1181 case ZLOOP_OPT_BASE_DIR:
1182 p = match_strdup(args);
1183 if (!p) {
1184 ret = -ENOMEM;
1185 goto out;
1186 }
1187 kfree(opts->base_dir);
1188 opts->base_dir = p;
1189 break;
1190 case ZLOOP_OPT_NR_QUEUES:
1191 if (match_uint(args, &token)) {
1192 ret = -EINVAL;
1193 goto out;
1194 }
1195 if (!token) {
1196 pr_err("Invalid number of queues\n");
1197 ret = -EINVAL;
1198 goto out;
1199 }
1200 opts->nr_queues = min(token, num_online_cpus());
1201 break;
1202 case ZLOOP_OPT_QUEUE_DEPTH:
1203 if (match_uint(args, &token)) {
1204 ret = -EINVAL;
1205 goto out;
1206 }
1207 if (!token) {
1208 pr_err("Invalid queue depth\n");
1209 ret = -EINVAL;
1210 goto out;
1211 }
1212 opts->queue_depth = token;
1213 break;
1214 case ZLOOP_OPT_BUFFERED_IO:
1215 opts->buffered_io = true;
1216 break;
1217 case ZLOOP_OPT_ERR:
1218 default:
1219 pr_warn("unknown parameter or missing value '%s'\n", p);
1220 ret = -EINVAL;
1221 goto out;
1222 }
1223 }
1224
1225 ret = -EINVAL;
1226 if (opts->capacity <= opts->zone_size) {
1227 pr_err("Invalid capacity\n");
1228 goto out;
1229 }
1230
1231 if (opts->zone_capacity > opts->zone_size) {
1232 pr_err("Invalid zone capacity\n");
1233 goto out;
1234 }
1235
1236 ret = 0;
1237out:
1238 kfree(options);
1239 return ret;
1240}
1241
/* Codes of the control operations, see zloop_ctl_ops. */
enum {
	ZLOOP_CTL_ADD,
	ZLOOP_CTL_REMOVE,
};
1246
1247static struct zloop_ctl_op {
1248 int code;
1249 const char *name;
1250} zloop_ctl_ops[] = {
1251 { ZLOOP_CTL_ADD, "add" },
1252 { ZLOOP_CTL_REMOVE, "remove" },
1253 { -1, NULL },
1254};
1255
1256static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
1257 size_t count, loff_t *pos)
1258{
1259 struct zloop_options opts = { };
1260 struct zloop_ctl_op *op;
1261 const char *buf, *opts_buf;
1262 int i, ret;
1263
1264 if (count > PAGE_SIZE)
1265 return -ENOMEM;
1266
1267 buf = memdup_user_nul(ubuf, count);
1268 if (IS_ERR(buf))
1269 return PTR_ERR(buf);
1270
1271 for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
1272 op = &zloop_ctl_ops[i];
1273 if (!op->name) {
1274 pr_err("Invalid operation\n");
1275 ret = -EINVAL;
1276 goto out;
1277 }
1278 if (!strncmp(buf, op->name, strlen(op->name)))
1279 break;
1280 }
1281
1282 if (count <= strlen(op->name))
1283 opts_buf = NULL;
1284 else
1285 opts_buf = buf + strlen(op->name);
1286
1287 ret = zloop_parse_options(&opts, opts_buf);
1288 if (ret) {
1289 pr_err("Failed to parse options\n");
1290 goto out;
1291 }
1292
1293 switch (op->code) {
1294 case ZLOOP_CTL_ADD:
1295 ret = zloop_ctl_add(&opts);
1296 break;
1297 case ZLOOP_CTL_REMOVE:
1298 ret = zloop_ctl_remove(&opts);
1299 break;
1300 default:
1301 pr_err("Invalid operation\n");
1302 ret = -EINVAL;
1303 goto out;
1304 }
1305
1306out:
1307 kfree(opts.base_dir);
1308 kfree(buf);
1309 return ret ? ret : count;
1310}
1311
1312static int zloop_ctl_show(struct seq_file *seq_file, void *private)
1313{
1314 const struct match_token *tok;
1315 int i;
1316
1317 /* Add operation */
1318 seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
1319 for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
1320 tok = &zloop_opt_tokens[i];
1321 if (!tok->pattern)
1322 break;
1323 if (i)
1324 seq_putc(seq_file, ',');
1325 seq_puts(seq_file, tok->pattern);
1326 }
1327 seq_putc(seq_file, '\n');
1328
1329 /* Remove operation */
1330 seq_puts(seq_file, zloop_ctl_ops[1].name);
1331 seq_puts(seq_file, " id=%d\n");
1332
1333 return 0;
1334}
1335
1336static int zloop_ctl_open(struct inode *inode, struct file *file)
1337{
1338 file->private_data = NULL;
1339 return single_open(file, zloop_ctl_show, NULL);
1340}
1341
/* Control device release handler: tear down the seq_file set up on open. */
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
	int ret = single_release(inode, file);

	return ret;
}
1346
1347static const struct file_operations zloop_ctl_fops = {
1348 .owner = THIS_MODULE,
1349 .open = zloop_ctl_open,
1350 .release = zloop_ctl_release,
1351 .write = zloop_ctl_write,
1352 .read = seq_read,
1353};
1354
1355static struct miscdevice zloop_misc = {
1356 .minor = MISC_DYNAMIC_MINOR,
1357 .name = "zloop-control",
1358 .fops = &zloop_ctl_fops,
1359};
1360
1361static int __init zloop_init(void)
1362{
1363 int ret;
1364
1365 ret = misc_register(&zloop_misc);
1366 if (ret) {
1367 pr_err("Failed to register misc device: %d\n", ret);
1368 return ret;
1369 }
1370 pr_info("Module loaded\n");
1371
1372 return 0;
1373}
1374
1375static void __exit zloop_exit(void)
1376{
1377 misc_deregister(&zloop_misc);
1378 idr_destroy(&zloop_index_idr);
1379}
1380
1381module_init(zloop_init);
1382module_exit(zloop_exit);
1383
1384MODULE_DESCRIPTION("Zoned loopback device");
1385MODULE_LICENSE("GPL");