commit 922a763ae178901c2393424ba42b0aa1be22bf06 · tjh.dev/kernel

tjh.dev / kernel

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Merge tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs

Pull zonefs updates from Damien Le Moal:
"Add an 'explicit-open' mount option to automatically issue a
REQ_OP_ZONE_OPEN command to the device whenever a sequential zone file
is opened for writing for the first time.

This avoids 'insufficient zone resources' errors for write operations
on some drives with limited zone resources or on ZNS drives with a
limited number of active zones. From Johannes"

* tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
zonefs: document the explicit-open mount option
zonefs: open/close zone on file open/close
zonefs: provide no-lock zonefs_io_error variant
zonefs: introduce helper for zone management

Linus Torvalds 5 years ago 922a763a 7cf726a5

+233 -13

3 changed files

expand all

unified split

Documentation

filesystems

zonefs.rst

zonefs

super.c

zonefs.h

+15

Documentation/filesystems/zonefs.rst

··· 326 read-only zone discovered at run-time, as indicated in the previous section. 327 The size of the zone file is left unchanged from its last updated value. 328 329 Zonefs User Space Tools 330 ======================= 331

··· 326 read-only zone discovered at run-time, as indicated in the previous section. 327 The size of the zone file is left unchanged from its last updated value. 328 329 + A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on 330 + the number of zones that can be active, that is, zones that are in the 331 + implicit open, explicit open or closed conditions. This potential limitation 332 + translates into a risk for applications to see write IO errors due to this 333 + limit being exceeded if the zone of a file is not already active when a write 334 + request is issued by the user. 335 + 336 + To avoid these potential errors, the "explicit-open" mount option forces zones 337 + to be made active using an open zone command when a file is opened for writing 338 + for the first time. If the zone open command succeeds, the application is then 339 + guaranteed that write requests can be processed. Conversely, the 340 + "explicit-open" mount option will result in a zone close command being issued 341 + to the device on the last close() of a zone file if the zone is neither full nor 342 + empty. 343 + 344 Zonefs User Space Tools 345 ======================= 346

+208 -13

fs/zonefs/super.c

··· 24 25 #include "zonefs.h" 26 27 static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 28 unsigned int flags, struct iomap *iomap, 29 struct iomap *srcmap) ··· 335 } 336 337 /* 338 * If error=remount-ro was specified, any error result in remounting 339 * the volume as read-only. 340 */ ··· 359 * invalid data. 360 */ 361 zonefs_update_stats(inode, data_size); 362 - i_size_write(inode, data_size); 363 zi->i_wpoffset = data_size; 364 365 return 0; ··· 372 * eventually correct the file size and zonefs inode write pointer offset 373 * (which can be out of sync with the drive due to partial write failures). 374 */ 375 - static void zonefs_io_error(struct inode *inode, bool write) 376 { 377 struct zonefs_inode_info *zi = ZONEFS_I(inode); 378 struct super_block *sb = inode->i_sb; ··· 385 .write = write, 386 }; 387 int ret; 388 - 389 - mutex_lock(&zi->i_truncate_mutex); 390 391 /* 392 * Memory allocations in blkdev_report_zones() can trigger a memory ··· 401 zonefs_err(sb, "Get inode %lu zone information failed %d\n", 402 inode->i_ino, ret); 403 memalloc_noio_restore(noio_flag); 404 405 mutex_unlock(&zi->i_truncate_mutex); 406 } 407 ··· 446 if (isize == old_isize) 447 goto unlock; 448 449 - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, 450 - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); 451 - if (ret) { 452 - zonefs_err(inode->i_sb, 453 - "Zone management operation at %llu failed %d", 454 - zi->i_zsector, ret); 455 goto unlock; 456 } 457 458 zonefs_update_stats(inode, isize); ··· 647 mutex_lock(&zi->i_truncate_mutex); 648 if (i_size_read(inode) < iocb->ki_pos + size) { 649 zonefs_update_stats(inode, iocb->ki_pos + size); 650 - i_size_write(inode, iocb->ki_pos + size); 651 } 652 mutex_unlock(&zi->i_truncate_mutex); 653 } ··· 928 return ret; 929 } 930 931 static const struct file_operations zonefs_file_operations = { 932 - .open = generic_file_open, 933 .fsync = zonefs_file_fsync, 934 .mmap = zonefs_file_mmap, 935 .llseek = 
zonefs_file_llseek, ··· 1073 inode_init_once(&zi->i_vnode); 1074 mutex_init(&zi->i_truncate_mutex); 1075 init_rwsem(&zi->i_mmap_sem); 1076 1077 return &zi->i_vnode; 1078 } ··· 1124 1125 enum { 1126 Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, 1127 - Opt_err, 1128 }; 1129 1130 static const match_table_t tokens = { ··· 1132 { Opt_errors_zro, "errors=zone-ro"}, 1133 { Opt_errors_zol, "errors=zone-offline"}, 1134 { Opt_errors_repair, "errors=repair"}, 1135 { Opt_err, NULL} 1136 }; 1137 ··· 1168 case Opt_errors_repair: 1169 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 1170 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; 1171 break; 1172 default: 1173 return -EINVAL; ··· 1591 sbi->s_gid = GLOBAL_ROOT_GID; 1592 sbi->s_perm = 0640; 1593 sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; 1594 1595 ret = zonefs_read_super(sb); 1596 if (ret)

··· 24 25 #include "zonefs.h" 26 27 + static inline int zonefs_zone_mgmt(struct inode *inode, 28 + enum req_opf op) 29 + { 30 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 31 + int ret; 32 + 33 + lockdep_assert_held(&zi->i_truncate_mutex); 34 + 35 + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, 36 + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); 37 + if (ret) { 38 + zonefs_err(inode->i_sb, 39 + "Zone management operation %s at %llu failed %d\n", 40 + blk_op_str(op), zi->i_zsector, ret); 41 + return ret; 42 + } 43 + 44 + return 0; 45 + } 46 + 47 + static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) 48 + { 49 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 50 + 51 + i_size_write(inode, isize); 52 + /* 53 + * A full zone is no longer open/active and does not need 54 + * explicit closing. 55 + */ 56 + if (isize >= zi->i_max_size) 57 + zi->i_flags &= ~ZONEFS_ZONE_OPEN; 58 + } 59 + 60 static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 61 unsigned int flags, struct iomap *iomap, 62 struct iomap *srcmap) ··· 302 } 303 304 /* 305 + * If the filesystem is mounted with the explicit-open mount option, we 306 + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to 307 + * the read-only or offline condition, to avoid attempting an explicit 308 + * close of the zone when the inode file is closed. 309 + */ 310 + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && 311 + (zone->cond == BLK_ZONE_COND_OFFLINE || 312 + zone->cond == BLK_ZONE_COND_READONLY)) 313 + zi->i_flags &= ~ZONEFS_ZONE_OPEN; 314 + 315 + /* 316 * If error=remount-ro was specified, any error result in remounting 317 * the volume as read-only. 318 */ ··· 315 * invalid data. 
316 */ 317 zonefs_update_stats(inode, data_size); 318 + zonefs_i_size_write(inode, data_size); 319 zi->i_wpoffset = data_size; 320 321 return 0; ··· 328 * eventually correct the file size and zonefs inode write pointer offset 329 * (which can be out of sync with the drive due to partial write failures). 330 */ 331 + static void __zonefs_io_error(struct inode *inode, bool write) 332 { 333 struct zonefs_inode_info *zi = ZONEFS_I(inode); 334 struct super_block *sb = inode->i_sb; ··· 341 .write = write, 342 }; 343 int ret; 344 345 /* 346 * Memory allocations in blkdev_report_zones() can trigger a memory ··· 359 zonefs_err(sb, "Get inode %lu zone information failed %d\n", 360 inode->i_ino, ret); 361 memalloc_noio_restore(noio_flag); 362 + } 363 364 + static void zonefs_io_error(struct inode *inode, bool write) 365 + { 366 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 367 + 368 + mutex_lock(&zi->i_truncate_mutex); 369 + __zonefs_io_error(inode, write); 370 mutex_unlock(&zi->i_truncate_mutex); 371 } 372 ··· 397 if (isize == old_isize) 398 goto unlock; 399 400 + ret = zonefs_zone_mgmt(inode, op); 401 + if (ret) 402 goto unlock; 403 + 404 + /* 405 + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 406 + * take care of open zones. 407 + */ 408 + if (zi->i_flags & ZONEFS_ZONE_OPEN) { 409 + /* 410 + * Truncating a zone to EMPTY or FULL is the equivalent of 411 + * closing the zone. For a truncation to 0, we need to 412 + * re-open the zone to ensure new writes can be processed. 413 + * For a truncation to the maximum file size, the zone is 414 + * closed and writes cannot be accepted anymore, so clear 415 + * the open flag. 
416 + */ 417 + if (!isize) 418 + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 419 + else 420 + zi->i_flags &= ~ZONEFS_ZONE_OPEN; 421 } 422 423 zonefs_update_stats(inode, isize); ··· 584 mutex_lock(&zi->i_truncate_mutex); 585 if (i_size_read(inode) < iocb->ki_pos + size) { 586 zonefs_update_stats(inode, iocb->ki_pos + size); 587 + zonefs_i_size_write(inode, iocb->ki_pos + size); 588 } 589 mutex_unlock(&zi->i_truncate_mutex); 590 } ··· 865 return ret; 866 } 867 868 + static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) 869 + { 870 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 871 + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 872 + 873 + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) 874 + return false; 875 + 876 + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) 877 + return false; 878 + 879 + if (!(file->f_mode & FMODE_WRITE)) 880 + return false; 881 + 882 + return true; 883 + } 884 + 885 + static int zonefs_open_zone(struct inode *inode) 886 + { 887 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 888 + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 889 + int ret = 0; 890 + 891 + mutex_lock(&zi->i_truncate_mutex); 892 + 893 + zi->i_wr_refcnt++; 894 + if (zi->i_wr_refcnt == 1) { 895 + 896 + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { 897 + atomic_dec(&sbi->s_open_zones); 898 + ret = -EBUSY; 899 + goto unlock; 900 + } 901 + 902 + if (i_size_read(inode) < zi->i_max_size) { 903 + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 904 + if (ret) { 905 + zi->i_wr_refcnt--; 906 + atomic_dec(&sbi->s_open_zones); 907 + goto unlock; 908 + } 909 + zi->i_flags |= ZONEFS_ZONE_OPEN; 910 + } 911 + } 912 + 913 + unlock: 914 + mutex_unlock(&zi->i_truncate_mutex); 915 + 916 + return ret; 917 + } 918 + 919 + static int zonefs_file_open(struct inode *inode, struct file *file) 920 + { 921 + int ret; 922 + 923 + ret = generic_file_open(inode, file); 924 + if (ret) 925 + return ret; 926 + 927 + if 
(zonefs_file_use_exp_open(inode, file)) 928 + return zonefs_open_zone(inode); 929 + 930 + return 0; 931 + } 932 + 933 + static void zonefs_close_zone(struct inode *inode) 934 + { 935 + struct zonefs_inode_info *zi = ZONEFS_I(inode); 936 + int ret = 0; 937 + 938 + mutex_lock(&zi->i_truncate_mutex); 939 + zi->i_wr_refcnt--; 940 + if (!zi->i_wr_refcnt) { 941 + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 942 + struct super_block *sb = inode->i_sb; 943 + 944 + /* 945 + * If the file zone is full, it is not open anymore and we only 946 + * need to decrement the open count. 947 + */ 948 + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) 949 + goto dec; 950 + 951 + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 952 + if (ret) { 953 + __zonefs_io_error(inode, false); 954 + /* 955 + * Leaving zones explicitly open may lead to a state 956 + * where most zones cannot be written (zone resources 957 + * exhausted). So take preventive action by remounting 958 + * read-only. 959 + */ 960 + if (zi->i_flags & ZONEFS_ZONE_OPEN && 961 + !(sb->s_flags & SB_RDONLY)) { 962 + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); 963 + sb->s_flags |= SB_RDONLY; 964 + } 965 + } 966 + zi->i_flags &= ~ZONEFS_ZONE_OPEN; 967 + dec: 968 + atomic_dec(&sbi->s_open_zones); 969 + } 970 + mutex_unlock(&zi->i_truncate_mutex); 971 + } 972 + 973 + static int zonefs_file_release(struct inode *inode, struct file *file) 974 + { 975 + /* 976 + * If we explicitly open a zone we must close it again as well, but the 977 + * zone management operation can fail (either due to an IO error or as 978 + * the zone has gone offline or read-only). Make sure we don't fail the 979 + * close(2) for user-space. 
980 + */ 981 + if (zonefs_file_use_exp_open(inode, file)) 982 + zonefs_close_zone(inode); 983 + 984 + return 0; 985 + } 986 + 987 static const struct file_operations zonefs_file_operations = { 988 + .open = zonefs_file_open, 989 + .release = zonefs_file_release, 990 .fsync = zonefs_file_fsync, 991 .mmap = zonefs_file_mmap, 992 .llseek = zonefs_file_llseek, ··· 890 inode_init_once(&zi->i_vnode); 891 mutex_init(&zi->i_truncate_mutex); 892 init_rwsem(&zi->i_mmap_sem); 893 + zi->i_wr_refcnt = 0; 894 895 return &zi->i_vnode; 896 } ··· 940 941 enum { 942 Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, 943 + Opt_explicit_open, Opt_err, 944 }; 945 946 static const match_table_t tokens = { ··· 948 { Opt_errors_zro, "errors=zone-ro"}, 949 { Opt_errors_zol, "errors=zone-offline"}, 950 { Opt_errors_repair, "errors=repair"}, 951 + { Opt_explicit_open, "explicit-open" }, 952 { Opt_err, NULL} 953 }; 954 ··· 983 case Opt_errors_repair: 984 sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; 985 sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; 986 + break; 987 + case Opt_explicit_open: 988 + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; 989 break; 990 default: 991 return -EINVAL; ··· 1403 sbi->s_gid = GLOBAL_ROOT_GID; 1404 sbi->s_perm = 0640; 1405 sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; 1406 + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); 1407 + atomic_set(&sbi->s_open_zones, 0); 1408 + if (!sbi->s_max_open_zones && 1409 + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 1410 + zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); 1411 + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; 1412 + } 1413 1414 ret = zonefs_read_super(sb); 1415 if (ret)

+10

fs/zonefs/zonefs.h

··· 38 return ZONEFS_ZTYPE_SEQ; 39 } 40 41 /* 42 * In-memory inode data. 43 */ ··· 76 */ 77 struct mutex i_truncate_mutex; 78 struct rw_semaphore i_mmap_sem; 79 }; 80 81 static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) ··· 160 #define ZONEFS_MNTOPT_ERRORS_MASK \ 161 (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ 162 ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) 163 164 /* 165 * In-memory Super block information. ··· 182 183 loff_t s_blocks; 184 loff_t s_used_blocks; 185 }; 186 187 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)

··· 38 return ZONEFS_ZTYPE_SEQ; 39 } 40 41 + #define ZONEFS_ZONE_OPEN (1 << 0) 42 + 43 /* 44 * In-memory inode data. 45 */ ··· 74 */ 75 struct mutex i_truncate_mutex; 76 struct rw_semaphore i_mmap_sem; 77 + 78 + /* guarded by i_truncate_mutex */ 79 + unsigned int i_wr_refcnt; 80 + unsigned int i_flags; 81 }; 82 83 static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) ··· 154 #define ZONEFS_MNTOPT_ERRORS_MASK \ 155 (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ 156 ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) 157 + #define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ 158 159 /* 160 * In-memory Super block information. ··· 175 176 loff_t s_blocks; 177 loff_t s_used_blocks; 178 + 179 + unsigned int s_max_open_zones; 180 + atomic_t s_open_zones; 181 }; 182 183 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)