Merge tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs

Pull zonefs updates from Damien Le Moal:
"Add an 'explicit-open' mount option to automatically issue a
REQ_OP_ZONE_OPEN command to the device whenever a sequential zone file
is open for writing for the first time.

This avoids 'insufficient zone resources' errors for write operations
on some drives with limited zone resources or on ZNS drives with a
limited number of active zones. From Johannes"

* tag 'zonefs-5.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
zonefs: document the explicit-open mount option
zonefs: open/close zone on file open/close
zonefs: provide no-lock zonefs_io_error variant
zonefs: introduce helper for zone management
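
In practice the new option is simply another element of the zonefs mount
option string. As a minimal sketch (the device path /dev/nvme0n1 and the
mount point /mnt/zonefs are assumptions for illustration, equivalent to
"mount -t zonefs -o explicit-open /dev/nvme0n1 /mnt/zonefs"):

/* Hypothetical paths; requires CAP_SYS_ADMIN and a zoned block device. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	/* The option string is parsed by zonefs_parse_options() at mount time. */
	if (mount("/dev/nvme0n1", "/mnt/zonefs", "zonefs", 0, "explicit-open")) {
		fprintf(stderr, "mount: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}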

+233 -13
Documentation/filesystems/zonefs.rst (+15)
···
 read-only zone discovered at run-time, as indicated in the previous section.
 The size of the zone file is left unchanged from its last updated value.
 
+A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on
+the number of zones that can be active, that is, zones that are in the
+implicit open, explicit open or closed conditions. This potential limitation
+translates into a risk for applications to see write IO errors due to this
+limit being exceeded if the zone of a file is not already active when a write
+request is issued by the user.
+
+To avoid these potential errors, the "explicit-open" mount option forces zones
+to be made active using an open zone command when a file is opened for writing
+for the first time. If the zone open command succeeds, the application is then
+guaranteed that write requests can be processed. Conversely, the
+"explicit-open" mount option will result in a zone close command being issued
+to the device on the last close() of a zone file if the zone is not full nor
+empty.
+
 Zonefs User Space Tools
 =======================
 
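
To illustrate the documented behaviour from an application's point of view,
here is a hedged sketch assuming a zonefs mount at /mnt/zonefs with the
explicit-open option and a sequential zone file seq/0 (both paths are
hypothetical):

#define _GNU_SOURCE	/* for O_DIRECT */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Sequential zone files require direct, sector-aligned writes at the
	 * write pointer; the actual write is omitted to keep the sketch
	 * focused on the open/close behaviour.
	 */
	int fd = open("/mnt/zonefs/seq/0", O_WRONLY | O_DIRECT);

	if (fd < 0) {
		/*
		 * With explicit-open, a failure to activate the zone (e.g.
		 * because the device's open zone limit is reached) is
		 * reported here, rather than as an I/O error on a later
		 * write.
		 */
		fprintf(stderr, "open: %s\n", strerror(errno));
		return 1;
	}

	/* ... direct I/O appends at the write pointer would go here ... */

	/*
	 * The last close() of the file issues a zone close command if the
	 * zone is neither empty nor full.
	 */
	close(fd);
	return 0;
}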
fs/zonefs/super.c (+208 -13)
···
 
 #include "zonefs.h"
 
+static inline int zonefs_zone_mgmt(struct inode *inode,
+				   enum req_opf op)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	int ret;
+
+	lockdep_assert_held(&zi->i_truncate_mutex);
+
+	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
+			       zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
+	if (ret) {
+		zonefs_err(inode->i_sb,
+			   "Zone management operation %s at %llu failed %d\n",
+			   blk_op_str(op), zi->i_zsector, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+	i_size_write(inode, isize);
+	/*
+	 * A full zone is no longer open/active and does not need
+	 * explicit closing.
+	 */
+	if (isize >= zi->i_max_size)
+		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+}
+
 static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			      unsigned int flags, struct iomap *iomap,
 			      struct iomap *srcmap)
···
 	}
 
 	/*
+	 * If the filesystem is mounted with the explicit-open mount option, we
+	 * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
+	 * the read-only or offline condition, to avoid attempting an explicit
+	 * close of the zone when the inode file is closed.
+	 */
+	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
+	    (zone->cond == BLK_ZONE_COND_OFFLINE ||
+	     zone->cond == BLK_ZONE_COND_READONLY))
+		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+
+	/*
 	 * If error=remount-ro was specified, any error result in remounting
 	 * the volume as read-only.
 	 */
···
 	 * invalid data.
 	 */
 	zonefs_update_stats(inode, data_size);
-	i_size_write(inode, data_size);
+	zonefs_i_size_write(inode, data_size);
 	zi->i_wpoffset = data_size;
 
 	return 0;
···
 * eventually correct the file size and zonefs inode write pointer offset
 * (which can be out of sync with the drive due to partial write failures).
 */
-static void zonefs_io_error(struct inode *inode, bool write)
+static void __zonefs_io_error(struct inode *inode, bool write)
 {
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	struct super_block *sb = inode->i_sb;
···
 		.write = write,
 	};
 	int ret;
-
-	mutex_lock(&zi->i_truncate_mutex);
 
 	/*
 	 * Memory allocations in blkdev_report_zones() can trigger a memory
···
 		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
 			   inode->i_ino, ret);
 	memalloc_noio_restore(noio_flag);
+}
 
+static void zonefs_io_error(struct inode *inode, bool write)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+	mutex_lock(&zi->i_truncate_mutex);
+	__zonefs_io_error(inode, write);
 	mutex_unlock(&zi->i_truncate_mutex);
 }
···
 	if (isize == old_isize)
 		goto unlock;
 
-	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
-			       zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
-	if (ret) {
-		zonefs_err(inode->i_sb,
-			   "Zone management operation at %llu failed %d",
-			   zi->i_zsector, ret);
+	ret = zonefs_zone_mgmt(inode, op);
+	if (ret)
 		goto unlock;
+
+	/*
+	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+	 * take care of open zones.
+	 */
+	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+		/*
+		 * Truncating a zone to EMPTY or FULL is the equivalent of
+		 * closing the zone. For a truncation to 0, we need to
+		 * re-open the zone to ensure new writes can be processed.
+		 * For a truncation to the maximum file size, the zone is
+		 * closed and writes cannot be accepted anymore, so clear
+		 * the open flag.
+		 */
+		if (!isize)
+			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+		else
+			zi->i_flags &= ~ZONEFS_ZONE_OPEN;
 	}
 
 	zonefs_update_stats(inode, isize);
···
 		mutex_lock(&zi->i_truncate_mutex);
 		if (i_size_read(inode) < iocb->ki_pos + size) {
 			zonefs_update_stats(inode, iocb->ki_pos + size);
-			i_size_write(inode, iocb->ki_pos + size);
+			zonefs_i_size_write(inode, iocb->ki_pos + size);
 		}
 		mutex_unlock(&zi->i_truncate_mutex);
 	}
···
 	return ret;
 }
 
+static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+	if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
+		return false;
+
+	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+		return false;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return false;
+
+	return true;
+}
+
+static int zonefs_open_zone(struct inode *inode)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+	int ret = 0;
+
+	mutex_lock(&zi->i_truncate_mutex);
+
+	zi->i_wr_refcnt++;
+	if (zi->i_wr_refcnt == 1) {
+
+		if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
+			atomic_dec(&sbi->s_open_zones);
+			ret = -EBUSY;
+			goto unlock;
+		}
+
+		if (i_size_read(inode) < zi->i_max_size) {
+			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+			if (ret) {
+				zi->i_wr_refcnt--;
+				atomic_dec(&sbi->s_open_zones);
+				goto unlock;
+			}
+			zi->i_flags |= ZONEFS_ZONE_OPEN;
+		}
+	}
+
+unlock:
+	mutex_unlock(&zi->i_truncate_mutex);
+
+	return ret;
+}
+
+static int zonefs_file_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = generic_file_open(inode, file);
+	if (ret)
+		return ret;
+
+	if (zonefs_file_use_exp_open(inode, file))
+		return zonefs_open_zone(inode);
+
+	return 0;
+}
+
+static void zonefs_close_zone(struct inode *inode)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	int ret = 0;
+
+	mutex_lock(&zi->i_truncate_mutex);
+	zi->i_wr_refcnt--;
+	if (!zi->i_wr_refcnt) {
+		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+		struct super_block *sb = inode->i_sb;
+
+		/*
+		 * If the file zone is full, it is not open anymore and we only
+		 * need to decrement the open count.
+		 */
+		if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
+			goto dec;
+
+		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+		if (ret) {
+			__zonefs_io_error(inode, false);
+			/*
+			 * Leaving zones explicitly open may lead to a state
+			 * where most zones cannot be written (zone resources
+			 * exhausted). So take preventive action by remounting
+			 * read-only.
+			 */
+			if (zi->i_flags & ZONEFS_ZONE_OPEN &&
+			    !(sb->s_flags & SB_RDONLY)) {
+				zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+				sb->s_flags |= SB_RDONLY;
+			}
+		}
+		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+dec:
+		atomic_dec(&sbi->s_open_zones);
+	}
+	mutex_unlock(&zi->i_truncate_mutex);
+}
+
+static int zonefs_file_release(struct inode *inode, struct file *file)
+{
+	/*
+	 * If we explicitly open a zone we must close it again as well, but the
+	 * zone management operation can fail (either due to an IO error or as
+	 * the zone has gone offline or read-only). Make sure we don't fail the
+	 * close(2) for user-space.
+	 */
+	if (zonefs_file_use_exp_open(inode, file))
+		zonefs_close_zone(inode);
+
+	return 0;
+}
+
 static const struct file_operations zonefs_file_operations = {
-	.open		= generic_file_open,
+	.open		= zonefs_file_open,
+	.release	= zonefs_file_release,
 	.fsync		= zonefs_file_fsync,
 	.mmap		= zonefs_file_mmap,
 	.llseek		= zonefs_file_llseek,
···
 	inode_init_once(&zi->i_vnode);
 	mutex_init(&zi->i_truncate_mutex);
 	init_rwsem(&zi->i_mmap_sem);
+	zi->i_wr_refcnt = 0;
 
 	return &zi->i_vnode;
 }
···
 
 enum {
 	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
-	Opt_err,
+	Opt_explicit_open, Opt_err,
 };
···
 	{ Opt_errors_zro, "errors=zone-ro"},
 	{ Opt_errors_zol, "errors=zone-offline"},
 	{ Opt_errors_repair, "errors=repair"},
+	{ Opt_explicit_open, "explicit-open" },
 	{ Opt_err, NULL}
 };
···
 		case Opt_errors_repair:
 			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
 			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
+			break;
+		case Opt_explicit_open:
+			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
 			break;
 		default:
 			return -EINVAL;
···
 	sbi->s_gid = GLOBAL_ROOT_GID;
 	sbi->s_perm = 0640;
 	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+	sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
+	atomic_set(&sbi->s_open_zones, 0);
+	if (!sbi->s_max_open_zones &&
+	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+		zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
+		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+	}
 
 	ret = zonefs_read_super(sb);
 	if (ret)
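
The core of the change above is the first-writer/last-writer accounting done
in zonefs_open_zone() and zonefs_close_zone(): a per-inode writer reference
count decides when a zone open or close command is issued, and a
filesystem-wide counter is checked against the device's open zone limit. The
following user-space sketch models that accounting scheme; it is a simplified
illustration, not a copy of the kernel code, and all names (fs_state,
file_state, open_for_write, release_write) are made up for this example:

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fs_state {
	unsigned int max_open_zones;	/* plays the role of sbi->s_max_open_zones */
	atomic_uint open_zones;		/* plays the role of sbi->s_open_zones */
};

struct file_state {
	pthread_mutex_t lock;		/* plays the role of zi->i_truncate_mutex */
	unsigned int wr_refcnt;		/* plays the role of zi->i_wr_refcnt */
	bool zone_open;			/* plays the role of the ZONEFS_ZONE_OPEN flag */
};

/* Only the first writer of a file reserves an open zone slot and "opens" it. */
static int open_for_write(struct fs_state *fs, struct file_state *f)
{
	int ret = 0;

	pthread_mutex_lock(&f->lock);
	if (++f->wr_refcnt == 1) {
		if (atomic_fetch_add(&fs->open_zones, 1) + 1 > fs->max_open_zones) {
			/* Device open zone resources exhausted: undo and fail. */
			atomic_fetch_sub(&fs->open_zones, 1);
			f->wr_refcnt--;
			ret = -EBUSY;
		} else {
			/* A zone open command would be issued here. */
			f->zone_open = true;
		}
	}
	pthread_mutex_unlock(&f->lock);
	return ret;
}

/* Only the last writer of a file "closes" the zone and releases the slot. */
static void release_write(struct fs_state *fs, struct file_state *f)
{
	pthread_mutex_lock(&f->lock);
	if (--f->wr_refcnt == 0) {
		if (f->zone_open) {
			/* A zone close command would be issued here. */
			f->zone_open = false;
		}
		atomic_fetch_sub(&fs->open_zones, 1);
	}
	pthread_mutex_unlock(&f->lock);
}

int main(void)
{
	struct fs_state fs = { .max_open_zones = 1 };
	struct file_state a = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct file_state b = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("open a: %d\n", open_for_write(&fs, &a));	/* 0 */
	printf("open b: %d\n", open_for_write(&fs, &b));	/* -EBUSY, limit reached */
	release_write(&fs, &a);
	printf("open b: %d\n", open_for_write(&fs, &b));	/* 0 */
	release_write(&fs, &b);
	return 0;
}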
fs/zonefs/zonefs.h (+10)
···
 	return ZONEFS_ZTYPE_SEQ;
 }
 
+#define ZONEFS_ZONE_OPEN	(1 << 0)
+
 /*
  * In-memory inode data.
  */
···
 	 */
 	struct mutex		i_truncate_mutex;
 	struct rw_semaphore	i_mmap_sem;
+
+	/* guarded by i_truncate_mutex */
+	unsigned int		i_wr_refcnt;
+	unsigned int		i_flags;
 };
 
 static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
···
 #define ZONEFS_MNTOPT_ERRORS_MASK	\
 	(ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \
 	 ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR)
+#define ZONEFS_MNTOPT_EXPLICIT_OPEN	(1 << 4) /* Explicit open/close of zones on open/close */
 
 /*
  * In-memory Super block information.
···
 
 	loff_t			s_blocks;
 	loff_t			s_used_blocks;
+
+	unsigned int		s_max_open_zones;
+	atomic_t		s_open_zones;
 };
 
 static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)