Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

GFS2: FITRIM ioctl support

The FITRIM ioctl provides an alternative way to send discard requests to
the underlying device. Using the discard mount option results in every
freed block generating a discard request to the block device. This can
be slow, since many block devices can only process discard requests of
larger sizes, and such operations can also be time-consuming.

Rather than using the discard mount option, FITRIM allows the filesystem
to be swept on an occasional basis, and optionally allows discard
requests for smaller regions to be skipped.

In GFS2, FITRIM works at resource group granularity. Each resource group
has a flag which records whether that resource group has been trimmed.
This flag is reset whenever a deallocation occurs in the
resource group, and set whenever a successful FITRIM of that resource
group has taken place. This helps to reduce repeated discard requests
for the same block ranges, again improving performance.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

+153 -36
+2
fs/gfs2/file.c
··· 313 313 return gfs2_get_flags(filp, (u32 __user *)arg); 314 314 case FS_IOC_SETFLAGS: 315 315 return gfs2_set_flags(filp, (u32 __user *)arg); 316 + case FITRIM: 317 + return gfs2_fitrim(filp, (void __user *)arg); 316 318 } 317 319 return -ENOTTY; 318 320 }
+2 -2
fs/gfs2/inode.c
··· 1036 1036 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 1037 1037 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 1038 1038 1039 - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 1039 + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); 1040 1040 if (!rgd) 1041 1041 goto out_inodes; 1042 1042 ··· 1255 1255 * this is the case of the target file already existing 1256 1256 * so we unlink before doing the rename 1257 1257 */ 1258 - nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); 1258 + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); 1259 1259 if (nrgd) 1260 1260 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 1261 1261 }
+1 -1
fs/gfs2/lops.c
··· 76 76 if (bi->bi_clone == 0) 77 77 return; 78 78 if (sdp->sd_args.ar_discard) 79 - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); 79 + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); 80 80 memcpy(bi->bi_clone + bi->bi_offset, 81 81 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 82 82 clear_bit(GBF_FULL, &bi->bi_flags);
+138 -26
fs/gfs2/rgrp.c
··· 327 327 * Returns: The resource group, or NULL if not found 328 328 */ 329 329 330 - struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) 330 + struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) 331 331 { 332 - struct rb_node **newn; 332 + struct rb_node *n, *next; 333 333 struct gfs2_rgrpd *cur; 334 334 335 335 spin_lock(&sdp->sd_rindex_spin); 336 - newn = &sdp->sd_rindex_tree.rb_node; 337 - while (*newn) { 338 - cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); 336 + n = sdp->sd_rindex_tree.rb_node; 337 + while (n) { 338 + cur = rb_entry(n, struct gfs2_rgrpd, rd_node); 339 + next = NULL; 339 340 if (blk < cur->rd_addr) 340 - newn = &((*newn)->rb_left); 341 + next = n->rb_left; 341 342 else if (blk >= cur->rd_data0 + cur->rd_data) 342 - newn = &((*newn)->rb_right); 343 - else { 343 + next = n->rb_right; 344 + if (next == NULL) { 344 345 spin_unlock(&sdp->sd_rindex_spin); 346 + if (exact) { 347 + if (blk < cur->rd_addr) 348 + return NULL; 349 + if (blk >= cur->rd_data0 + cur->rd_data) 350 + return NULL; 351 + } 345 352 return cur; 346 353 } 354 + n = next; 347 355 } 348 356 spin_unlock(&sdp->sd_rindex_spin); 349 357 ··· 818 810 819 811 } 820 812 821 - void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 813 + int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 822 814 struct buffer_head *bh, 823 - const struct gfs2_bitmap *bi) 815 + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) 824 816 { 825 817 struct super_block *sb = sdp->sd_vfs; 826 818 struct block_device *bdev = sb->s_bdev; ··· 831 823 sector_t nr_sects = 0; 832 824 int rv; 833 825 unsigned int x; 826 + u32 trimmed = 0; 827 + u8 diff; 834 828 835 829 for (x = 0; x < bi->bi_len; x++) { 836 - const u8 *orig = bh->b_data + bi->bi_offset + x; 837 - const u8 *clone = bi->bi_clone + bi->bi_offset + x; 838 - u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); 830 + const u8 *clone = bi->bi_clone ? 
bi->bi_clone : bi->bi_bh->b_data; 831 + clone += bi->bi_offset; 832 + clone += x; 833 + if (bh) { 834 + const u8 *orig = bh->b_data + bi->bi_offset + x; 835 + diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); 836 + } else { 837 + diff = ~(*clone | (*clone >> 1)); 838 + } 839 839 diff &= 0x55; 840 840 if (diff == 0) 841 841 continue; ··· 854 838 if (nr_sects == 0) 855 839 goto start_new_extent; 856 840 if ((start + nr_sects) != blk) { 857 - rv = blkdev_issue_discard(bdev, start, 858 - nr_sects, GFP_NOFS, 859 - 0); 860 - if (rv) 861 - goto fail; 841 + if (nr_sects >= minlen) { 842 + rv = blkdev_issue_discard(bdev, 843 + start, nr_sects, 844 + GFP_NOFS, 0); 845 + if (rv) 846 + goto fail; 847 + trimmed += nr_sects; 848 + } 862 849 nr_sects = 0; 863 850 start_new_extent: 864 851 start = blk; ··· 872 853 blk += sects_per_blk; 873 854 } 874 855 } 875 - if (nr_sects) { 856 + if (nr_sects >= minlen) { 876 857 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); 877 858 if (rv) 878 859 goto fail; 860 + trimmed += nr_sects; 879 861 } 880 - return; 862 + if (ptrimmed) 863 + *ptrimmed = trimmed; 864 + return 0; 865 + 881 866 fail: 882 - fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); 867 + if (sdp->sd_args.ar_discard) 868 + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); 883 869 sdp->sd_args.ar_discard = 0; 870 + return -EIO; 871 + } 872 + 873 + /** 874 + * gfs2_fitrim - Generate discard requests for unused bits of the filesystem 875 + * @filp: Any file on the filesystem 876 + * @argp: Pointer to the arguments (also used to pass result) 877 + * 878 + * Returns: 0 on success, otherwise error code 879 + */ 880 + 881 + int gfs2_fitrim(struct file *filp, void __user *argp) 882 + { 883 + struct inode *inode = filp->f_dentry->d_inode; 884 + struct gfs2_sbd *sdp = GFS2_SB(inode); 885 + struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); 886 + struct buffer_head *bh; 887 
+ struct gfs2_rgrpd *rgd; 888 + struct gfs2_rgrpd *rgd_end; 889 + struct gfs2_holder gh; 890 + struct fstrim_range r; 891 + int ret = 0; 892 + u64 amt; 893 + u64 trimmed = 0; 894 + unsigned int x; 895 + 896 + if (!capable(CAP_SYS_ADMIN)) 897 + return -EPERM; 898 + 899 + if (!blk_queue_discard(q)) 900 + return -EOPNOTSUPP; 901 + 902 + ret = gfs2_rindex_update(sdp); 903 + if (ret) 904 + return ret; 905 + 906 + if (argp == NULL) { 907 + r.start = 0; 908 + r.len = ULLONG_MAX; 909 + r.minlen = 0; 910 + } else if (copy_from_user(&r, argp, sizeof(r))) 911 + return -EFAULT; 912 + 913 + rgd = gfs2_blk2rgrpd(sdp, r.start, 0); 914 + rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); 915 + 916 + while (1) { 917 + 918 + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); 919 + if (ret) 920 + goto out; 921 + 922 + if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { 923 + /* Trim each bitmap in the rgrp */ 924 + for (x = 0; x < rgd->rd_length; x++) { 925 + struct gfs2_bitmap *bi = rgd->rd_bits + x; 926 + ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); 927 + if (ret) { 928 + gfs2_glock_dq_uninit(&gh); 929 + goto out; 930 + } 931 + trimmed += amt; 932 + } 933 + 934 + /* Mark rgrp as having been trimmed */ 935 + ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); 936 + if (ret == 0) { 937 + bh = rgd->rd_bits[0].bi_bh; 938 + rgd->rd_flags |= GFS2_RGF_TRIMMED; 939 + gfs2_trans_add_bh(rgd->rd_gl, bh, 1); 940 + gfs2_rgrp_out(rgd, bh->b_data); 941 + gfs2_trans_end(sdp); 942 + } 943 + } 944 + gfs2_glock_dq_uninit(&gh); 945 + 946 + if (rgd == rgd_end) 947 + break; 948 + 949 + rgd = gfs2_rgrpd_get_next(rgd); 950 + } 951 + 952 + out: 953 + r.len = trimmed << 9; 954 + if (argp && copy_to_user(argp, &r, sizeof(r))) 955 + return -EFAULT; 956 + 957 + return ret; 884 958 } 885 959 886 960 /** ··· 1120 1008 if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) 1121 1009 rgd = begin = ip->i_rgd; 1122 1010 else 1123 - rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); 1011 
+ rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1124 1012 1125 1013 if (rgd == NULL) 1126 1014 return -EBADSLT; ··· 1405 1293 u32 length, rgrp_blk, buf_blk; 1406 1294 unsigned int buf; 1407 1295 1408 - rgd = gfs2_blk2rgrpd(sdp, bstart); 1296 + rgd = gfs2_blk2rgrpd(sdp, bstart, 1); 1409 1297 if (!rgd) { 1410 1298 if (gfs2_consist(sdp)) 1411 1299 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); ··· 1586 1474 return; 1587 1475 trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); 1588 1476 rgd->rd_free += blen; 1589 - 1477 + rgd->rd_flags &= ~GFS2_RGF_TRIMMED; 1590 1478 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1591 1479 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1592 1480 ··· 1679 1567 return error; 1680 1568 1681 1569 error = -EINVAL; 1682 - rgd = gfs2_blk2rgrpd(sdp, no_addr); 1570 + rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); 1683 1571 if (!rgd) 1684 1572 goto fail; 1685 1573 ··· 1722 1610 if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) 1723 1611 rgd = ip->i_rgd; 1724 1612 else 1725 - rgd = gfs2_blk2rgrpd(sdp, block); 1613 + rgd = gfs2_blk2rgrpd(sdp, block, 1); 1726 1614 if (!rgd) { 1727 1615 fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); 1728 1616 return;
+6 -4
fs/gfs2/rgrp.h
··· 11 11 #define __RGRP_DOT_H__ 12 12 13 13 #include <linux/slab.h> 14 + #include <linux/uaccess.h> 14 15 15 16 struct gfs2_rgrpd; 16 17 struct gfs2_sbd; ··· 19 18 20 19 extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); 21 20 22 - extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); 21 + extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); 23 22 extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); 24 23 extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); 25 24 ··· 63 62 extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 64 63 extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 65 64 extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 66 - extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 67 - struct buffer_head *bh, 68 - const struct gfs2_bitmap *bi); 65 + extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 66 + struct buffer_head *bh, 67 + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 68 + extern int gfs2_fitrim(struct file *filp, void __user *argp); 69 69 70 70 #endif /* __RGRP_DOT_H__ */
+1 -1
fs/gfs2/super.c
··· 1417 1417 if (error) 1418 1418 goto out; 1419 1419 1420 - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 1420 + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); 1421 1421 if (!rgd) { 1422 1422 gfs2_consist_inode(ip); 1423 1423 error = -EIO;
+2 -2
fs/gfs2/xattr.c
··· 251 251 if (!blks) 252 252 return 0; 253 253 254 - rgd = gfs2_blk2rgrpd(sdp, bn); 254 + rgd = gfs2_blk2rgrpd(sdp, bn, 1); 255 255 if (!rgd) { 256 256 gfs2_consist_inode(ip); 257 257 return -EIO; ··· 1439 1439 struct gfs2_holder gh; 1440 1440 int error; 1441 1441 1442 - rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); 1442 + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); 1443 1443 if (!rgd) { 1444 1444 gfs2_consist_inode(ip); 1445 1445 return -EIO;
+1
include/linux/gfs2_ondisk.h
··· 168 168 #define GFS2_RGF_METAONLY 0x00000002 169 169 #define GFS2_RGF_DATAONLY 0x00000004 170 170 #define GFS2_RGF_NOALLOC 0x00000008 171 + #define GFS2_RGF_TRIMMED 0x00000010 171 172 172 173 struct gfs2_rgrp { 173 174 struct gfs2_meta_header rg_header;