Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: add mount -o auto_defrag

This will detect small random writes into files and
queue them up for an auto defrag process. It isn't well suited to
database workloads yet, but works for smaller files such as rpm, sqlite
or bdb databases.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+681 -138
+1
fs/btrfs/btrfs_inode.h
··· 153 153 unsigned ordered_data_close:1; 154 154 unsigned orphan_meta_reserved:1; 155 155 unsigned dummy_inode:1; 156 + unsigned in_defrag:1; 156 157 157 158 /* 158 159 * always compress this one file
+44 -1
fs/btrfs/ctree.h
··· 1074 1074 /* all metadata allocations go through this cluster */ 1075 1075 struct btrfs_free_cluster meta_alloc_cluster; 1076 1076 1077 + /* auto defrag inodes go here */ 1078 + spinlock_t defrag_inodes_lock; 1079 + struct rb_root defrag_inodes; 1080 + atomic_t defrag_running; 1081 + 1077 1082 spinlock_t ref_cache_lock; 1078 1083 u64 total_ref_cache_size; 1079 1084 ··· 1210 1205 struct super_block anon_super; 1211 1206 }; 1212 1207 1208 + struct btrfs_ioctl_defrag_range_args { 1209 + /* start of the defrag operation */ 1210 + __u64 start; 1211 + 1212 + /* number of bytes to defrag, use (u64)-1 to say all */ 1213 + __u64 len; 1214 + 1215 + /* 1216 + * flags for the operation, which can include turning 1217 + * on compression for this one defrag 1218 + */ 1219 + __u64 flags; 1220 + 1221 + /* 1222 + * any extent bigger than this will be considered 1223 + * already defragged. Use 0 to take the kernel default 1224 + * Use 1 to say every single extent must be rewritten 1225 + */ 1226 + __u32 extent_thresh; 1227 + 1228 + /* 1229 + * which compression method to use if turning on compression 1230 + * for this defrag operation. If unspecified, zlib will 1231 + * be used 1232 + */ 1233 + __u32 compress_type; 1234 + 1235 + /* spare for later */ 1236 + __u32 unused[4]; 1237 + }; 1238 + 1239 + 1213 1240 /* 1214 1241 * inode items have the data typically returned from stat and store other 1215 1242 * info about object characteristics. 
There is one for every file and dir in ··· 1339 1302 #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1340 1303 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1341 1304 #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1305 + #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1342 1306 1343 1307 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1344 1308 #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) ··· 2566 2528 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2567 2529 void btrfs_update_iflags(struct inode *inode); 2568 2530 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2569 - 2531 + int btrfs_defrag_file(struct inode *inode, struct file *file, 2532 + struct btrfs_ioctl_defrag_range_args *range, 2533 + u64 newer_than, unsigned long max_pages); 2570 2534 /* file.c */ 2535 + int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 2536 + struct inode *inode); 2537 + int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 2571 2538 int btrfs_sync_file(struct file *file, int datasync); 2572 2539 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2573 2540 int skip_pinned);
+12
fs/btrfs/disk-io.c
··· 1475 1475 btrfs_run_delayed_iputs(root); 1476 1476 btrfs_clean_old_snapshots(root); 1477 1477 mutex_unlock(&root->fs_info->cleaner_mutex); 1478 + btrfs_run_defrag_inodes(root->fs_info); 1478 1479 } 1479 1480 1480 1481 if (freezing(current)) { ··· 1617 1616 spin_lock_init(&fs_info->ref_cache_lock); 1618 1617 spin_lock_init(&fs_info->fs_roots_radix_lock); 1619 1618 spin_lock_init(&fs_info->delayed_iput_lock); 1619 + spin_lock_init(&fs_info->defrag_inodes_lock); 1620 1620 1621 1621 init_completion(&fs_info->kobj_unregister); 1622 1622 fs_info->tree_root = tree_root; ··· 1640 1638 atomic_set(&fs_info->async_delalloc_pages, 0); 1641 1639 atomic_set(&fs_info->async_submit_draining, 0); 1642 1640 atomic_set(&fs_info->nr_async_bios, 0); 1641 + atomic_set(&fs_info->defrag_running, 0); 1643 1642 fs_info->sb = sb; 1644 1643 fs_info->max_inline = 8192 * 1024; 1645 1644 fs_info->metadata_ratio = 0; 1645 + fs_info->defrag_inodes = RB_ROOT; 1646 1646 1647 1647 fs_info->thread_pool_size = min_t(unsigned long, 1648 1648 num_online_cpus() + 2, 8); ··· 2505 2501 smp_mb(); 2506 2502 2507 2503 btrfs_scrub_cancel(root); 2504 + 2505 + /* wait for any defraggers to finish */ 2506 + wait_event(fs_info->transaction_wait, 2507 + (atomic_read(&fs_info->defrag_running) == 0)); 2508 + 2509 + /* clear out the rbtree of defraggable inodes */ 2510 + btrfs_run_defrag_inodes(root->fs_info); 2511 + 2508 2512 btrfs_put_block_group_cache(fs_info); 2509 2513 2510 2514 /*
+257
fs/btrfs/file.c
··· 40 40 #include "locking.h" 41 41 #include "compat.h" 42 42 43 + /* 44 + * when auto defrag is enabled we 45 + * queue up these defrag structs to remember which 46 + * inodes need defragging passes 47 + */ 48 + struct inode_defrag { 49 + struct rb_node rb_node; 50 + /* objectid */ 51 + u64 ino; 52 + /* 53 + * transid where the defrag was added, we search for 54 + * extents newer than this 55 + */ 56 + u64 transid; 57 + 58 + /* root objectid */ 59 + u64 root; 60 + 61 + /* last offset we were able to defrag */ 62 + u64 last_offset; 63 + 64 + /* if we've wrapped around back to zero once already */ 65 + int cycled; 66 + }; 67 + 68 + /* pop a record for an inode into the defrag tree. The lock 69 + * must be held already 70 + * 71 + * If you're inserting a record for an older transid than an 72 + * existing record, the transid already in the tree is lowered 73 + * 74 + * If an existing record is found the defrag item you 75 + * pass in is freed 76 + */ 77 + static int __btrfs_add_inode_defrag(struct inode *inode, 78 + struct inode_defrag *defrag) 79 + { 80 + struct btrfs_root *root = BTRFS_I(inode)->root; 81 + struct inode_defrag *entry; 82 + struct rb_node **p; 83 + struct rb_node *parent = NULL; 84 + 85 + p = &root->fs_info->defrag_inodes.rb_node; 86 + while (*p) { 87 + parent = *p; 88 + entry = rb_entry(parent, struct inode_defrag, rb_node); 89 + 90 + if (defrag->ino < entry->ino) 91 + p = &parent->rb_left; 92 + else if (defrag->ino > entry->ino) 93 + p = &parent->rb_right; 94 + else { 95 + /* if we're reinserting an entry for 96 + * an old defrag run, make sure to 97 + * lower the transid of our existing record 98 + */ 99 + if (defrag->transid < entry->transid) 100 + entry->transid = defrag->transid; 101 + if (defrag->last_offset > entry->last_offset) 102 + entry->last_offset = defrag->last_offset; 103 + goto exists; 104 + } 105 + } 106 + BTRFS_I(inode)->in_defrag = 1; 107 + rb_link_node(&defrag->rb_node, parent, p); 108 + rb_insert_color(&defrag->rb_node, 
&root->fs_info->defrag_inodes); 109 + return 0; 110 + 111 + exists: 112 + kfree(defrag); 113 + return 0; 114 + 115 + } 116 + 117 + /* 118 + * insert a defrag record for this inode if auto defrag is 119 + * enabled 120 + */ 121 + int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 122 + struct inode *inode) 123 + { 124 + struct btrfs_root *root = BTRFS_I(inode)->root; 125 + struct inode_defrag *defrag; 126 + int ret = 0; 127 + u64 transid; 128 + 129 + if (!btrfs_test_opt(root, AUTO_DEFRAG)) 130 + return 0; 131 + 132 + if (root->fs_info->closing) 133 + return 0; 134 + 135 + if (BTRFS_I(inode)->in_defrag) 136 + return 0; 137 + 138 + if (trans) 139 + transid = trans->transid; 140 + else 141 + transid = BTRFS_I(inode)->root->last_trans; 142 + 143 + defrag = kzalloc(sizeof(*defrag), GFP_NOFS); 144 + if (!defrag) 145 + return -ENOMEM; 146 + 147 + defrag->ino = inode->i_ino; 148 + defrag->transid = transid; 149 + defrag->root = root->root_key.objectid; 150 + 151 + spin_lock(&root->fs_info->defrag_inodes_lock); 152 + if (!BTRFS_I(inode)->in_defrag) 153 + ret = __btrfs_add_inode_defrag(inode, defrag); 154 + spin_unlock(&root->fs_info->defrag_inodes_lock); 155 + return ret; 156 + } 157 + 158 + /* 159 + * must be called with the defrag_inodes lock held 160 + */ 161 + struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, 162 + struct rb_node **next) 163 + { 164 + struct inode_defrag *entry = NULL; 165 + struct rb_node *p; 166 + struct rb_node *parent = NULL; 167 + 168 + p = info->defrag_inodes.rb_node; 169 + while (p) { 170 + parent = p; 171 + entry = rb_entry(parent, struct inode_defrag, rb_node); 172 + 173 + if (ino < entry->ino) 174 + p = parent->rb_left; 175 + else if (ino > entry->ino) 176 + p = parent->rb_right; 177 + else 178 + return entry; 179 + } 180 + 181 + if (next) { 182 + while (parent && ino > entry->ino) { 183 + parent = rb_next(parent); 184 + entry = rb_entry(parent, struct inode_defrag, rb_node); 185 + } 186 + *next = 
parent; 187 + } 188 + return NULL; 189 + } 190 + 191 + /* 192 + * run through the list of inodes in the FS that need 193 + * defragging 194 + */ 195 + int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 196 + { 197 + struct inode_defrag *defrag; 198 + struct btrfs_root *inode_root; 199 + struct inode *inode; 200 + struct rb_node *n; 201 + struct btrfs_key key; 202 + struct btrfs_ioctl_defrag_range_args range; 203 + u64 first_ino = 0; 204 + int num_defrag; 205 + int defrag_batch = 1024; 206 + 207 + memset(&range, 0, sizeof(range)); 208 + range.len = (u64)-1; 209 + 210 + atomic_inc(&fs_info->defrag_running); 211 + spin_lock(&fs_info->defrag_inodes_lock); 212 + while(1) { 213 + n = NULL; 214 + 215 + /* find an inode to defrag */ 216 + defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); 217 + if (!defrag) { 218 + if (n) 219 + defrag = rb_entry(n, struct inode_defrag, rb_node); 220 + else if (first_ino) { 221 + first_ino = 0; 222 + continue; 223 + } else { 224 + break; 225 + } 226 + } 227 + 228 + /* remove it from the rbtree */ 229 + first_ino = defrag->ino + 1; 230 + rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 231 + 232 + if (fs_info->closing) 233 + goto next_free; 234 + 235 + spin_unlock(&fs_info->defrag_inodes_lock); 236 + 237 + /* get the inode */ 238 + key.objectid = defrag->root; 239 + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 240 + key.offset = (u64)-1; 241 + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 242 + if (IS_ERR(inode_root)) 243 + goto next; 244 + 245 + key.objectid = defrag->ino; 246 + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 247 + key.offset = 0; 248 + 249 + inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 250 + if (IS_ERR(inode)) 251 + goto next; 252 + 253 + /* do a chunk of defrag */ 254 + BTRFS_I(inode)->in_defrag = 0; 255 + range.start = defrag->last_offset; 256 + num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 257 + defrag_batch); 258 + /* 259 + * if we filled the whole 
defrag batch, there 260 + * must be more work to do. Queue this defrag 261 + * again 262 + */ 263 + if (num_defrag == defrag_batch) { 264 + defrag->last_offset = range.start; 265 + __btrfs_add_inode_defrag(inode, defrag); 266 + /* 267 + * we don't want to kfree defrag, we added it back to 268 + * the rbtree 269 + */ 270 + defrag = NULL; 271 + } else if (defrag->last_offset && !defrag->cycled) { 272 + /* 273 + * we didn't fill our defrag batch, but 274 + * we didn't start at zero. Make sure we loop 275 + * around to the start of the file. 276 + */ 277 + defrag->last_offset = 0; 278 + defrag->cycled = 1; 279 + __btrfs_add_inode_defrag(inode, defrag); 280 + defrag = NULL; 281 + } 282 + 283 + iput(inode); 284 + next: 285 + spin_lock(&fs_info->defrag_inodes_lock); 286 + next_free: 287 + kfree(defrag); 288 + } 289 + spin_unlock(&fs_info->defrag_inodes_lock); 290 + 291 + atomic_dec(&fs_info->defrag_running); 292 + 293 + /* 294 + * during unmount, we use the transaction_wait queue to 295 + * wait for the defragger to stop 296 + */ 297 + wake_up(&fs_info->transaction_wait); 298 + return 0; 299 + } 43 300 44 301 /* simple helper to fault in pages and copy. This should go away 45 302 * and be replaced with calls into generic code.
+12
fs/btrfs/inode.c
··· 342 342 int will_compress; 343 343 int compress_type = root->fs_info->compress_type; 344 344 345 + /* if this is a small write inside eof, kick off a defragbot */ 346 + if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024) 347 + btrfs_add_inode_defrag(NULL, inode); 348 + 345 349 actual_end = min_t(u64, isize, end + 1); 346 350 again: 347 351 will_compress = 0; ··· 802 798 num_bytes = max(blocksize, num_bytes); 803 799 disk_num_bytes = num_bytes; 804 800 ret = 0; 801 + 802 + /* if this is a small write inside eof, kick off defrag */ 803 + if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024) 804 + btrfs_add_inode_defrag(trans, inode); 805 805 806 806 if (start == 0) { 807 807 /* lets try to make an inline extent */ ··· 5379 5371 if (IS_ERR(trans)) 5380 5372 return ERR_CAST(trans); 5381 5373 5374 + if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) 5375 + btrfs_add_inode_defrag(trans, inode); 5376 + 5382 5377 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5383 5378 5384 5379 alloc_hint = get_extent_allocation_hint(inode, start, len); ··· 6693 6682 ei->ordered_data_close = 0; 6694 6683 ei->orphan_meta_reserved = 0; 6695 6684 ei->dummy_inode = 0; 6685 + ei->in_defrag = 0; 6696 6686 ei->force_compress = BTRFS_COMPRESS_NONE; 6697 6687 6698 6688 ei->delayed_node = NULL;
+349 -105
fs/btrfs/ioctl.c
··· 656 656 return error; 657 657 } 658 658 659 + /* 660 + * When we're defragging a range, we don't want to kick it off again 661 + * if it is really just waiting for delalloc to send it down. 662 + * If we find a nice big extent or delalloc range for the bytes in the 663 + * file you want to defrag, we return 0 to let you know to skip this 664 + * part of the file 665 + */ 666 + static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) 667 + { 668 + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 669 + struct extent_map *em = NULL; 670 + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 671 + u64 end; 672 + 673 + read_lock(&em_tree->lock); 674 + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); 675 + read_unlock(&em_tree->lock); 676 + 677 + if (em) { 678 + end = extent_map_end(em); 679 + free_extent_map(em); 680 + if (end - offset > thresh) 681 + return 0; 682 + } 683 + /* if we already have a nice delalloc here, just stop */ 684 + thresh /= 2; 685 + end = count_range_bits(io_tree, &offset, offset + thresh, 686 + thresh, EXTENT_DELALLOC, 1); 687 + if (end >= thresh) 688 + return 0; 689 + return 1; 690 + } 691 + 692 + /* 693 + * helper function to walk through a file and find extents 694 + * newer than a specific transid, and smaller than thresh. 
695 + * 696 + * This is used by the defragging code to find new and small 697 + * extents 698 + */ 699 + static int find_new_extents(struct btrfs_root *root, 700 + struct inode *inode, u64 newer_than, 701 + u64 *off, int thresh) 702 + { 703 + struct btrfs_path *path; 704 + struct btrfs_key min_key; 705 + struct btrfs_key max_key; 706 + struct extent_buffer *leaf; 707 + struct btrfs_file_extent_item *extent; 708 + int type; 709 + int ret; 710 + 711 + path = btrfs_alloc_path(); 712 + if (!path) 713 + return -ENOMEM; 714 + 715 + min_key.objectid = inode->i_ino; 716 + min_key.type = BTRFS_EXTENT_DATA_KEY; 717 + min_key.offset = *off; 718 + 719 + max_key.objectid = inode->i_ino; 720 + max_key.type = (u8)-1; 721 + max_key.offset = (u64)-1; 722 + 723 + path->keep_locks = 1; 724 + 725 + while(1) { 726 + ret = btrfs_search_forward(root, &min_key, &max_key, 727 + path, 0, newer_than); 728 + if (ret != 0) 729 + goto none; 730 + if (min_key.objectid != inode->i_ino) 731 + goto none; 732 + if (min_key.type != BTRFS_EXTENT_DATA_KEY) 733 + goto none; 734 + 735 + leaf = path->nodes[0]; 736 + extent = btrfs_item_ptr(leaf, path->slots[0], 737 + struct btrfs_file_extent_item); 738 + 739 + type = btrfs_file_extent_type(leaf, extent); 740 + if (type == BTRFS_FILE_EXTENT_REG && 741 + btrfs_file_extent_num_bytes(leaf, extent) < thresh && 742 + check_defrag_in_cache(inode, min_key.offset, thresh)) { 743 + *off = min_key.offset; 744 + btrfs_free_path(path); 745 + return 0; 746 + } 747 + 748 + if (min_key.offset == (u64)-1) 749 + goto none; 750 + 751 + min_key.offset++; 752 + btrfs_release_path(path); 753 + } 754 + none: 755 + btrfs_free_path(path); 756 + return -ENOENT; 757 + } 758 + 659 759 static int should_defrag_range(struct inode *inode, u64 start, u64 len, 660 760 int thresh, u64 *last_len, u64 *skip, 661 761 u64 *defrag_end) ··· 764 664 struct extent_map *em = NULL; 765 665 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 766 666 int ret = 1; 767 - 768 - 769 - if 
(thresh == 0) 770 - thresh = 256 * 1024; 771 667 772 668 /* 773 669 * make sure that once we start defragging and extent, we keep on ··· 823 727 return ret; 824 728 } 825 729 826 - static int btrfs_defrag_file(struct file *file, 827 - struct btrfs_ioctl_defrag_range_args *range) 730 + /* 731 + * it doesn't do much good to defrag one or two pages 732 + * at a time. This pulls in a nice chunk of pages 733 + * to COW and defrag. 734 + * 735 + * It also makes sure the delalloc code has enough 736 + * dirty data to avoid making new small extents as part 737 + * of the defrag 738 + * 739 + * It's a good idea to start RA on this range 740 + * before calling this. 741 + */ 742 + static int cluster_pages_for_defrag(struct inode *inode, 743 + struct page **pages, 744 + unsigned long start_index, 745 + int num_pages) 828 746 { 829 - struct inode *inode = fdentry(file)->d_inode; 830 - struct btrfs_root *root = BTRFS_I(inode)->root; 831 - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 832 - struct btrfs_ordered_extent *ordered; 833 - struct page *page; 834 - struct btrfs_super_block *disk_super; 835 - unsigned long last_index; 836 - unsigned long ra_pages = root->fs_info->bdi.ra_pages; 837 - unsigned long total_read = 0; 838 - u64 features; 747 + unsigned long file_end; 748 + u64 isize = i_size_read(inode); 839 749 u64 page_start; 840 750 u64 page_end; 751 + int ret; 752 + int i; 753 + int i_done; 754 + struct btrfs_ordered_extent *ordered; 755 + struct extent_state *cached_state = NULL; 756 + 757 + if (isize == 0) 758 + return 0; 759 + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 760 + 761 + ret = btrfs_delalloc_reserve_space(inode, 762 + num_pages << PAGE_CACHE_SHIFT); 763 + if (ret) 764 + return ret; 765 + again: 766 + ret = 0; 767 + i_done = 0; 768 + 769 + /* step one, lock all the pages */ 770 + for (i = 0; i < num_pages; i++) { 771 + struct page *page; 772 + page = grab_cache_page(inode->i_mapping, 773 + start_index + i); 774 + if (!page) 775 + break; 776 + 777 
+ if (!PageUptodate(page)) { 778 + btrfs_readpage(NULL, page); 779 + lock_page(page); 780 + if (!PageUptodate(page)) { 781 + unlock_page(page); 782 + page_cache_release(page); 783 + ret = -EIO; 784 + break; 785 + } 786 + } 787 + isize = i_size_read(inode); 788 + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 789 + if (!isize || page->index > file_end || 790 + page->mapping != inode->i_mapping) { 791 + /* whoops, we blew past eof, skip this page */ 792 + unlock_page(page); 793 + page_cache_release(page); 794 + break; 795 + } 796 + pages[i] = page; 797 + i_done++; 798 + } 799 + if (!i_done || ret) 800 + goto out; 801 + 802 + if (!(inode->i_sb->s_flags & MS_ACTIVE)) 803 + goto out; 804 + 805 + /* 806 + * so now we have a nice long stream of locked 807 + * and up to date pages, lets wait on them 808 + */ 809 + for (i = 0; i < i_done; i++) 810 + wait_on_page_writeback(pages[i]); 811 + 812 + page_start = page_offset(pages[0]); 813 + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; 814 + 815 + lock_extent_bits(&BTRFS_I(inode)->io_tree, 816 + page_start, page_end - 1, 0, &cached_state, 817 + GFP_NOFS); 818 + ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); 819 + if (ordered && 820 + ordered->file_offset + ordered->len > page_start && 821 + ordered->file_offset < page_end) { 822 + btrfs_put_ordered_extent(ordered); 823 + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 824 + page_start, page_end - 1, 825 + &cached_state, GFP_NOFS); 826 + for (i = 0; i < i_done; i++) { 827 + unlock_page(pages[i]); 828 + page_cache_release(pages[i]); 829 + } 830 + btrfs_wait_ordered_range(inode, page_start, 831 + page_end - page_start); 832 + goto again; 833 + } 834 + if (ordered) 835 + btrfs_put_ordered_extent(ordered); 836 + 837 + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 838 + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 839 + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, 840 + GFP_NOFS); 841 + 842 + if (i_done != num_pages) { 843 + 
atomic_inc(&BTRFS_I(inode)->outstanding_extents); 844 + btrfs_delalloc_release_space(inode, 845 + (num_pages - i_done) << PAGE_CACHE_SHIFT); 846 + } 847 + 848 + 849 + btrfs_set_extent_delalloc(inode, page_start, page_end - 1, 850 + &cached_state); 851 + 852 + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 853 + page_start, page_end - 1, &cached_state, 854 + GFP_NOFS); 855 + 856 + for (i = 0; i < i_done; i++) { 857 + clear_page_dirty_for_io(pages[i]); 858 + ClearPageChecked(pages[i]); 859 + set_page_extent_mapped(pages[i]); 860 + set_page_dirty(pages[i]); 861 + unlock_page(pages[i]); 862 + page_cache_release(pages[i]); 863 + } 864 + return i_done; 865 + out: 866 + for (i = 0; i < i_done; i++) { 867 + unlock_page(pages[i]); 868 + page_cache_release(pages[i]); 869 + } 870 + btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); 871 + return ret; 872 + 873 + } 874 + 875 + int btrfs_defrag_file(struct inode *inode, struct file *file, 876 + struct btrfs_ioctl_defrag_range_args *range, 877 + u64 newer_than, unsigned long max_to_defrag) 878 + { 879 + struct btrfs_root *root = BTRFS_I(inode)->root; 880 + struct btrfs_super_block *disk_super; 881 + struct file_ra_state *ra = NULL; 882 + unsigned long last_index; 883 + u64 features; 841 884 u64 last_len = 0; 842 885 u64 skip = 0; 843 886 u64 defrag_end = 0; 887 + u64 newer_off = range->start; 888 + int newer_left = 0; 844 889 unsigned long i; 845 890 int ret; 891 + int defrag_count = 0; 846 892 int compress_type = BTRFS_COMPRESS_ZLIB; 893 + int extent_thresh = range->extent_thresh; 894 + int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 895 + u64 new_align = ~((u64)128 * 1024 - 1); 896 + struct page **pages = NULL; 897 + 898 + if (extent_thresh == 0) 899 + extent_thresh = 256 * 1024; 847 900 848 901 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { 849 902 if (range->compress_type > BTRFS_COMPRESS_TYPES) ··· 1004 759 if (inode->i_size == 0) 1005 760 return 0; 1006 761 762 + /* 763 + * if we were not given a 
file, allocate a readahead 764 + * context 765 + */ 766 + if (!file) { 767 + ra = kzalloc(sizeof(*ra), GFP_NOFS); 768 + if (!ra) 769 + return -ENOMEM; 770 + file_ra_state_init(ra, inode->i_mapping); 771 + } else { 772 + ra = &file->f_ra; 773 + } 774 + 775 + pages = kmalloc(sizeof(struct page *) * newer_cluster, 776 + GFP_NOFS); 777 + if (!pages) { 778 + ret = -ENOMEM; 779 + goto out_ra; 780 + } 781 + 782 + /* find the last page to defrag */ 1007 783 if (range->start + range->len > range->start) { 1008 784 last_index = min_t(u64, inode->i_size - 1, 1009 785 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; ··· 1032 766 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1033 767 } 1034 768 1035 - i = range->start >> PAGE_CACHE_SHIFT; 1036 - while (i <= last_index) { 1037 - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 769 + if (newer_than) { 770 + ret = find_new_extents(root, inode, newer_than, 771 + &newer_off, 64 * 1024); 772 + if (!ret) { 773 + range->start = newer_off; 774 + /* 775 + * we always align our defrag to help keep 776 + * the extents in the file evenly spaced 777 + */ 778 + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 779 + newer_left = newer_cluster; 780 + } else 781 + goto out_ra; 782 + } else { 783 + i = range->start >> PAGE_CACHE_SHIFT; 784 + } 785 + if (!max_to_defrag) 786 + max_to_defrag = last_index - 1; 787 + 788 + while (i <= last_index && defrag_count < max_to_defrag) { 789 + /* 790 + * make sure we stop running if someone unmounts 791 + * the FS 792 + */ 793 + if (!(inode->i_sb->s_flags & MS_ACTIVE)) 794 + break; 795 + 796 + if (!newer_than && 797 + !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1038 798 PAGE_CACHE_SIZE, 1039 - range->extent_thresh, 799 + extent_thresh, 1040 800 &last_len, &skip, 1041 801 &defrag_end)) { 1042 802 unsigned long next; ··· 1074 782 i = max(i + 1, next); 1075 783 continue; 1076 784 } 1077 - 1078 - if (total_read % ra_pages == 0) { 1079 - btrfs_force_ra(inode->i_mapping, 
&file->f_ra, file, i, 1080 - min(last_index, i + ra_pages - 1)); 1081 - } 1082 - total_read++; 1083 - mutex_lock(&inode->i_mutex); 1084 785 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1085 786 BTRFS_I(inode)->force_compress = compress_type; 1086 787 1087 - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1088 - if (ret) 1089 - goto err_unlock; 1090 - again: 1091 - if (inode->i_size == 0 || 1092 - i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 1093 - ret = 0; 1094 - goto err_reservations; 1095 - } 788 + btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1096 789 1097 - page = grab_cache_page(inode->i_mapping, i); 1098 - if (!page) { 1099 - ret = -ENOMEM; 1100 - goto err_reservations; 1101 - } 790 + ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 791 + if (ret < 0) 792 + goto out_ra; 1102 793 1103 - if (!PageUptodate(page)) { 1104 - btrfs_readpage(NULL, page); 1105 - lock_page(page); 1106 - if (!PageUptodate(page)) { 1107 - unlock_page(page); 1108 - page_cache_release(page); 1109 - ret = -EIO; 1110 - goto err_reservations; 794 + defrag_count += ret; 795 + balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 796 + i += ret; 797 + 798 + if (newer_than) { 799 + if (newer_off == (u64)-1) 800 + break; 801 + 802 + newer_off = max(newer_off + 1, 803 + (u64)i << PAGE_CACHE_SHIFT); 804 + 805 + ret = find_new_extents(root, inode, 806 + newer_than, &newer_off, 807 + 64 * 1024); 808 + if (!ret) { 809 + range->start = newer_off; 810 + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 811 + newer_left = newer_cluster; 812 + } else { 813 + break; 1111 814 } 815 + } else { 816 + i++; 1112 817 } 1113 - 1114 - if (page->mapping != inode->i_mapping) { 1115 - unlock_page(page); 1116 - page_cache_release(page); 1117 - goto again; 1118 - } 1119 - 1120 - wait_on_page_writeback(page); 1121 - 1122 - if (PageDirty(page)) { 1123 - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1124 - goto loop_unlock; 1125 - } 1126 - 1127 - page_start = 
(u64)page->index << PAGE_CACHE_SHIFT; 1128 - page_end = page_start + PAGE_CACHE_SIZE - 1; 1129 - lock_extent(io_tree, page_start, page_end, GFP_NOFS); 1130 - 1131 - ordered = btrfs_lookup_ordered_extent(inode, page_start); 1132 - if (ordered) { 1133 - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 1134 - unlock_page(page); 1135 - page_cache_release(page); 1136 - btrfs_start_ordered_extent(inode, ordered, 1); 1137 - btrfs_put_ordered_extent(ordered); 1138 - goto again; 1139 - } 1140 - set_page_extent_mapped(page); 1141 - 1142 - /* 1143 - * this makes sure page_mkwrite is called on the 1144 - * page if it is dirtied again later 1145 - */ 1146 - clear_page_dirty_for_io(page); 1147 - clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, 1148 - page_end, EXTENT_DIRTY | EXTENT_DELALLOC | 1149 - EXTENT_DO_ACCOUNTING, GFP_NOFS); 1150 - 1151 - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); 1152 - ClearPageChecked(page); 1153 - set_page_dirty(page); 1154 - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 1155 - 1156 - loop_unlock: 1157 - unlock_page(page); 1158 - page_cache_release(page); 1159 - mutex_unlock(&inode->i_mutex); 1160 - 1161 - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 1162 - i++; 1163 818 } 1164 819 1165 820 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) ··· 1138 899 btrfs_set_super_incompat_flags(disk_super, features); 1139 900 } 1140 901 1141 - return 0; 902 + if (!file) 903 + kfree(ra); 904 + return defrag_count; 1142 905 1143 - err_reservations: 1144 - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1145 - err_unlock: 1146 - mutex_unlock(&inode->i_mutex); 906 + out_ra: 907 + if (!file) 908 + kfree(ra); 909 + kfree(pages); 1147 910 return ret; 1148 911 } 1149 912 ··· 1997 1756 /* the rest are all set to zero by kzalloc */ 1998 1757 range->len = (u64)-1; 1999 1758 } 2000 - ret = btrfs_defrag_file(file, range); 1759 + ret = btrfs_defrag_file(fdentry(file)->d_inode, file, 1760 + range, 0, 0); 1761 + if 
(ret > 0) 1762 + ret = 0; 2001 1763 kfree(range); 2002 1764 break; 2003 1765 default:
-31
fs/btrfs/ioctl.h
··· 181 181 #define BTRFS_DEFRAG_RANGE_COMPRESS 1 182 182 #define BTRFS_DEFRAG_RANGE_START_IO 2 183 183 184 - struct btrfs_ioctl_defrag_range_args { 185 - /* start of the defrag operation */ 186 - __u64 start; 187 - 188 - /* number of bytes to defrag, use (u64)-1 to say all */ 189 - __u64 len; 190 - 191 - /* 192 - * flags for the operation, which can include turning 193 - * on compression for this one defrag 194 - */ 195 - __u64 flags; 196 - 197 - /* 198 - * any extent bigger than this will be considered 199 - * already defragged. Use 0 to take the kernel default 200 - * Use 1 to say every single extent must be rewritten 201 - */ 202 - __u32 extent_thresh; 203 - 204 - /* 205 - * which compression method to use if turning on compression 206 - * for this defrag operation. If unspecified, zlib will 207 - * be used 208 - */ 209 - __u32 compress_type; 210 - 211 - /* spare for later */ 212 - __u32 unused[4]; 213 - }; 214 - 215 184 struct btrfs_ioctl_space_info { 216 185 __u64 flags; 217 186 __u64 total_bytes;
+6 -1
fs/btrfs/super.c
··· 160 160 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 161 161 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 162 162 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 163 - Opt_enospc_debug, Opt_subvolrootid, Opt_err, 163 + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err, 164 164 }; 165 165 166 166 static match_table_t tokens = { ··· 191 191 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 192 192 {Opt_enospc_debug, "enospc_debug"}, 193 193 {Opt_subvolrootid, "subvolrootid=%d"}, 194 + {Opt_defrag, "autodefrag"}, 194 195 {Opt_err, NULL}, 195 196 }; 196 197 ··· 369 368 break; 370 369 case Opt_enospc_debug: 371 370 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 371 + break; 372 + case Opt_defrag: 373 + printk(KERN_INFO "btrfs: enabling auto defrag"); 374 + btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 372 375 break; 373 376 case Opt_err: 374 377 printk(KERN_INFO "btrfs: unrecognized mount option "