Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

btrfs: iterate over unused chunk space in FITRIM

Since we now clean up block groups automatically as they become
empty, iterating over block groups is no longer sufficient to discard
unused space.

This patch iterates over the unused chunk space and discards any regions
that are unallocated, regardless of whether they were ever used. This is
a change for btrfs but is consistent with other file systems.

We do this in a transactionless manner since the discard process can take
a substantial amount of time and a transaction would need to be started
before the acquisition of the device list lock. That would mean a
transaction would be held open across /all/ of the discards collectively.
In order to prevent other threads from allocating or freeing chunks, we
hold the chunk mutex across the search and discard calls. We release it
between searches to allow the file system to perform more-or-less
normally. Since the running transaction can commit and disappear while
we're using the transaction pointer, we take a reference to the
transaction and release that reference after the search. This is safe
since the release would happen normally at the end of the transaction
commit after any locks are released anyway.
We also take the commit_root_sem to protect against a transaction starting
and committing while we're running.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>

Authored by Jeff Mahoney and committed by Chris Mason
499f377f 86557861

+143 -24
+101
fs/btrfs/extent-tree.c
··· 10135 10135 return unpin_extent_range(root, start, end, false); 10136 10136 } 10137 10137 10138 + /* 10139 + * It used to be that old block groups would be left around forever. 10140 + * Iterating over them would be enough to trim unused space. Since we 10141 + * now automatically remove them, we also need to iterate over unallocated 10142 + * space. 10143 + * 10144 + * We don't want a transaction for this since the discard may take a 10145 + * substantial amount of time. We don't require that a transaction be 10146 + * running, but we do need to take a running transaction into account 10147 + * to ensure that we're not discarding chunks that were released in 10148 + * the current transaction. 10149 + * 10150 + * Holding the chunks lock will prevent other threads from allocating 10151 + * or releasing chunks, but it won't prevent a running transaction 10152 + * from committing and releasing the memory that the pending chunks 10153 + * list head uses. For that, we need to take a reference to the 10154 + * transaction. 10155 + */ 10156 + static int btrfs_trim_free_extents(struct btrfs_device *device, 10157 + u64 minlen, u64 *trimmed) 10158 + { 10159 + u64 start = 0, len = 0; 10160 + int ret; 10161 + 10162 + *trimmed = 0; 10163 + 10164 + /* Not writeable = nothing to do. */ 10165 + if (!device->writeable) 10166 + return 0; 10167 + 10168 + /* No free space = nothing to do. 
*/ 10169 + if (device->total_bytes <= device->bytes_used) 10170 + return 0; 10171 + 10172 + ret = 0; 10173 + 10174 + while (1) { 10175 + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; 10176 + struct btrfs_transaction *trans; 10177 + u64 bytes; 10178 + 10179 + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 10180 + if (ret) 10181 + return ret; 10182 + 10183 + down_read(&fs_info->commit_root_sem); 10184 + 10185 + spin_lock(&fs_info->trans_lock); 10186 + trans = fs_info->running_transaction; 10187 + if (trans) 10188 + atomic_inc(&trans->use_count); 10189 + spin_unlock(&fs_info->trans_lock); 10190 + 10191 + ret = find_free_dev_extent_start(trans, device, minlen, start, 10192 + &start, &len); 10193 + if (trans) 10194 + btrfs_put_transaction(trans); 10195 + 10196 + if (ret) { 10197 + up_read(&fs_info->commit_root_sem); 10198 + mutex_unlock(&fs_info->chunk_mutex); 10199 + if (ret == -ENOSPC) 10200 + ret = 0; 10201 + break; 10202 + } 10203 + 10204 + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 10205 + up_read(&fs_info->commit_root_sem); 10206 + mutex_unlock(&fs_info->chunk_mutex); 10207 + 10208 + if (ret) 10209 + break; 10210 + 10211 + start += len; 10212 + *trimmed += bytes; 10213 + 10214 + if (fatal_signal_pending(current)) { 10215 + ret = -ERESTARTSYS; 10216 + break; 10217 + } 10218 + 10219 + cond_resched(); 10220 + } 10221 + 10222 + return ret; 10223 + } 10224 + 10138 10225 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 10139 10226 { 10140 10227 struct btrfs_fs_info *fs_info = root->fs_info; 10141 10228 struct btrfs_block_group_cache *cache = NULL; 10229 + struct btrfs_device *device; 10230 + struct list_head *devices; 10142 10231 u64 group_trimmed; 10143 10232 u64 start; 10144 10233 u64 end; ··· 10281 10192 10282 10193 cache = next_block_group(fs_info->tree_root, cache); 10283 10194 } 10195 + 10196 + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 10197 + devices = 
&root->fs_info->fs_devices->alloc_list; 10198 + list_for_each_entry(device, devices, dev_alloc_list) { 10199 + ret = btrfs_trim_free_extents(device, range->minlen, 10200 + &group_trimmed); 10201 + if (ret) 10202 + break; 10203 + 10204 + trimmed += group_trimmed; 10205 + } 10206 + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 10284 10207 10285 10208 range->len = trimmed; 10286 10209 return ret;
+39 -24
fs/btrfs/volumes.c
··· 1116 1116 return ret; 1117 1117 } 1118 1118 1119 - static int contains_pending_extent(struct btrfs_trans_handle *trans, 1119 + static int contains_pending_extent(struct btrfs_transaction *transaction, 1120 1120 struct btrfs_device *device, 1121 1121 u64 *start, u64 len) 1122 1122 { 1123 + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; 1123 1124 struct extent_map *em; 1124 - struct list_head *search_list = &trans->transaction->pending_chunks; 1125 + struct list_head *search_list = &fs_info->pinned_chunks; 1125 1126 int ret = 0; 1126 1127 u64 physical_start = *start; 1127 1128 1129 + if (transaction) 1130 + search_list = &transaction->pending_chunks; 1128 1131 again: 1129 1132 list_for_each_entry(em, search_list, list) { 1130 1133 struct map_lookup *map; ··· 1162 1159 } 1163 1160 } 1164 1161 } 1165 - if (search_list == &trans->transaction->pending_chunks) { 1166 - search_list = &trans->root->fs_info->pinned_chunks; 1162 + if (search_list != &fs_info->pinned_chunks) { 1163 + search_list = &fs_info->pinned_chunks; 1167 1164 goto again; 1168 1165 } 1169 1166 ··· 1172 1169 1173 1170 1174 1171 /* 1175 - * find_free_dev_extent - find free space in the specified device 1176 - * @device: the device which we search the free space in 1177 - * @num_bytes: the size of the free space that we need 1178 - * @start: store the start of the free space. 1179 - * @len: the size of the free space. that we find, or the size of the max 1180 - * free space if we don't find suitable free space 1172 + * find_free_dev_extent_start - find free space in the specified device 1173 + * @device: the device which we search the free space in 1174 + * @num_bytes: the size of the free space that we need 1175 + * @search_start: the position from which to begin the search 1176 + * @start: store the start of the free space. 1177 + * @len: the size of the free space. 
that we find, or the size 1178 + * of the max free space if we don't find suitable free space 1181 1179 * 1182 1180 * this uses a pretty simple search, the expectation is that it is 1183 1181 * called very infrequently and that a given device has a small number ··· 1192 1188 * But if we don't find suitable free space, it is used to store the size of 1193 1189 * the max free space. 1194 1190 */ 1195 - int find_free_dev_extent(struct btrfs_trans_handle *trans, 1196 - struct btrfs_device *device, u64 num_bytes, 1197 - u64 *start, u64 *len) 1191 + int find_free_dev_extent_start(struct btrfs_transaction *transaction, 1192 + struct btrfs_device *device, u64 num_bytes, 1193 + u64 search_start, u64 *start, u64 *len) 1198 1194 { 1199 1195 struct btrfs_key key; 1200 1196 struct btrfs_root *root = device->dev_root; ··· 1204 1200 u64 max_hole_start; 1205 1201 u64 max_hole_size; 1206 1202 u64 extent_end; 1207 - u64 search_start; 1208 1203 u64 search_end = device->total_bytes; 1209 1204 int ret; 1210 1205 int slot; 1211 1206 struct extent_buffer *l; 1212 - 1213 - /* FIXME use last free of some kind */ 1214 - 1215 - /* we don't want to overwrite the superblock on the drive, 1216 - * so we make sure to start at an offset of at least 1MB 1217 - */ 1218 - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1219 1207 1220 1208 path = btrfs_alloc_path(); 1221 1209 if (!path) ··· 1269 1273 * Have to check before we set max_hole_start, otherwise 1270 1274 * we could end up sending back this offset anyway. 
1271 1275 */ 1272 - if (contains_pending_extent(trans, device, 1276 + if (contains_pending_extent(transaction, device, 1273 1277 &search_start, 1274 1278 hole_size)) { 1275 1279 if (key.offset >= search_start) { ··· 1318 1322 if (search_end > search_start) { 1319 1323 hole_size = search_end - search_start; 1320 1324 1321 - if (contains_pending_extent(trans, device, &search_start, 1325 + if (contains_pending_extent(transaction, device, &search_start, 1322 1326 hole_size)) { 1323 1327 btrfs_release_path(path); 1324 1328 goto again; ··· 1342 1346 if (len) 1343 1347 *len = max_hole_size; 1344 1348 return ret; 1349 + } 1350 + 1351 + int find_free_dev_extent(struct btrfs_trans_handle *trans, 1352 + struct btrfs_device *device, u64 num_bytes, 1353 + u64 *start, u64 *len) 1354 + { 1355 + struct btrfs_root *root = device->dev_root; 1356 + u64 search_start; 1357 + 1358 + /* FIXME use last free of some kind */ 1359 + 1360 + /* 1361 + * we don't want to overwrite the superblock on the drive, 1362 + * so we make sure to start at an offset of at least 1MB 1363 + */ 1364 + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1365 + return find_free_dev_extent_start(trans->transaction, device, 1366 + num_bytes, search_start, start, len); 1345 1367 } 1346 1368 1347 1369 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, ··· 4214 4200 u64 start = new_size; 4215 4201 u64 len = old_size - new_size; 4216 4202 4217 - if (contains_pending_extent(trans, device, &start, len)) { 4203 + if (contains_pending_extent(trans->transaction, device, 4204 + &start, len)) { 4218 4205 unlock_chunks(root); 4219 4206 checked_pending_chunks = true; 4220 4207 failed = 0;
+3
fs/btrfs/volumes.h
··· 455 455 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); 456 456 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); 457 457 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 458 + int find_free_dev_extent_start(struct btrfs_transaction *transaction, 459 + struct btrfs_device *device, u64 num_bytes, 460 + u64 search_start, u64 *start, u64 *max_avail); 458 461 int find_free_dev_extent(struct btrfs_trans_handle *trans, 459 462 struct btrfs_device *device, u64 num_bytes, 460 463 u64 *start, u64 *max_avail);