Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
"Highlights:

- chunks that have been trimmed and unchanged since last mount are
tracked and skipped on repeated trims

- use hw assisted crc32c on more arches, speedups if native
instructions or optimized implementation is available

- the RAID56 incompat bit is automatically removed when the last
block group of that type is removed

Fixes:

- fsync fix for reflink on NODATACOW files that could lead to ENOSPC

- fix data loss after evicting an inode, renaming it, and fsyncing it

- fix fsync not persisting dentry deletions due to inode evictions

- update ctime/mtime/iversion after hole punching

- fix compression type validation (reported by KASAN)

- send won't be allowed to start when relocation is in progress, as
this can cause spurious errors or produce an incorrect send stream

Core:

- new tracepoints for space update

- tree-checker: better check for end of extents for some tree items

- preparatory work for more checksum algorithms

- run delayed iput at unlink time and don't push the work to the
cleaner thread where it's not properly throttled

- wrap block mapping to structures and helpers, base for further
refactoring

- split large files, part 1:
- space info handling
- block group reservations
- delayed refs
- delayed allocation

- other cleanups and refactoring"

* tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (103 commits)
btrfs: fix memory leak of path on error return path
btrfs: move the subvolume reservation stuff out of extent-tree.c
btrfs: migrate the delalloc space stuff to it's own home
btrfs: migrate btrfs_trans_release_chunk_metadata
btrfs: migrate the delayed refs rsv code
btrfs: Evaluate io_tree in find_lock_delalloc_range()
btrfs: migrate the global_block_rsv helpers to block-rsv.c
btrfs: migrate the block-rsv code to block-rsv.c
btrfs: stop using block_rsv_release_bytes everywhere
btrfs: cleanup the target logic in __btrfs_block_rsv_release
btrfs: export __btrfs_block_rsv_release
btrfs: export btrfs_block_rsv_add_bytes
btrfs: move btrfs_block_rsv definitions into it's own header
btrfs: Simplify update of space_info in __reserve_metadata_bytes()
btrfs: unexport can_overcommit
btrfs: move reserve_metadata_bytes and supporting code to space-info.c
btrfs: move dump_space_info to space-info.c
btrfs: export block_rsv_use_bytes
btrfs: move btrfs_space_info_add_*_bytes to space-info.c
btrfs: move the space info update macro to space-info.h
...

+3831 -3127
+2 -1
fs/btrfs/Kconfig
··· 2 2 3 3 config BTRFS_FS 4 4 tristate "Btrfs filesystem support" 5 - select LIBCRC32C 5 + select CRYPTO 6 + select CRYPTO_CRC32C 6 7 select ZLIB_INFLATE 7 8 select ZLIB_DEFLATE 8 9 select LZO_COMPRESS
+2 -1
fs/btrfs/Makefile
··· 10 10 export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ 11 11 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 12 12 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ 13 - uuid-tree.o props.o free-space-tree.o tree-checker.o 13 + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ 14 + block-rsv.o delalloc-space.o 14 15 15 16 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 16 17 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+6 -11
fs/btrfs/backref.c
··· 1465 1465 * 1466 1466 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 1467 1467 */ 1468 - int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) 1468 + int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, 1469 + struct ulist *roots, struct ulist *tmp) 1469 1470 { 1470 1471 struct btrfs_fs_info *fs_info = root->fs_info; 1471 1472 struct btrfs_trans_handle *trans; 1472 - struct ulist *tmp = NULL; 1473 - struct ulist *roots = NULL; 1474 1473 struct ulist_iterator uiter; 1475 1474 struct ulist_node *node; 1476 1475 struct seq_list elem = SEQ_LIST_INIT(elem); ··· 1480 1481 .share_count = 0, 1481 1482 }; 1482 1483 1483 - tmp = ulist_alloc(GFP_NOFS); 1484 - roots = ulist_alloc(GFP_NOFS); 1485 - if (!tmp || !roots) { 1486 - ret = -ENOMEM; 1487 - goto out; 1488 - } 1484 + ulist_init(roots); 1485 + ulist_init(tmp); 1489 1486 1490 1487 trans = btrfs_attach_transaction(root); 1491 1488 if (IS_ERR(trans)) { ··· 1522 1527 up_read(&fs_info->commit_root_sem); 1523 1528 } 1524 1529 out: 1525 - ulist_free(tmp); 1526 - ulist_free(roots); 1530 + ulist_release(roots); 1531 + ulist_release(tmp); 1527 1532 return ret; 1528 1533 } 1529 1534
+2 -1
fs/btrfs/backref.h
··· 57 57 u64 start_off, struct btrfs_path *path, 58 58 struct btrfs_inode_extref **ret_extref, 59 59 u64 *found_off); 60 - int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); 60 + int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, 61 + struct ulist *roots, struct ulist *tmp_ulist); 61 62 62 63 int __init btrfs_prelim_ref_init(void); 63 64 void __cold btrfs_prelim_ref_exit(void);
+425
fs/btrfs/block-rsv.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "ctree.h" 4 + #include "block-rsv.h" 5 + #include "space-info.h" 6 + #include "math.h" 7 + #include "transaction.h" 8 + 9 + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 10 + struct btrfs_block_rsv *block_rsv, 11 + struct btrfs_block_rsv *dest, u64 num_bytes, 12 + u64 *qgroup_to_release_ret) 13 + { 14 + struct btrfs_space_info *space_info = block_rsv->space_info; 15 + u64 qgroup_to_release = 0; 16 + u64 ret; 17 + 18 + spin_lock(&block_rsv->lock); 19 + if (num_bytes == (u64)-1) { 20 + num_bytes = block_rsv->size; 21 + qgroup_to_release = block_rsv->qgroup_rsv_size; 22 + } 23 + block_rsv->size -= num_bytes; 24 + if (block_rsv->reserved >= block_rsv->size) { 25 + num_bytes = block_rsv->reserved - block_rsv->size; 26 + block_rsv->reserved = block_rsv->size; 27 + block_rsv->full = 1; 28 + } else { 29 + num_bytes = 0; 30 + } 31 + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { 32 + qgroup_to_release = block_rsv->qgroup_rsv_reserved - 33 + block_rsv->qgroup_rsv_size; 34 + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; 35 + } else { 36 + qgroup_to_release = 0; 37 + } 38 + spin_unlock(&block_rsv->lock); 39 + 40 + ret = num_bytes; 41 + if (num_bytes > 0) { 42 + if (dest) { 43 + spin_lock(&dest->lock); 44 + if (!dest->full) { 45 + u64 bytes_to_add; 46 + 47 + bytes_to_add = dest->size - dest->reserved; 48 + bytes_to_add = min(num_bytes, bytes_to_add); 49 + dest->reserved += bytes_to_add; 50 + if (dest->reserved >= dest->size) 51 + dest->full = 1; 52 + num_bytes -= bytes_to_add; 53 + } 54 + spin_unlock(&dest->lock); 55 + } 56 + if (num_bytes) 57 + btrfs_space_info_add_old_bytes(fs_info, space_info, 58 + num_bytes); 59 + } 60 + if (qgroup_to_release_ret) 61 + *qgroup_to_release_ret = qgroup_to_release; 62 + return ret; 63 + } 64 + 65 + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 66 + struct btrfs_block_rsv *dst, u64 num_bytes, 67 + bool update_size) 68 
+ { 69 + int ret; 70 + 71 + ret = btrfs_block_rsv_use_bytes(src, num_bytes); 72 + if (ret) 73 + return ret; 74 + 75 + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); 76 + return 0; 77 + } 78 + 79 + void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 80 + { 81 + memset(rsv, 0, sizeof(*rsv)); 82 + spin_lock_init(&rsv->lock); 83 + rsv->type = type; 84 + } 85 + 86 + void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 87 + struct btrfs_block_rsv *rsv, 88 + unsigned short type) 89 + { 90 + btrfs_init_block_rsv(rsv, type); 91 + rsv->space_info = btrfs_find_space_info(fs_info, 92 + BTRFS_BLOCK_GROUP_METADATA); 93 + } 94 + 95 + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 96 + unsigned short type) 97 + { 98 + struct btrfs_block_rsv *block_rsv; 99 + 100 + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 101 + if (!block_rsv) 102 + return NULL; 103 + 104 + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); 105 + return block_rsv; 106 + } 107 + 108 + void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 109 + struct btrfs_block_rsv *rsv) 110 + { 111 + if (!rsv) 112 + return; 113 + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 114 + kfree(rsv); 115 + } 116 + 117 + int btrfs_block_rsv_add(struct btrfs_root *root, 118 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 119 + enum btrfs_reserve_flush_enum flush) 120 + { 121 + int ret; 122 + 123 + if (num_bytes == 0) 124 + return 0; 125 + 126 + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 127 + if (!ret) 128 + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); 129 + 130 + return ret; 131 + } 132 + 133 + int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 134 + { 135 + u64 num_bytes = 0; 136 + int ret = -ENOSPC; 137 + 138 + if (!block_rsv) 139 + return 0; 140 + 141 + spin_lock(&block_rsv->lock); 142 + num_bytes = div_factor(block_rsv->size, min_factor); 143 + if (block_rsv->reserved >= 
num_bytes) 144 + ret = 0; 145 + spin_unlock(&block_rsv->lock); 146 + 147 + return ret; 148 + } 149 + 150 + int btrfs_block_rsv_refill(struct btrfs_root *root, 151 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 152 + enum btrfs_reserve_flush_enum flush) 153 + { 154 + u64 num_bytes = 0; 155 + int ret = -ENOSPC; 156 + 157 + if (!block_rsv) 158 + return 0; 159 + 160 + spin_lock(&block_rsv->lock); 161 + num_bytes = min_reserved; 162 + if (block_rsv->reserved >= num_bytes) 163 + ret = 0; 164 + else 165 + num_bytes -= block_rsv->reserved; 166 + spin_unlock(&block_rsv->lock); 167 + 168 + if (!ret) 169 + return 0; 170 + 171 + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 172 + if (!ret) { 173 + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); 174 + return 0; 175 + } 176 + 177 + return ret; 178 + } 179 + 180 + u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 181 + struct btrfs_block_rsv *block_rsv, 182 + u64 num_bytes, u64 *qgroup_to_release) 183 + { 184 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 185 + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 186 + struct btrfs_block_rsv *target = NULL; 187 + 188 + /* 189 + * If we are the delayed_rsv then push to the global rsv, otherwise dump 190 + * into the delayed rsv if it is not full. 
191 + */ 192 + if (block_rsv == delayed_rsv) 193 + target = global_rsv; 194 + else if (block_rsv != global_rsv && !delayed_rsv->full) 195 + target = delayed_rsv; 196 + 197 + if (target && block_rsv->space_info != target->space_info) 198 + target = NULL; 199 + 200 + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, 201 + qgroup_to_release); 202 + } 203 + 204 + int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) 205 + { 206 + int ret = -ENOSPC; 207 + 208 + spin_lock(&block_rsv->lock); 209 + if (block_rsv->reserved >= num_bytes) { 210 + block_rsv->reserved -= num_bytes; 211 + if (block_rsv->reserved < block_rsv->size) 212 + block_rsv->full = 0; 213 + ret = 0; 214 + } 215 + spin_unlock(&block_rsv->lock); 216 + return ret; 217 + } 218 + 219 + void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 220 + u64 num_bytes, bool update_size) 221 + { 222 + spin_lock(&block_rsv->lock); 223 + block_rsv->reserved += num_bytes; 224 + if (update_size) 225 + block_rsv->size += num_bytes; 226 + else if (block_rsv->reserved >= block_rsv->size) 227 + block_rsv->full = 1; 228 + spin_unlock(&block_rsv->lock); 229 + } 230 + 231 + int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 232 + struct btrfs_block_rsv *dest, u64 num_bytes, 233 + int min_factor) 234 + { 235 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 236 + u64 min_bytes; 237 + 238 + if (global_rsv->space_info != dest->space_info) 239 + return -ENOSPC; 240 + 241 + spin_lock(&global_rsv->lock); 242 + min_bytes = div_factor(global_rsv->size, min_factor); 243 + if (global_rsv->reserved < min_bytes + num_bytes) { 244 + spin_unlock(&global_rsv->lock); 245 + return -ENOSPC; 246 + } 247 + global_rsv->reserved -= num_bytes; 248 + if (global_rsv->reserved < global_rsv->size) 249 + global_rsv->full = 0; 250 + spin_unlock(&global_rsv->lock); 251 + 252 + btrfs_block_rsv_add_bytes(dest, num_bytes, true); 253 + return 0; 254 + } 255 + 256 + void 
btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) 257 + { 258 + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 259 + struct btrfs_space_info *sinfo = block_rsv->space_info; 260 + u64 num_bytes; 261 + 262 + /* 263 + * The global block rsv is based on the size of the extent tree, the 264 + * checksum tree and the root tree. If the fs is empty we want to set 265 + * it to a minimal amount for safety. 266 + */ 267 + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 268 + btrfs_root_used(&fs_info->csum_root->root_item) + 269 + btrfs_root_used(&fs_info->tree_root->root_item); 270 + num_bytes = max_t(u64, num_bytes, SZ_16M); 271 + 272 + spin_lock(&sinfo->lock); 273 + spin_lock(&block_rsv->lock); 274 + 275 + block_rsv->size = min_t(u64, num_bytes, SZ_512M); 276 + 277 + if (block_rsv->reserved < block_rsv->size) { 278 + num_bytes = btrfs_space_info_used(sinfo, true); 279 + if (sinfo->total_bytes > num_bytes) { 280 + num_bytes = sinfo->total_bytes - num_bytes; 281 + num_bytes = min(num_bytes, 282 + block_rsv->size - block_rsv->reserved); 283 + block_rsv->reserved += num_bytes; 284 + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, 285 + num_bytes); 286 + trace_btrfs_space_reservation(fs_info, "space_info", 287 + sinfo->flags, num_bytes, 288 + 1); 289 + } 290 + } else if (block_rsv->reserved > block_rsv->size) { 291 + num_bytes = block_rsv->reserved - block_rsv->size; 292 + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, 293 + -num_bytes); 294 + trace_btrfs_space_reservation(fs_info, "space_info", 295 + sinfo->flags, num_bytes, 0); 296 + block_rsv->reserved = block_rsv->size; 297 + } 298 + 299 + if (block_rsv->reserved == block_rsv->size) 300 + block_rsv->full = 1; 301 + else 302 + block_rsv->full = 0; 303 + 304 + spin_unlock(&block_rsv->lock); 305 + spin_unlock(&sinfo->lock); 306 + } 307 + 308 + void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) 309 + { 310 + struct btrfs_space_info *space_info; 311 + 312 + 
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 313 + fs_info->chunk_block_rsv.space_info = space_info; 314 + 315 + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 316 + fs_info->global_block_rsv.space_info = space_info; 317 + fs_info->trans_block_rsv.space_info = space_info; 318 + fs_info->empty_block_rsv.space_info = space_info; 319 + fs_info->delayed_block_rsv.space_info = space_info; 320 + fs_info->delayed_refs_rsv.space_info = space_info; 321 + 322 + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; 323 + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; 324 + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 325 + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 326 + if (fs_info->quota_root) 327 + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 328 + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 329 + 330 + btrfs_update_global_block_rsv(fs_info); 331 + } 332 + 333 + void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) 334 + { 335 + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); 336 + WARN_ON(fs_info->trans_block_rsv.size > 0); 337 + WARN_ON(fs_info->trans_block_rsv.reserved > 0); 338 + WARN_ON(fs_info->chunk_block_rsv.size > 0); 339 + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 340 + WARN_ON(fs_info->delayed_block_rsv.size > 0); 341 + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 342 + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); 343 + WARN_ON(fs_info->delayed_refs_rsv.size > 0); 344 + } 345 + 346 + static struct btrfs_block_rsv *get_block_rsv( 347 + const struct btrfs_trans_handle *trans, 348 + const struct btrfs_root *root) 349 + { 350 + struct btrfs_fs_info *fs_info = root->fs_info; 351 + struct btrfs_block_rsv *block_rsv = NULL; 352 + 353 + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 354 + (root == fs_info->csum_root && trans->adding_csums) || 355 + (root == fs_info->uuid_root)) 356 
+ block_rsv = trans->block_rsv; 357 + 358 + if (!block_rsv) 359 + block_rsv = root->block_rsv; 360 + 361 + if (!block_rsv) 362 + block_rsv = &fs_info->empty_block_rsv; 363 + 364 + return block_rsv; 365 + } 366 + 367 + struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, 368 + struct btrfs_root *root, 369 + u32 blocksize) 370 + { 371 + struct btrfs_fs_info *fs_info = root->fs_info; 372 + struct btrfs_block_rsv *block_rsv; 373 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 374 + int ret; 375 + bool global_updated = false; 376 + 377 + block_rsv = get_block_rsv(trans, root); 378 + 379 + if (unlikely(block_rsv->size == 0)) 380 + goto try_reserve; 381 + again: 382 + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); 383 + if (!ret) 384 + return block_rsv; 385 + 386 + if (block_rsv->failfast) 387 + return ERR_PTR(ret); 388 + 389 + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 390 + global_updated = true; 391 + btrfs_update_global_block_rsv(fs_info); 392 + goto again; 393 + } 394 + 395 + /* 396 + * The global reserve still exists to save us from ourselves, so don't 397 + * warn_on if we are short on our delayed refs reserve. 398 + */ 399 + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && 400 + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 401 + static DEFINE_RATELIMIT_STATE(_rs, 402 + DEFAULT_RATELIMIT_INTERVAL * 10, 403 + /*DEFAULT_RATELIMIT_BURST*/ 1); 404 + if (__ratelimit(&_rs)) 405 + WARN(1, KERN_DEBUG 406 + "BTRFS: block rsv returned %d\n", ret); 407 + } 408 + try_reserve: 409 + ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, 410 + BTRFS_RESERVE_NO_FLUSH); 411 + if (!ret) 412 + return block_rsv; 413 + /* 414 + * If we couldn't reserve metadata bytes try and use some from 415 + * the global reserve if its space type is the same as the global 416 + * reservation. 
417 + */ 418 + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 419 + block_rsv->space_info == global_rsv->space_info) { 420 + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); 421 + if (!ret) 422 + return global_rsv; 423 + } 424 + return ERR_PTR(ret); 425 + }
+101
fs/btrfs/block-rsv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef BTRFS_BLOCK_RSV_H 4 + #define BTRFS_BLOCK_RSV_H 5 + 6 + struct btrfs_trans_handle; 7 + enum btrfs_reserve_flush_enum; 8 + 9 + /* 10 + * Types of block reserves 11 + */ 12 + enum { 13 + BTRFS_BLOCK_RSV_GLOBAL, 14 + BTRFS_BLOCK_RSV_DELALLOC, 15 + BTRFS_BLOCK_RSV_TRANS, 16 + BTRFS_BLOCK_RSV_CHUNK, 17 + BTRFS_BLOCK_RSV_DELOPS, 18 + BTRFS_BLOCK_RSV_DELREFS, 19 + BTRFS_BLOCK_RSV_EMPTY, 20 + BTRFS_BLOCK_RSV_TEMP, 21 + }; 22 + 23 + struct btrfs_block_rsv { 24 + u64 size; 25 + u64 reserved; 26 + struct btrfs_space_info *space_info; 27 + spinlock_t lock; 28 + unsigned short full; 29 + unsigned short type; 30 + unsigned short failfast; 31 + 32 + /* 33 + * Qgroup equivalent for @size @reserved 34 + * 35 + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care 36 + * about things like csum size nor how many tree blocks it will need to 37 + * reserve. 38 + * 39 + * Qgroup cares more about net change of the extent usage. 40 + * 41 + * So for one newly inserted file extent, in worst case it will cause 42 + * leaf split and level increase, nodesize for each file extent is 43 + * already too much. 44 + * 45 + * In short, qgroup_size/reserved is the upper limit of possible needed 46 + * qgroup metadata reservation. 
47 + */ 48 + u64 qgroup_rsv_size; 49 + u64 qgroup_rsv_reserved; 50 + }; 51 + 52 + void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); 53 + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 54 + unsigned short type); 55 + void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 56 + struct btrfs_block_rsv *rsv, 57 + unsigned short type); 58 + void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 59 + struct btrfs_block_rsv *rsv); 60 + int btrfs_block_rsv_add(struct btrfs_root *root, 61 + struct btrfs_block_rsv *block_rsv, u64 num_bytes, 62 + enum btrfs_reserve_flush_enum flush); 63 + int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); 64 + int btrfs_block_rsv_refill(struct btrfs_root *root, 65 + struct btrfs_block_rsv *block_rsv, u64 min_reserved, 66 + enum btrfs_reserve_flush_enum flush); 67 + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 68 + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, 69 + bool update_size); 70 + int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); 71 + int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 72 + struct btrfs_block_rsv *dest, u64 num_bytes, 73 + int min_factor); 74 + void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 75 + u64 num_bytes, bool update_size); 76 + u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 77 + struct btrfs_block_rsv *block_rsv, 78 + u64 num_bytes, u64 *qgroup_to_release); 79 + void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); 80 + void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); 81 + void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); 82 + struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, 83 + struct btrfs_root *root, 84 + u32 blocksize); 85 + 86 + static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 87 + struct btrfs_block_rsv *block_rsv, 88 + u64 
num_bytes) 89 + { 90 + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 91 + } 92 + 93 + static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, 94 + struct btrfs_block_rsv *block_rsv, 95 + u32 blocksize) 96 + { 97 + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); 98 + btrfs_block_rsv_release(fs_info, block_rsv, 0); 99 + } 100 + 101 + #endif /* BTRFS_BLOCK_RSV_H */
+17 -5
fs/btrfs/btrfs_inode.h
··· 337 337 clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); 338 338 } 339 339 340 + /* Array of bytes with variable length, hexadecimal format 0x1234 */ 341 + #define CSUM_FMT "0x%*phN" 342 + #define CSUM_FMT_VALUE(size, bytes) size, bytes 343 + 340 344 static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, 341 - u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) 345 + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) 342 346 { 343 347 struct btrfs_root *root = inode->root; 348 + struct btrfs_super_block *sb = root->fs_info->super_copy; 349 + const u16 csum_size = btrfs_super_csum_size(sb); 344 350 345 351 /* Output minus objectid, which is more meaningful */ 346 352 if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) 347 353 btrfs_warn_rl(root->fs_info, 348 - "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", 354 + "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 349 355 root->root_key.objectid, btrfs_ino(inode), 350 - logical_start, csum, csum_expected, mirror_num); 356 + logical_start, 357 + CSUM_FMT_VALUE(csum_size, csum), 358 + CSUM_FMT_VALUE(csum_size, csum_expected), 359 + mirror_num); 351 360 else 352 361 btrfs_warn_rl(root->fs_info, 353 - "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", 362 + "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", 354 363 root->root_key.objectid, btrfs_ino(inode), 355 - logical_start, csum, csum_expected, mirror_num); 364 + logical_start, 365 + CSUM_FMT_VALUE(csum_size, csum), 366 + CSUM_FMT_VALUE(csum_size, csum_expected), 367 + mirror_num); 356 368 } 357 369 358 370 #endif
+7 -4
fs/btrfs/check-integrity.c
··· 83 83 #include <linux/blkdev.h> 84 84 #include <linux/mm.h> 85 85 #include <linux/string.h> 86 - #include <linux/crc32c.h> 86 + #include <crypto/hash.h> 87 87 #include "ctree.h" 88 88 #include "disk-io.h" 89 89 #include "transaction.h" ··· 1710 1710 char **datav, unsigned int num_pages) 1711 1711 { 1712 1712 struct btrfs_fs_info *fs_info = state->fs_info; 1713 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1713 1714 struct btrfs_header *h; 1714 1715 u8 csum[BTRFS_CSUM_SIZE]; 1715 - u32 crc = ~(u32)0; 1716 1716 unsigned int i; 1717 1717 1718 1718 if (num_pages * PAGE_SIZE < state->metablock_size) ··· 1723 1723 if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) 1724 1724 return 1; 1725 1725 1726 + shash->tfm = fs_info->csum_shash; 1727 + crypto_shash_init(shash); 1728 + 1726 1729 for (i = 0; i < num_pages; i++) { 1727 1730 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); 1728 1731 size_t sublen = i ? PAGE_SIZE : 1729 1732 (PAGE_SIZE - BTRFS_CSUM_SIZE); 1730 1733 1731 - crc = crc32c(crc, data, sublen); 1734 + crypto_shash_update(shash, data, sublen); 1732 1735 } 1733 - btrfs_csum_final(crc, csum); 1736 + crypto_shash_final(shash, csum); 1734 1737 if (memcmp(csum, h->csum, state->csum_size)) 1735 1738 return 1; 1736 1739
+48 -17
fs/btrfs/compression.c
··· 17 17 #include <linux/slab.h> 18 18 #include <linux/sched/mm.h> 19 19 #include <linux/log2.h> 20 + #include <crypto/hash.h> 20 21 #include "ctree.h" 21 22 #include "disk-io.h" 22 23 #include "transaction.h" ··· 43 42 return NULL; 44 43 } 45 44 45 + bool btrfs_compress_is_valid_type(const char *str, size_t len) 46 + { 47 + int i; 48 + 49 + for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { 50 + size_t comp_len = strlen(btrfs_compress_types[i]); 51 + 52 + if (len < comp_len) 53 + continue; 54 + 55 + if (!strncmp(btrfs_compress_types[i], str, comp_len)) 56 + return true; 57 + } 58 + return false; 59 + } 60 + 46 61 static int btrfs_decompress_bio(struct compressed_bio *cb); 47 62 48 63 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, ··· 74 57 struct compressed_bio *cb, 75 58 u64 disk_start) 76 59 { 60 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 61 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 62 + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 77 63 int ret; 78 64 struct page *page; 79 65 unsigned long i; 80 66 char *kaddr; 81 - u32 csum; 82 - u32 *cb_sum = &cb->sums; 67 + u8 csum[BTRFS_CSUM_SIZE]; 68 + u8 *cb_sum = cb->sums; 83 69 84 70 if (inode->flags & BTRFS_INODE_NODATASUM) 85 71 return 0; 86 72 73 + shash->tfm = fs_info->csum_shash; 74 + 87 75 for (i = 0; i < cb->nr_pages; i++) { 88 76 page = cb->compressed_pages[i]; 89 - csum = ~(u32)0; 90 77 78 + crypto_shash_init(shash); 91 79 kaddr = kmap_atomic(page); 92 - csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); 93 - btrfs_csum_final(csum, (u8 *)&csum); 80 + crypto_shash_update(shash, kaddr, PAGE_SIZE); 94 81 kunmap_atomic(kaddr); 82 + crypto_shash_final(shash, (u8 *)&csum); 95 83 96 - if (csum != *cb_sum) { 97 - btrfs_print_data_csum_error(inode, disk_start, csum, 98 - *cb_sum, cb->mirror_num); 84 + if (memcmp(&csum, cb_sum, csum_size)) { 85 + btrfs_print_data_csum_error(inode, disk_start, 86 + csum, cb_sum, cb->mirror_num); 99 87 ret = -EIO; 100 88 
goto fail; 101 89 } 102 - cb_sum++; 90 + cb_sum += csum_size; 103 91 104 92 } 105 93 ret = 0; ··· 340 318 341 319 bdev = fs_info->fs_devices->latest_bdev; 342 320 343 - bio = btrfs_bio_alloc(bdev, first_byte); 321 + bio = btrfs_bio_alloc(first_byte); 322 + bio_set_dev(bio, bdev); 344 323 bio->bi_opf = REQ_OP_WRITE | write_flags; 345 324 bio->bi_private = cb; 346 325 bio->bi_end_io = end_compressed_bio_write; ··· 383 360 bio_endio(bio); 384 361 } 385 362 386 - bio = btrfs_bio_alloc(bdev, first_byte); 363 + bio = btrfs_bio_alloc(first_byte); 364 + bio_set_dev(bio, bdev); 387 365 bio->bi_opf = REQ_OP_WRITE | write_flags; 388 366 bio->bi_private = cb; 389 367 bio->bi_end_io = end_compressed_bio_write; ··· 560 536 struct extent_map *em; 561 537 blk_status_t ret = BLK_STS_RESOURCE; 562 538 int faili = 0; 563 - u32 *sums; 539 + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 540 + u8 *sums; 564 541 565 542 em_tree = &BTRFS_I(inode)->extent_tree; 566 543 ··· 583 558 cb->errors = 0; 584 559 cb->inode = inode; 585 560 cb->mirror_num = mirror_num; 586 - sums = &cb->sums; 561 + sums = cb->sums; 587 562 588 563 cb->start = em->orig_start; 589 564 em_len = em->len; ··· 622 597 /* include any pages we added in add_ra-bio_pages */ 623 598 cb->len = bio->bi_iter.bi_size; 624 599 625 - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); 600 + comp_bio = btrfs_bio_alloc(cur_disk_byte); 601 + bio_set_dev(comp_bio, bdev); 626 602 comp_bio->bi_opf = REQ_OP_READ; 627 603 comp_bio->bi_private = cb; 628 604 comp_bio->bi_end_io = end_compressed_bio_read; ··· 643 617 page->mapping = NULL; 644 618 if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < 645 619 PAGE_SIZE) { 620 + unsigned int nr_sectors; 621 + 646 622 ret = btrfs_bio_wq_end_io(fs_info, comp_bio, 647 623 BTRFS_WQ_ENDIO_DATA); 648 624 BUG_ON(ret); /* -ENOMEM */ ··· 662 634 sums); 663 635 BUG_ON(ret); /* -ENOMEM */ 664 636 } 665 - sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, 666 - fs_info->sectorsize); 637 + 
638 + nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, 639 + fs_info->sectorsize); 640 + sums += csum_size * nr_sectors; 667 641 668 642 ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); 669 643 if (ret) { ··· 673 643 bio_endio(comp_bio); 674 644 } 675 645 676 - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); 646 + comp_bio = btrfs_bio_alloc(cur_disk_byte); 647 + bio_set_dev(comp_bio, bdev); 677 648 comp_bio->bi_opf = REQ_OP_READ; 678 649 comp_bio->bi_private = cb; 679 650 comp_bio->bi_end_io = end_compressed_bio_read;
+2 -1
fs/btrfs/compression.h
··· 61 61 * the start of a variable length array of checksums only 62 62 * used by reads 63 63 */ 64 - u32 sums; 64 + u8 sums[]; 65 65 }; 66 66 67 67 static inline unsigned int btrfs_compress_type(unsigned int type_level) ··· 173 173 extern const struct btrfs_compress_op btrfs_zstd_compress; 174 174 175 175 const char* btrfs_compress_type2str(enum btrfs_compression_type type); 176 + bool btrfs_compress_is_valid_type(const char *str, size_t len); 176 177 177 178 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); 178 179
+94 -188
fs/btrfs/ctree.h
··· 19 19 #include <linux/kobject.h> 20 20 #include <trace/events/btrfs.h> 21 21 #include <asm/kmap_types.h> 22 + #include <asm/unaligned.h> 22 23 #include <linux/pagemap.h> 23 24 #include <linux/btrfs.h> 24 25 #include <linux/btrfs_tree.h> ··· 32 31 #include "extent_io.h" 33 32 #include "extent_map.h" 34 33 #include "async-thread.h" 34 + #include "block-rsv.h" 35 35 36 36 struct btrfs_trans_handle; 37 37 struct btrfs_transaction; 38 38 struct btrfs_pending_snapshot; 39 39 struct btrfs_delayed_ref_root; 40 + struct btrfs_space_info; 40 41 extern struct kmem_cache *btrfs_trans_handle_cachep; 41 42 extern struct kmem_cache *btrfs_bit_radix_cachep; 42 43 extern struct kmem_cache *btrfs_path_cachep; ··· 48 45 49 46 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ 50 47 51 - #define BTRFS_MAX_MIRRORS 3 48 + /* 49 + * Maximum number of mirrors that can be available for all profiles counting 50 + * the target device of dev-replace as one. During an active device replace 51 + * procedure, the target device of the copy operation is a mirror for the 52 + * filesystem data as well that can be used to read data in order to repair 53 + * read errors on other disks. 54 + * 55 + * Current value is derived from RAID1 with 2 copies. 
56 + */ 57 + #define BTRFS_MAX_MIRRORS (2 + 1) 52 58 53 59 #define BTRFS_MAX_LEVEL 8 54 60 ··· 84 72 85 73 /* four bytes for CRC32 */ 86 74 static const int btrfs_csum_sizes[] = { 4 }; 75 + static const char *btrfs_csum_names[] = { "crc32c" }; 87 76 88 77 #define BTRFS_EMPTY_DIR_SIZE 0 89 78 ··· 111 98 { 112 99 return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); 113 100 } 114 - 115 - struct btrfs_mapping_tree { 116 - struct extent_map_tree map_tree; 117 - }; 118 101 119 102 static inline unsigned long btrfs_chunk_item_size(int num_stripes) 120 103 { ··· 404 395 struct list_head list; 405 396 }; 406 397 407 - struct btrfs_space_info { 408 - spinlock_t lock; 409 - 410 - u64 total_bytes; /* total bytes in the space, 411 - this doesn't take mirrors into account */ 412 - u64 bytes_used; /* total bytes used, 413 - this doesn't take mirrors into account */ 414 - u64 bytes_pinned; /* total bytes pinned, will be freed when the 415 - transaction finishes */ 416 - u64 bytes_reserved; /* total bytes the allocator has reserved for 417 - current allocations */ 418 - u64 bytes_may_use; /* number of bytes that may be used for 419 - delalloc/allocations */ 420 - u64 bytes_readonly; /* total bytes that are read only */ 421 - 422 - u64 max_extent_size; /* This will hold the maximum extent size of 423 - the space info if we had an ENOSPC in the 424 - allocator. 
*/ 425 - 426 - unsigned int full:1; /* indicates that we cannot allocate any more 427 - chunks for this space */ 428 - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ 429 - 430 - unsigned int flush:1; /* set if we are trying to make space */ 431 - 432 - unsigned int force_alloc; /* set if we need to force a chunk 433 - alloc for this space */ 434 - 435 - u64 disk_used; /* total bytes used on disk */ 436 - u64 disk_total; /* total bytes on disk, takes mirrors into 437 - account */ 438 - 439 - u64 flags; 440 - 441 - /* 442 - * bytes_pinned is kept in line with what is actually pinned, as in 443 - * we've called update_block_group and dropped the bytes_used counter 444 - * and increased the bytes_pinned counter. However this means that 445 - * bytes_pinned does not reflect the bytes that will be pinned once the 446 - * delayed refs are flushed, so this counter is inc'ed every time we 447 - * call btrfs_free_extent so it is a realtime count of what will be 448 - * freed once the transaction is committed. It will be zeroed every 449 - * time the transaction commits. 450 - */ 451 - struct percpu_counter total_bytes_pinned; 452 - 453 - struct list_head list; 454 - /* Protected by the spinlock 'lock'. */ 455 - struct list_head ro_bgs; 456 - struct list_head priority_tickets; 457 - struct list_head tickets; 458 - /* 459 - * tickets_id just indicates the next ticket will be handled, so note 460 - * it's not stored per ticket. 
461 - */ 462 - u64 tickets_id; 463 - 464 - struct rw_semaphore groups_sem; 465 - /* for block groups in our same type */ 466 - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 467 - wait_queue_head_t wait; 468 - 469 - struct kobject kobj; 470 - struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; 471 - }; 472 - 473 - /* 474 - * Types of block reserves 475 - */ 476 - enum { 477 - BTRFS_BLOCK_RSV_GLOBAL, 478 - BTRFS_BLOCK_RSV_DELALLOC, 479 - BTRFS_BLOCK_RSV_TRANS, 480 - BTRFS_BLOCK_RSV_CHUNK, 481 - BTRFS_BLOCK_RSV_DELOPS, 482 - BTRFS_BLOCK_RSV_DELREFS, 483 - BTRFS_BLOCK_RSV_EMPTY, 484 - BTRFS_BLOCK_RSV_TEMP, 485 - }; 486 - 487 - struct btrfs_block_rsv { 488 - u64 size; 489 - u64 reserved; 490 - struct btrfs_space_info *space_info; 491 - spinlock_t lock; 492 - unsigned short full; 493 - unsigned short type; 494 - unsigned short failfast; 495 - 496 - /* 497 - * Qgroup equivalent for @size @reserved 498 - * 499 - * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care 500 - * about things like csum size nor how many tree blocks it will need to 501 - * reserve. 502 - * 503 - * Qgroup cares more about net change of the extent usage. 504 - * 505 - * So for one newly inserted file extent, in worst case it will cause 506 - * leaf split and level increase, nodesize for each file extent is 507 - * already too much. 508 - * 509 - * In short, qgroup_size/reserved is the upper limit of possible needed 510 - * qgroup metadata reservation. 511 - */ 512 - u64 qgroup_rsv_size; 513 - u64 qgroup_rsv_reserved; 514 - }; 515 - 516 398 /* 517 399 * free clusters are used to claim free space in relatively large chunks, 518 400 * allowing us to do less seeky writes. They are used for all metadata ··· 686 786 /* 687 787 * Indicate that balance has been set up from the ioctl and is in the 688 788 * main phase. The fs_info::balance_ctl is initialized. 789 + * Set and cleared while holding fs_info::balance_mutex. 
689 790 */ 690 791 BTRFS_FS_BALANCE_RUNNING, 691 792 692 793 /* Indicate that the cleaner thread is awake and doing something. */ 693 794 BTRFS_FS_CLEANER_RUNNING, 795 + 796 + /* 797 + * The checksumming has an optimized version and is considered fast, 798 + * so we don't need to offload checksums to workqueues. 799 + */ 800 + BTRFS_FS_CSUM_IMPL_FAST, 694 801 }; 695 802 696 803 struct btrfs_fs_info { ··· 731 824 struct extent_io_tree *pinned_extents; 732 825 733 826 /* logical->physical extent mapping */ 734 - struct btrfs_mapping_tree mapping_tree; 827 + struct extent_map_tree mapping_tree; 735 828 736 829 /* 737 830 * block reservation for extent, checksum, root tree and ··· 1066 1159 /* Block groups and devices containing active swapfiles. */ 1067 1160 spinlock_t swapfile_pins_lock; 1068 1161 struct rb_root swapfile_pins; 1162 + 1163 + struct crypto_shash *csum_shash; 1164 + 1165 + /* 1166 + * Number of send operations in progress. 1167 + * Updated while holding fs_info::balance_mutex. 1168 + */ 1169 + int send_in_progress; 1069 1170 1070 1171 #ifdef CONFIG_BTRFS_FS_REF_VERIFY 1071 1172 spinlock_t ref_verify_lock; ··· 2366 2451 return btrfs_csum_sizes[t]; 2367 2452 } 2368 2453 2454 + static inline const char *btrfs_super_csum_name(u16 csum_type) 2455 + { 2456 + /* csum type is validated at mount time */ 2457 + return btrfs_csum_names[csum_type]; 2458 + } 2369 2459 2370 2460 /* 2371 2461 * The leaf data grows from end-to-front in the node. 
··· 2562 2642 ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ 2563 2643 btrfs_item_offset_nr(leaf, slot))) 2564 2644 2645 + static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) 2646 + { 2647 + return crc32c(crc, address, length); 2648 + } 2649 + 2650 + static inline void btrfs_crc32c_final(u32 crc, u8 *result) 2651 + { 2652 + put_unaligned_le32(~crc, result); 2653 + } 2654 + 2565 2655 static inline u64 btrfs_name_hash(const char *name, int len) 2566 2656 { 2567 2657 return crc32c((u32)~1, name, len); ··· 2584 2654 int len) 2585 2655 { 2586 2656 return (u64) crc32c(parent_objectid, name, len); 2587 - } 2588 - 2589 - static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) 2590 - { 2591 - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && 2592 - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2593 2657 } 2594 2658 2595 2659 static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) ··· 2622 2698 return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; 2623 2699 } 2624 2700 2625 - int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); 2626 - bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); 2627 2701 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 2628 2702 const u64 start); 2629 2703 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); ··· 2736 2814 COMMIT_TRANS = 9, 2737 2815 }; 2738 2816 2739 - int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); 2740 - int btrfs_check_data_free_space(struct inode *inode, 2741 - struct extent_changeset **reserved, u64 start, u64 len); 2742 - void btrfs_free_reserved_data_space(struct inode *inode, 2743 - struct extent_changeset *reserved, u64 start, u64 len); 2744 - void btrfs_delalloc_release_space(struct inode *inode, 2745 - struct extent_changeset *reserved, 2746 - u64 start, u64 len, bool qgroup_free); 2747 - void 
btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 2748 - u64 len); 2749 - void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); 2817 + /* 2818 + * control flags for do_chunk_alloc's force field 2819 + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 2820 + * if we really need one. 2821 + * 2822 + * CHUNK_ALLOC_LIMITED means to only try and allocate one 2823 + * if we have very few chunks already allocated. This is 2824 + * used as part of the clustering code to help make sure 2825 + * we have a good pool of storage to cluster in, without 2826 + * filling the FS with empty chunks 2827 + * 2828 + * CHUNK_ALLOC_FORCE means it must try to allocate one 2829 + * 2830 + */ 2831 + enum btrfs_chunk_alloc_enum { 2832 + CHUNK_ALLOC_NO_FORCE, 2833 + CHUNK_ALLOC_LIMITED, 2834 + CHUNK_ALLOC_FORCE, 2835 + }; 2836 + 2837 + int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 2838 + enum btrfs_chunk_alloc_enum force); 2750 2839 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 2751 2840 struct btrfs_block_rsv *rsv, 2752 2841 int nitems, bool use_global_rsv); ··· 2767 2834 bool qgroup_free); 2768 2835 2769 2836 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); 2770 - void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 2771 - bool qgroup_free); 2772 - int btrfs_delalloc_reserve_space(struct inode *inode, 2773 - struct extent_changeset **reserved, u64 start, u64 len); 2774 - void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); 2775 - struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 2776 - unsigned short type); 2777 - void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 2778 - struct btrfs_block_rsv *rsv, 2779 - unsigned short type); 2780 - void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 2781 - struct btrfs_block_rsv *rsv); 2782 - int btrfs_block_rsv_add(struct btrfs_root *root, 2783 - 
struct btrfs_block_rsv *block_rsv, u64 num_bytes, 2784 - enum btrfs_reserve_flush_enum flush); 2785 - int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); 2786 - int btrfs_block_rsv_refill(struct btrfs_root *root, 2787 - struct btrfs_block_rsv *block_rsv, u64 min_reserved, 2788 - enum btrfs_reserve_flush_enum flush); 2789 - int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2790 - struct btrfs_block_rsv *dst_rsv, u64 num_bytes, 2791 - bool update_size); 2792 - int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 2793 - struct btrfs_block_rsv *dest, u64 num_bytes, 2794 - int min_factor); 2795 - void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 2796 - struct btrfs_block_rsv *block_rsv, 2797 - u64 num_bytes); 2798 - void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); 2799 - void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); 2800 - int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 2801 - enum btrfs_reserve_flush_enum flush); 2802 - void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 2803 - struct btrfs_block_rsv *src, 2804 - u64 num_bytes); 2805 2837 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); 2806 2838 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); 2807 2839 void btrfs_put_block_group_cache(struct btrfs_fs_info *info); ··· 3084 3186 struct btrfs_dio_private; 3085 3187 int btrfs_del_csums(struct btrfs_trans_handle *trans, 3086 3188 struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); 3087 - blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); 3189 + blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, 3190 + u8 *dst); 3088 3191 blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, 3089 3192 u64 logical_offset); 3090 3193 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, ··· 3413 3514 static inline void 
assfail(const char *expr, const char *file, int line) 3414 3515 { 3415 3516 if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { 3416 - pr_err("assertion failed: %s, file: %s, line: %d\n", 3417 - expr, file, line); 3517 + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); 3418 3518 BUG(); 3419 3519 } 3420 3520 } ··· 3497 3599 /* compatibility and incompatibility defines */ 3498 3600 3499 3601 #define btrfs_set_fs_incompat(__fs_info, opt) \ 3500 - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) 3602 + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ 3603 + #opt) 3501 3604 3502 3605 static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, 3503 - u64 flag) 3606 + u64 flag, const char* name) 3504 3607 { 3505 3608 struct btrfs_super_block *disk_super; 3506 3609 u64 features; ··· 3514 3615 if (!(features & flag)) { 3515 3616 features |= flag; 3516 3617 btrfs_set_super_incompat_flags(disk_super, features); 3517 - btrfs_info(fs_info, "setting %llu feature flag", 3518 - flag); 3618 + btrfs_info(fs_info, 3619 + "setting incompat feature flag for %s (0x%llx)", 3620 + name, flag); 3519 3621 } 3520 3622 spin_unlock(&fs_info->super_lock); 3521 3623 } 3522 3624 } 3523 3625 3524 3626 #define btrfs_clear_fs_incompat(__fs_info, opt) \ 3525 - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) 3627 + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ 3628 + #opt) 3526 3629 3527 3630 static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, 3528 - u64 flag) 3631 + u64 flag, const char* name) 3529 3632 { 3530 3633 struct btrfs_super_block *disk_super; 3531 3634 u64 features; ··· 3540 3639 if (features & flag) { 3541 3640 features &= ~flag; 3542 3641 btrfs_set_super_incompat_flags(disk_super, features); 3543 - btrfs_info(fs_info, "clearing %llu feature flag", 3544 - flag); 3642 + btrfs_info(fs_info, 3643 + "clearing incompat feature flag for %s (0x%llx)", 3644 + name, flag); 3545 
3645 } 3546 3646 spin_unlock(&fs_info->super_lock); 3547 3647 } ··· 3559 3657 } 3560 3658 3561 3659 #define btrfs_set_fs_compat_ro(__fs_info, opt) \ 3562 - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) 3660 + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ 3661 + #opt) 3563 3662 3564 3663 static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, 3565 - u64 flag) 3664 + u64 flag, const char *name) 3566 3665 { 3567 3666 struct btrfs_super_block *disk_super; 3568 3667 u64 features; ··· 3576 3673 if (!(features & flag)) { 3577 3674 features |= flag; 3578 3675 btrfs_set_super_compat_ro_flags(disk_super, features); 3579 - btrfs_info(fs_info, "setting %llu ro feature flag", 3580 - flag); 3676 + btrfs_info(fs_info, 3677 + "setting compat-ro feature flag for %s (0x%llx)", 3678 + name, flag); 3581 3679 } 3582 3680 spin_unlock(&fs_info->super_lock); 3583 3681 } 3584 3682 } 3585 3683 3586 3684 #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ 3587 - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) 3685 + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ 3686 + #opt) 3588 3687 3589 3688 static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, 3590 - u64 flag) 3689 + u64 flag, const char *name) 3591 3690 { 3592 3691 struct btrfs_super_block *disk_super; 3593 3692 u64 features; ··· 3602 3697 if (features & flag) { 3603 3698 features &= ~flag; 3604 3699 btrfs_set_super_compat_ro_flags(disk_super, features); 3605 - btrfs_info(fs_info, "clearing %llu ro feature flag", 3606 - flag); 3700 + btrfs_info(fs_info, 3701 + "clearing compat-ro feature flag for %s (0x%llx)", 3702 + name, flag); 3607 3703 } 3608 3704 spin_unlock(&fs_info->super_lock); 3609 3705 }
+494
fs/btrfs/delalloc-space.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "ctree.h" 4 + #include "delalloc-space.h" 5 + #include "block-rsv.h" 6 + #include "btrfs_inode.h" 7 + #include "space-info.h" 8 + #include "transaction.h" 9 + #include "qgroup.h" 10 + 11 + int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 12 + { 13 + struct btrfs_root *root = inode->root; 14 + struct btrfs_fs_info *fs_info = root->fs_info; 15 + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 16 + u64 used; 17 + int ret = 0; 18 + int need_commit = 2; 19 + int have_pinned_space; 20 + 21 + /* Make sure bytes are sectorsize aligned */ 22 + bytes = ALIGN(bytes, fs_info->sectorsize); 23 + 24 + if (btrfs_is_free_space_inode(inode)) { 25 + need_commit = 0; 26 + ASSERT(current->journal_info); 27 + } 28 + 29 + again: 30 + /* Make sure we have enough space to handle the data first */ 31 + spin_lock(&data_sinfo->lock); 32 + used = btrfs_space_info_used(data_sinfo, true); 33 + 34 + if (used + bytes > data_sinfo->total_bytes) { 35 + struct btrfs_trans_handle *trans; 36 + 37 + /* 38 + * If we don't have enough free bytes in this space then we need 39 + * to alloc a new chunk. 40 + */ 41 + if (!data_sinfo->full) { 42 + u64 alloc_target; 43 + 44 + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 45 + spin_unlock(&data_sinfo->lock); 46 + 47 + alloc_target = btrfs_data_alloc_profile(fs_info); 48 + /* 49 + * It is ugly that we don't call nolock join 50 + * transaction for the free space inode case here. 51 + * But it is safe because we only do the data space 52 + * reservation for the free space cache in the 53 + * transaction context, the common join transaction 54 + * just increase the counter of the current transaction 55 + * handler, doesn't try to acquire the trans_lock of 56 + * the fs. 
57 + */ 58 + trans = btrfs_join_transaction(root); 59 + if (IS_ERR(trans)) 60 + return PTR_ERR(trans); 61 + 62 + ret = btrfs_chunk_alloc(trans, alloc_target, 63 + CHUNK_ALLOC_NO_FORCE); 64 + btrfs_end_transaction(trans); 65 + if (ret < 0) { 66 + if (ret != -ENOSPC) 67 + return ret; 68 + else { 69 + have_pinned_space = 1; 70 + goto commit_trans; 71 + } 72 + } 73 + 74 + goto again; 75 + } 76 + 77 + /* 78 + * If we don't have enough pinned space to deal with this 79 + * allocation, and no removed chunk in current transaction, 80 + * don't bother committing the transaction. 81 + */ 82 + have_pinned_space = __percpu_counter_compare( 83 + &data_sinfo->total_bytes_pinned, 84 + used + bytes - data_sinfo->total_bytes, 85 + BTRFS_TOTAL_BYTES_PINNED_BATCH); 86 + spin_unlock(&data_sinfo->lock); 87 + 88 + /* Commit the current transaction and try again */ 89 + commit_trans: 90 + if (need_commit) { 91 + need_commit--; 92 + 93 + if (need_commit > 0) { 94 + btrfs_start_delalloc_roots(fs_info, -1); 95 + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, 96 + (u64)-1); 97 + } 98 + 99 + trans = btrfs_join_transaction(root); 100 + if (IS_ERR(trans)) 101 + return PTR_ERR(trans); 102 + if (have_pinned_space >= 0 || 103 + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 104 + &trans->transaction->flags) || 105 + need_commit > 0) { 106 + ret = btrfs_commit_transaction(trans); 107 + if (ret) 108 + return ret; 109 + /* 110 + * The cleaner kthread might still be doing iput 111 + * operations. Wait for it to finish so that 112 + * more space is released. We don't need to 113 + * explicitly run the delayed iputs here because 114 + * the commit_transaction would have woken up 115 + * the cleaner. 
116 + */ 117 + ret = btrfs_wait_on_delayed_iputs(fs_info); 118 + if (ret) 119 + return ret; 120 + goto again; 121 + } else { 122 + btrfs_end_transaction(trans); 123 + } 124 + } 125 + 126 + trace_btrfs_space_reservation(fs_info, 127 + "space_info:enospc", 128 + data_sinfo->flags, bytes, 1); 129 + return -ENOSPC; 130 + } 131 + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); 132 + trace_btrfs_space_reservation(fs_info, "space_info", 133 + data_sinfo->flags, bytes, 1); 134 + spin_unlock(&data_sinfo->lock); 135 + 136 + return 0; 137 + } 138 + 139 + int btrfs_check_data_free_space(struct inode *inode, 140 + struct extent_changeset **reserved, u64 start, u64 len) 141 + { 142 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 143 + int ret; 144 + 145 + /* align the range */ 146 + len = round_up(start + len, fs_info->sectorsize) - 147 + round_down(start, fs_info->sectorsize); 148 + start = round_down(start, fs_info->sectorsize); 149 + 150 + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 151 + if (ret < 0) 152 + return ret; 153 + 154 + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ 155 + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); 156 + if (ret < 0) 157 + btrfs_free_reserved_data_space_noquota(inode, start, len); 158 + else 159 + ret = 0; 160 + return ret; 161 + } 162 + 163 + /* 164 + * Called if we need to clear a data reservation for this inode 165 + * Normally in a error case. 166 + * 167 + * This one will *NOT* use accurate qgroup reserved space API, just for case 168 + * which we can't sleep and is sure it won't affect qgroup reserved space. 169 + * Like clear_bit_hook(). 
170 + */ 171 + void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 172 + u64 len) 173 + { 174 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 175 + struct btrfs_space_info *data_sinfo; 176 + 177 + /* Make sure the range is aligned to sectorsize */ 178 + len = round_up(start + len, fs_info->sectorsize) - 179 + round_down(start, fs_info->sectorsize); 180 + start = round_down(start, fs_info->sectorsize); 181 + 182 + data_sinfo = fs_info->data_sinfo; 183 + spin_lock(&data_sinfo->lock); 184 + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); 185 + trace_btrfs_space_reservation(fs_info, "space_info", 186 + data_sinfo->flags, len, 0); 187 + spin_unlock(&data_sinfo->lock); 188 + } 189 + 190 + /* 191 + * Called if we need to clear a data reservation for this inode 192 + * Normally in a error case. 193 + * 194 + * This one will handle the per-inode data rsv map for accurate reserved 195 + * space framework. 196 + */ 197 + void btrfs_free_reserved_data_space(struct inode *inode, 198 + struct extent_changeset *reserved, u64 start, u64 len) 199 + { 200 + struct btrfs_root *root = BTRFS_I(inode)->root; 201 + 202 + /* Make sure the range is aligned to sectorsize */ 203 + len = round_up(start + len, root->fs_info->sectorsize) - 204 + round_down(start, root->fs_info->sectorsize); 205 + start = round_down(start, root->fs_info->sectorsize); 206 + 207 + btrfs_free_reserved_data_space_noquota(inode, start, len); 208 + btrfs_qgroup_free_data(inode, reserved, start, len); 209 + } 210 + 211 + /** 212 + * btrfs_inode_rsv_release - release any excessive reservation. 213 + * @inode - the inode we need to release from. 214 + * @qgroup_free - free or convert qgroup meta. 215 + * Unlike normal operation, qgroup meta reservation needs to know if we are 216 + * freeing qgroup reservation or just converting it into per-trans. Normally 217 + * @qgroup_free is true for error handling, and false for normal release. 
/**
 * btrfs_inode_rsv_release - release any excessive reservation.
 * @inode - the inode we need to release from.
 * @qgroup_free - free or convert qgroup meta.
 *   Unlike normal operation, qgroup meta reservation needs to know if we are
 *   freeing qgroup reservation or just converting it into per-trans.  Normally
 *   @qgroup_free is true for error handling, and false for normal release.
 *
 * This is the same as btrfs_block_rsv_release, except that it handles the
 * tracepoint for the reservation.
 */
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 released = 0;
	u64 qgroup_to_release = 0;

	/*
	 * Since we statically set the block_rsv->size we just want to say we
	 * are releasing 0 bytes, and then we'll just get the reservation over
	 * the size free'd.
	 */
	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
					     &qgroup_to_release);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), released, 0);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
	else
		btrfs_qgroup_convert_reserved_meta(inode->root,
						   qgroup_to_release);
}

/*
 * Recompute the size of the inode's metadata block reserve (and its qgroup
 * counterpart) from the current outstanding_extents and csum_bytes.
 * Caller must hold inode->lock (asserted below).
 */
static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
						 struct btrfs_inode *inode)
{
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 reserve_size = 0;
	u64 qgroup_rsv_size = 0;
	u64 csum_leaves;
	unsigned outstanding_extents;

	lockdep_assert_held(&inode->lock);
	outstanding_extents = inode->outstanding_extents;
	if (outstanding_extents)
		/* +1 covers the inode item update itself */
		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
						outstanding_extents + 1);
	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
						 inode->csum_bytes);
	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
						       csum_leaves);
	/*
	 * For qgroup rsv, the calculation is very simple:
	 * account one nodesize for each outstanding extent
	 *
	 * This is overestimating in most cases.
	 */
	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;

	spin_lock(&block_rsv->lock);
	block_rsv->size = reserve_size;
	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
	spin_unlock(&block_rsv->lock);
}

/*
 * Compute, for a delalloc reservation of @num_bytes, the metadata bytes
 * (*@meta_reserve) and qgroup bytes (*@qgroup_reserve) that must be reserved
 * up front.
 */
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
				    u64 num_bytes, u64 *meta_reserve,
				    u64 *qgroup_reserve)
{
	u64 nr_extents = count_max_extents(num_bytes);
	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);

	/* We add one for the inode update at finish ordered time */
	*meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
						nr_extents + csum_leaves + 1);
	*qgroup_reserve = nr_extents * fs_info->nodesize;
}

/*
 * Reserve metadata space (and qgroup metadata space) for @num_bytes of
 * delalloc on @inode.  On success the inode's outstanding_extents and
 * csum_bytes are bumped and the reservation is added to the inode block rsv.
 * Returns 0 or a negative errno; all partial reservations are undone on
 * failure.
 */
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 meta_reserve, qgroup_reserve;
	unsigned nr_extents;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
	int ret = 0;
	bool delalloc_lock = true;

	/*
	 * If we are a free space inode we need to not flush since we will be in
	 * the middle of a transaction commit.  We also don't need the delalloc
	 * mutex since we won't race with anybody.  We need this mostly to make
	 * lockdep shut its filthy mouth.
	 *
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
		delalloc_lock = false;
	} else {
		if (current->journal_info)
			flush = BTRFS_RESERVE_FLUSH_LIMIT;

		if (btrfs_transaction_in_commit(fs_info))
			schedule_timeout(1);
	}

	if (delalloc_lock)
		mutex_lock(&inode->delalloc_mutex);

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);

	/*
	 * We always want to do it this way, every other way is wrong and ends
	 * in tears.  Pre-reserving the amount we are going to add will always
	 * be the right way, because otherwise if we have enough parallelism we
	 * could end up with thousands of inodes all holding little bits of
	 * reservations they were able to make previously and the only way to
	 * reclaim that space is to ENOSPC out the operations and clear
	 * everything out and try again, which is bad.  This way we just
	 * over-reserve slightly, and clean up the mess when we are done.
	 */
	calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
				&qgroup_reserve);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
	if (ret)
		goto out_fail;
	ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
	if (ret)
		goto out_qgroup;

	/*
	 * Now we need to update our outstanding extents and csum bytes _first_
	 * and then add the reservation to the block_rsv.  This keeps us from
	 * racing with an ordered completion or some such that would think it
	 * needs to free the reservation we just made.
	 */
	spin_lock(&inode->lock);
	nr_extents = count_max_extents(num_bytes);
	btrfs_mod_outstanding_extents(inode, nr_extents);
	inode->csum_bytes += num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Now we can safely add our space to our block rsv */
	btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
	trace_btrfs_space_reservation(root->fs_info, "delalloc",
				      btrfs_ino(inode), meta_reserve, 1);

	spin_lock(&block_rsv->lock);
	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
	spin_unlock(&block_rsv->lock);

	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return 0;
out_qgroup:
	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
out_fail:
	btrfs_inode_rsv_release(inode, true);
	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return ret;
}
/**
 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 * @inode: the inode to release the reservation for.
 * @num_bytes: the number of bytes we are releasing.
 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 *
 * This will release the metadata reservation for an inode.  This can be called
 * once we complete IO for a given set of bytes to release their metadata
 * reservations, or on error for the same reason.
 */
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				     bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	spin_lock(&inode->lock);
	inode->csum_bytes -= num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Sanity-test builds have no real block rsv to release */
	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_release_extents - release our outstanding_extents
 * @inode: the inode to balance the reservation for.
 * @num_bytes: the number of bytes we originally reserved with
 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
 *
 * When we reserve space we increase outstanding_extents for the extents we may
 * add.  Once we've set the range as delalloc or created our ordered extents we
 * have outstanding_extents to track the real usage, so we use this to free our
 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
 * with btrfs_delalloc_reserve_metadata.
 */
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
				    bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned num_extents;

	spin_lock(&inode->lock);
	num_extents = count_max_extents(num_bytes);
	btrfs_mod_outstanding_extents(inode, -num_extents);
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Sanity-test builds have no real block rsv to release */
	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 * delalloc
 * @inode: inode we're writing to
 * @start: start range we are writing to
 * @len: how long the range we are writing to
 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
 *	      current reservation.
 *
 * This will do the following things
 *
 * - reserve space in data space info for num bytes
 *   and reserve precious corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * - reserve space for metadata space, based on the number of outstanding
 *   extents and how much csums will be needed
 *   also reserve metadata space in a per root over-reserve method.
 * - add to the inodes->delalloc_bytes
 * - add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error(-ENOSPC or -EQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, reserved, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret < 0)
		/* Undo the data reservation if metadata failed */
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
	return ret;
}

/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @start: start position of the space already reserved
 * @len: the len of the space already reserved
 * @release_bytes: the len of the space we consumed or didn't use
 *
 * This function will release the metadata space that was not used and will
 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct inode *inode,
				  struct extent_changeset *reserved,
				  u64 start, u64 len, bool qgroup_free)
{
	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
	btrfs_free_reserved_data_space(inode, reserved, start, len);
}
+23
fs/btrfs/delalloc-space.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef BTRFS_DELALLOC_SPACE_H 4 + #define BTRFS_DELALLOC_SPACE_H 5 + 6 + struct extent_changeset; 7 + 8 + int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); 9 + int btrfs_check_data_free_space(struct inode *inode, 10 + struct extent_changeset **reserved, u64 start, u64 len); 11 + void btrfs_free_reserved_data_space(struct inode *inode, 12 + struct extent_changeset *reserved, u64 start, u64 len); 13 + void btrfs_delalloc_release_space(struct inode *inode, 14 + struct extent_changeset *reserved, 15 + u64 start, u64 len, bool qgroup_free); 16 + void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 17 + u64 len); 18 + void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 19 + bool qgroup_free); 20 + int btrfs_delalloc_reserve_space(struct inode *inode, 21 + struct extent_changeset **reserved, u64 start, u64 len); 22 + 23 + #endif /* BTRFS_DELALLOC_SPACE_H */
+178 -3
fs/btrfs/delayed-ref.c
··· 10 10 #include "delayed-ref.h" 11 11 #include "transaction.h" 12 12 #include "qgroup.h" 13 + #include "space-info.h" 13 14 14 15 struct kmem_cache *btrfs_delayed_ref_head_cachep; 15 16 struct kmem_cache *btrfs_delayed_tree_ref_cachep; ··· 24 23 * us to buffer up frequently modified backrefs in an rb tree instead 25 24 * of hammering updates on the extent allocation tree. 26 25 */ 26 + 27 + bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 28 + { 29 + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 30 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 31 + bool ret = false; 32 + u64 reserved; 33 + 34 + spin_lock(&global_rsv->lock); 35 + reserved = global_rsv->reserved; 36 + spin_unlock(&global_rsv->lock); 37 + 38 + /* 39 + * Since the global reserve is just kind of magic we don't really want 40 + * to rely on it to save our bacon, so if our size is more than the 41 + * delayed_refs_rsv and the global rsv then it's time to think about 42 + * bailing. 43 + */ 44 + spin_lock(&delayed_refs_rsv->lock); 45 + reserved += delayed_refs_rsv->reserved; 46 + if (delayed_refs_rsv->size >= reserved) 47 + ret = true; 48 + spin_unlock(&delayed_refs_rsv->lock); 49 + return ret; 50 + } 51 + 52 + int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 53 + { 54 + u64 num_entries = 55 + atomic_read(&trans->transaction->delayed_refs.num_entries); 56 + u64 avg_runtime; 57 + u64 val; 58 + 59 + smp_mb(); 60 + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 61 + val = num_entries * avg_runtime; 62 + if (val >= NSEC_PER_SEC) 63 + return 1; 64 + if (val >= NSEC_PER_SEC / 2) 65 + return 2; 66 + 67 + return btrfs_check_space_for_delayed_refs(trans->fs_info); 68 + } 69 + 70 + /** 71 + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. 72 + * @fs_info - the fs_info for our fs. 73 + * @nr - the number of items to drop. 
74 + * 75 + * This drops the delayed ref head's count from the delayed refs rsv and frees 76 + * any excess reservation we had. 77 + */ 78 + void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 79 + { 80 + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 81 + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); 82 + u64 released = 0; 83 + 84 + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, 85 + NULL); 86 + if (released) 87 + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 88 + 0, released, 0); 89 + } 90 + 91 + /* 92 + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv 93 + * @trans - the trans that may have generated delayed refs 94 + * 95 + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, 96 + * it'll calculate the additional size and add it to the delayed_refs_rsv. 97 + */ 98 + void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 99 + { 100 + struct btrfs_fs_info *fs_info = trans->fs_info; 101 + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 102 + u64 num_bytes; 103 + 104 + if (!trans->delayed_ref_updates) 105 + return; 106 + 107 + num_bytes = btrfs_calc_trans_metadata_size(fs_info, 108 + trans->delayed_ref_updates); 109 + spin_lock(&delayed_rsv->lock); 110 + delayed_rsv->size += num_bytes; 111 + delayed_rsv->full = 0; 112 + spin_unlock(&delayed_rsv->lock); 113 + trans->delayed_ref_updates = 0; 114 + } 115 + 116 + /** 117 + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. 118 + * @fs_info - the fs info for our fs. 119 + * @src - the source block rsv to transfer from. 120 + * @num_bytes - the number of bytes to transfer. 121 + * 122 + * This transfers up to the num_bytes amount from the src rsv to the 123 + * delayed_refs_rsv. Any extra bytes are returned to the space info. 
124 + */ 125 + void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 126 + struct btrfs_block_rsv *src, 127 + u64 num_bytes) 128 + { 129 + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 130 + u64 to_free = 0; 131 + 132 + spin_lock(&src->lock); 133 + src->reserved -= num_bytes; 134 + src->size -= num_bytes; 135 + spin_unlock(&src->lock); 136 + 137 + spin_lock(&delayed_refs_rsv->lock); 138 + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { 139 + u64 delta = delayed_refs_rsv->size - 140 + delayed_refs_rsv->reserved; 141 + if (num_bytes > delta) { 142 + to_free = num_bytes - delta; 143 + num_bytes = delta; 144 + } 145 + } else { 146 + to_free = num_bytes; 147 + num_bytes = 0; 148 + } 149 + 150 + if (num_bytes) 151 + delayed_refs_rsv->reserved += num_bytes; 152 + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) 153 + delayed_refs_rsv->full = 1; 154 + spin_unlock(&delayed_refs_rsv->lock); 155 + 156 + if (num_bytes) 157 + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 158 + 0, num_bytes, 1); 159 + if (to_free) 160 + btrfs_space_info_add_old_bytes(fs_info, 161 + delayed_refs_rsv->space_info, to_free); 162 + } 163 + 164 + /** 165 + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. 166 + * @fs_info - the fs_info for our fs. 167 + * @flush - control how we can flush for this reservation. 168 + * 169 + * This will refill the delayed block_rsv up to 1 items size worth of space and 170 + * will return -ENOSPC if we can't make the reservation. 
171 + */ 172 + int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 173 + enum btrfs_reserve_flush_enum flush) 174 + { 175 + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 176 + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); 177 + u64 num_bytes = 0; 178 + int ret = -ENOSPC; 179 + 180 + spin_lock(&block_rsv->lock); 181 + if (block_rsv->reserved < block_rsv->size) { 182 + num_bytes = block_rsv->size - block_rsv->reserved; 183 + num_bytes = min(num_bytes, limit); 184 + } 185 + spin_unlock(&block_rsv->lock); 186 + 187 + if (!num_bytes) 188 + return 0; 189 + 190 + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, 191 + num_bytes, flush); 192 + if (ret) 193 + return ret; 194 + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); 195 + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 196 + 0, num_bytes, 1); 197 + return 0; 198 + } 27 199 28 200 /* 29 201 * compare two delayed tree backrefs with same bytenr and type ··· 1131 957 } 1132 958 1133 959 /* 1134 - * this does a simple search for the head node for a given extent. 1135 - * It must be called with the delayed ref spinlock held, and it returns 1136 - * the head node if any where found, or NULL if not. 960 + * This does a simple search for the head node for a given extent. Returns the 961 + * head node if found, or NULL if not. 1137 962 */ 1138 963 struct btrfs_delayed_ref_head * 1139 964 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) 1140 965 { 966 + lockdep_assert_held(&delayed_refs->lock); 967 + 1141 968 return find_ref_head(delayed_refs, bytenr, false); 1142 969 } 1143 970
+10
fs/btrfs/delayed-ref.h
··· 364 364 365 365 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); 366 366 367 + void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); 368 + void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); 369 + int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 370 + enum btrfs_reserve_flush_enum flush); 371 + void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 372 + struct btrfs_block_rsv *src, 373 + u64 num_bytes); 374 + int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); 375 + bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); 376 + 367 377 /* 368 378 * helper functions to cast a node into its container 369 379 */
+13 -18
fs/btrfs/dev-replace.c
··· 201 201 return PTR_ERR(bdev); 202 202 } 203 203 204 - filemap_write_and_wait(bdev->bd_inode->i_mapping); 204 + sync_blockdev(bdev); 205 205 206 206 devices = &fs_info->fs_devices->devices; 207 207 list_for_each_entry(device, devices, dev_list) { ··· 237 237 } 238 238 rcu_assign_pointer(device->name, name); 239 239 240 - mutex_lock(&fs_info->fs_devices->device_list_mutex); 241 240 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 242 241 device->generation = 0; 243 242 device->io_width = fs_info->sectorsize; ··· 255 256 device->dev_stats_valid = 1; 256 257 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 257 258 device->fs_devices = fs_info->fs_devices; 259 + 260 + mutex_lock(&fs_info->fs_devices->device_list_mutex); 258 261 list_add(&device->dev_list, &fs_info->fs_devices->devices); 259 262 fs_info->fs_devices->num_devices++; 260 263 fs_info->fs_devices->open_devices++; ··· 400 399 int ret; 401 400 struct btrfs_device *tgt_device = NULL; 402 401 struct btrfs_device *src_device = NULL; 403 - bool need_unlock; 404 402 405 403 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, 406 404 srcdev_name); ··· 412 412 btrfs_dev_name(src_device), src_device->devid); 413 413 return -ETXTBSY; 414 414 } 415 - 416 - ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, 417 - src_device, &tgt_device); 418 - if (ret) 419 - return ret; 420 415 421 416 /* 422 417 * Here we commit the transaction to make sure commit_total_bytes ··· 426 431 return PTR_ERR(trans); 427 432 } 428 433 429 - need_unlock = true; 434 + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, 435 + src_device, &tgt_device); 436 + if (ret) 437 + return ret; 438 + 430 439 down_write(&dev_replace->rwsem); 431 440 switch (dev_replace->replace_state) { 432 441 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: ··· 441 442 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 442 443 ASSERT(0); 443 444 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 445 + up_write(&dev_replace->rwsem); 444 446 
goto leave; 445 447 } 446 448 447 449 dev_replace->cont_reading_from_srcdev_mode = read_src; 448 - WARN_ON(!src_device); 449 450 dev_replace->srcdev = src_device; 450 451 dev_replace->tgtdev = tgt_device; 451 452 ··· 470 471 atomic64_set(&dev_replace->num_write_errors, 0); 471 472 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 472 473 up_write(&dev_replace->rwsem); 473 - need_unlock = false; 474 474 475 475 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 476 476 if (ret) ··· 477 479 478 480 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 479 481 480 - /* force writing the updated state information to disk */ 481 - trans = btrfs_start_transaction(root, 0); 482 + /* Commit dev_replace state and reserve 1 item for it. */ 483 + trans = btrfs_start_transaction(root, 1); 482 484 if (IS_ERR(trans)) { 483 485 ret = PTR_ERR(trans); 484 - need_unlock = true; 485 486 down_write(&dev_replace->rwsem); 486 487 dev_replace->replace_state = 487 488 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 488 489 dev_replace->srcdev = NULL; 489 490 dev_replace->tgtdev = NULL; 491 + up_write(&dev_replace->rwsem); 490 492 goto leave; 491 493 } 492 494 ··· 508 510 return ret; 509 511 510 512 leave: 511 - if (need_unlock) 512 - up_write(&dev_replace->rwsem); 513 513 btrfs_destroy_dev_replace_tgtdev(tgt_device); 514 514 return ret; 515 515 } ··· 674 678 btrfs_device_set_disk_total_bytes(tgt_device, 675 679 src_device->disk_total_bytes); 676 680 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); 677 - tgt_device->commit_total_bytes = src_device->commit_total_bytes; 678 681 tgt_device->commit_bytes_used = src_device->bytes_used; 679 682 680 683 btrfs_assign_next_active_device(src_device, tgt_device); ··· 723 728 struct btrfs_device *srcdev, 724 729 struct btrfs_device *tgtdev) 725 730 { 726 - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 731 + struct extent_map_tree *em_tree = &fs_info->mapping_tree; 727 732 struct 
extent_map *em; 728 733 struct map_lookup *map; 729 734 u64 start = 0;
+104 -62
fs/btrfs/disk-io.c
··· 19 19 #include <linux/crc32c.h> 20 20 #include <linux/sched/mm.h> 21 21 #include <asm/unaligned.h> 22 + #include <crypto/hash.h> 22 23 #include "ctree.h" 23 24 #include "disk-io.h" 24 25 #include "transaction.h" ··· 40 39 #include "compression.h" 41 40 #include "tree-checker.h" 42 41 #include "ref-verify.h" 43 - 44 - #ifdef CONFIG_X86 45 - #include <asm/cpufeature.h> 46 - #endif 47 42 48 43 #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ 49 44 BTRFS_HEADER_FLAG_RELOC |\ ··· 246 249 return em; 247 250 } 248 251 249 - u32 btrfs_csum_data(const char *data, u32 seed, size_t len) 250 - { 251 - return crc32c(seed, data, len); 252 - } 253 - 254 - void btrfs_csum_final(u32 crc, u8 *result) 255 - { 256 - put_unaligned_le32(~crc, result); 257 - } 258 - 259 252 /* 260 253 * Compute the csum of a btree block and store the result to provided buffer. 261 254 * ··· 253 266 */ 254 267 static int csum_tree_block(struct extent_buffer *buf, u8 *result) 255 268 { 269 + struct btrfs_fs_info *fs_info = buf->fs_info; 270 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 256 271 unsigned long len; 257 272 unsigned long cur_len; 258 273 unsigned long offset = BTRFS_CSUM_SIZE; ··· 262 273 unsigned long map_start; 263 274 unsigned long map_len; 264 275 int err; 265 - u32 crc = ~(u32)0; 276 + 277 + shash->tfm = fs_info->csum_shash; 278 + crypto_shash_init(shash); 266 279 267 280 len = buf->len - offset; 281 + 268 282 while (len > 0) { 269 283 /* 270 284 * Note: we don't need to check for the err == 1 case here, as ··· 280 288 if (WARN_ON(err)) 281 289 return err; 282 290 cur_len = min(len, map_len - (offset - map_start)); 283 - crc = btrfs_csum_data(kaddr + offset - map_start, 284 - crc, cur_len); 291 + crypto_shash_update(shash, kaddr + offset - map_start, cur_len); 285 292 len -= cur_len; 286 293 offset += cur_len; 287 294 } 288 295 memset(result, 0, BTRFS_CSUM_SIZE); 289 296 290 - btrfs_csum_final(crc, result); 297 + crypto_shash_final(shash, result); 291 298 292 299 
return 0; 293 300 } ··· 347 356 return ret; 348 357 } 349 358 359 + static bool btrfs_supported_super_csum(u16 csum_type) 360 + { 361 + switch (csum_type) { 362 + case BTRFS_CSUM_TYPE_CRC32: 363 + return true; 364 + default: 365 + return false; 366 + } 367 + } 368 + 350 369 /* 351 370 * Return 0 if the superblock checksum type matches the checksum value of that 352 371 * algorithm. Pass the raw disk superblock data. ··· 366 365 { 367 366 struct btrfs_super_block *disk_sb = 368 367 (struct btrfs_super_block *)raw_disk_sb; 369 - u16 csum_type = btrfs_super_csum_type(disk_sb); 370 - int ret = 0; 368 + char result[BTRFS_CSUM_SIZE]; 369 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 371 370 372 - if (csum_type == BTRFS_CSUM_TYPE_CRC32) { 373 - u32 crc = ~(u32)0; 374 - char result[sizeof(crc)]; 371 + shash->tfm = fs_info->csum_shash; 372 + crypto_shash_init(shash); 375 373 376 - /* 377 - * The super_block structure does not span the whole 378 - * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space 379 - * is filled with zeros and is included in the checksum. 380 - */ 381 - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, 382 - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 383 - btrfs_csum_final(crc, result); 374 + /* 375 + * The super_block structure does not span the whole 376 + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is 377 + * filled with zeros and is included in the checksum. 
378 + */ 379 + crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, 380 + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 381 + crypto_shash_final(shash, result); 384 382 385 - if (memcmp(raw_disk_sb, result, sizeof(result))) 386 - ret = 1; 387 - } 383 + if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) 384 + return 1; 388 385 389 - if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { 390 - btrfs_err(fs_info, "unsupported checksum algorithm %u", 391 - csum_type); 392 - ret = 1; 393 - } 394 - 395 - return ret; 386 + return 0; 396 387 } 397 388 398 389 int btrfs_verify_level_key(struct extent_buffer *eb, int level, ··· 866 873 return btree_csum_one_bio(bio); 867 874 } 868 875 869 - static int check_async_write(struct btrfs_inode *bi) 876 + static int check_async_write(struct btrfs_fs_info *fs_info, 877 + struct btrfs_inode *bi) 870 878 { 871 879 if (atomic_read(&bi->sync_writers)) 872 880 return 0; 873 - #ifdef CONFIG_X86 874 - if (static_cpu_has(X86_FEATURE_XMM4_2)) 881 + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) 875 882 return 0; 876 - #endif 877 883 return 1; 878 884 } 879 885 ··· 881 889 unsigned long bio_flags) 882 890 { 883 891 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 884 - int async = check_async_write(BTRFS_I(inode)); 892 + int async = check_async_write(fs_info, BTRFS_I(inode)); 885 893 blk_status_t ret; 886 894 887 895 if (bio_op(bio) != REQ_OP_WRITE) { ··· 2254 2262 return 0; 2255 2263 } 2256 2264 2265 + static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) 2266 + { 2267 + struct crypto_shash *csum_shash; 2268 + const char *csum_name = btrfs_super_csum_name(csum_type); 2269 + 2270 + csum_shash = crypto_alloc_shash(csum_name, 0, 0); 2271 + 2272 + if (IS_ERR(csum_shash)) { 2273 + btrfs_err(fs_info, "error allocating %s hash for checksum", 2274 + csum_name); 2275 + return PTR_ERR(csum_shash); 2276 + } 2277 + 2278 + fs_info->csum_shash = csum_shash; 2279 + 2280 + return 0; 2281 + } 2282 + 2283 + 
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) 2284 + { 2285 + crypto_free_shash(fs_info->csum_shash); 2286 + } 2287 + 2257 2288 static int btrfs_replay_log(struct btrfs_fs_info *fs_info, 2258 2289 struct btrfs_fs_devices *fs_devices) 2259 2290 { ··· 2592 2577 ret = validate_super(fs_info, sb, -1); 2593 2578 if (ret < 0) 2594 2579 goto out; 2595 - if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { 2580 + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { 2596 2581 ret = -EUCLEAN; 2597 2582 btrfs_err(fs_info, "invalid csum type, has %u want %u", 2598 2583 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); ··· 2622 2607 u32 stripesize; 2623 2608 u64 generation; 2624 2609 u64 features; 2610 + u16 csum_type; 2625 2611 struct btrfs_key location; 2626 2612 struct buffer_head *bh; 2627 2613 struct btrfs_super_block *disk_super; ··· 2705 2689 INIT_LIST_HEAD(&fs_info->space_info); 2706 2690 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2707 2691 INIT_LIST_HEAD(&fs_info->unused_bgs); 2708 - btrfs_mapping_init(&fs_info->mapping_tree); 2692 + extent_map_tree_init(&fs_info->mapping_tree); 2709 2693 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2710 2694 BTRFS_BLOCK_RSV_GLOBAL); 2711 2695 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); ··· 2809 2793 spin_lock_init(&fs_info->swapfile_pins_lock); 2810 2794 fs_info->swapfile_pins = RB_ROOT; 2811 2795 2796 + fs_info->send_in_progress = 0; 2797 + 2812 2798 ret = btrfs_alloc_stripe_hash_table(fs_info); 2813 2799 if (ret) { 2814 2800 err = ret; ··· 2831 2813 } 2832 2814 2833 2815 /* 2816 + * Verify the type first, if that or the the checksum value are 2817 + * corrupted, we'll find out 2818 + */ 2819 + csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); 2820 + if (!btrfs_supported_super_csum(csum_type)) { 2821 + btrfs_err(fs_info, "unsupported checksum algorithm: %u", 2822 + csum_type); 2823 + err = -EINVAL; 2824 + brelse(bh); 2825 + goto fail_alloc; 
2826 + } 2827 + 2828 + ret = btrfs_init_csum_hash(fs_info, csum_type); 2829 + if (ret) { 2830 + err = ret; 2831 + goto fail_alloc; 2832 + } 2833 + 2834 + /* 2834 2835 * We want to check superblock checksum, the type is stored inside. 2835 2836 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 2836 2837 */ ··· 2857 2820 btrfs_err(fs_info, "superblock checksum mismatch"); 2858 2821 err = -EINVAL; 2859 2822 brelse(bh); 2860 - goto fail_alloc; 2823 + goto fail_csum; 2861 2824 } 2862 2825 2863 2826 /* ··· 2894 2857 if (ret) { 2895 2858 btrfs_err(fs_info, "superblock contains fatal errors"); 2896 2859 err = -EINVAL; 2897 - goto fail_alloc; 2860 + goto fail_csum; 2898 2861 } 2899 2862 2900 2863 if (!btrfs_super_root(disk_super)) 2901 - goto fail_alloc; 2864 + goto fail_csum; 2902 2865 2903 2866 /* check FS state, whether FS is broken. */ 2904 2867 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) ··· 2920 2883 ret = btrfs_parse_options(fs_info, options, sb->s_flags); 2921 2884 if (ret) { 2922 2885 err = ret; 2923 - goto fail_alloc; 2886 + goto fail_csum; 2924 2887 } 2925 2888 2926 2889 features = btrfs_super_incompat_flags(disk_super) & ··· 2930 2893 "cannot mount because of unsupported optional features (%llx)", 2931 2894 features); 2932 2895 err = -EINVAL; 2933 - goto fail_alloc; 2896 + goto fail_csum; 2934 2897 } 2935 2898 2936 2899 features = btrfs_super_incompat_flags(disk_super); ··· 2974 2937 btrfs_err(fs_info, 2975 2938 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", 2976 2939 nodesize, sectorsize); 2977 - goto fail_alloc; 2940 + goto fail_csum; 2978 2941 } 2979 2942 2980 2943 /* ··· 2990 2953 "cannot mount read-write because of unsupported optional features (%llx)", 2991 2954 features); 2992 2955 err = -EINVAL; 2993 - goto fail_alloc; 2956 + goto fail_csum; 2994 2957 } 2995 2958 2996 2959 ret = btrfs_init_workqueues(fs_info, fs_devices); ··· 3368 3331 fail_sb_buffer: 3369 3332 
btrfs_stop_all_workers(fs_info); 3370 3333 btrfs_free_block_groups(fs_info); 3334 + fail_csum: 3335 + btrfs_free_csum_hash(fs_info); 3371 3336 fail_alloc: 3372 3337 fail_iput: 3373 3338 btrfs_mapping_tree_free(&fs_info->mapping_tree); ··· 3511 3472 static int write_dev_supers(struct btrfs_device *device, 3512 3473 struct btrfs_super_block *sb, int max_mirrors) 3513 3474 { 3475 + struct btrfs_fs_info *fs_info = device->fs_info; 3476 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3514 3477 struct buffer_head *bh; 3515 3478 int i; 3516 3479 int ret; 3517 3480 int errors = 0; 3518 - u32 crc; 3519 3481 u64 bytenr; 3520 3482 int op_flags; 3521 3483 3522 3484 if (max_mirrors == 0) 3523 3485 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 3486 + 3487 + shash->tfm = fs_info->csum_shash; 3524 3488 3525 3489 for (i = 0; i < max_mirrors; i++) { 3526 3490 bytenr = btrfs_sb_offset(i); ··· 3533 3491 3534 3492 btrfs_set_super_bytenr(sb, bytenr); 3535 3493 3536 - crc = ~(u32)0; 3537 - crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, 3538 - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 3539 - btrfs_csum_final(crc, sb->csum); 3494 + crypto_shash_init(shash); 3495 + crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, 3496 + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 3497 + crypto_shash_final(shash, sb->csum); 3540 3498 3541 3499 /* One reference for us, and we leave it for the caller */ 3542 3500 bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, ··· 3751 3709 3752 3710 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || 3753 3711 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) 3754 - min_tolerated = min(min_tolerated, 3712 + min_tolerated = min_t(int, min_tolerated, 3755 3713 btrfs_raid_array[BTRFS_RAID_SINGLE]. 
3756 3714 tolerated_failures); 3757 3715 ··· 3760 3718 continue; 3761 3719 if (!(flags & btrfs_raid_array[raid_type].bg_flag)) 3762 3720 continue; 3763 - min_tolerated = min(min_tolerated, 3721 + min_tolerated = min_t(int, min_tolerated, 3764 3722 btrfs_raid_array[raid_type]. 3765 3723 tolerated_failures); 3766 3724 }
-2
fs/btrfs/disk-io.h
··· 115 115 int atomic); 116 116 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, 117 117 struct btrfs_key *first_key); 118 - u32 btrfs_csum_data(const char *data, u32 seed, size_t len); 119 - void btrfs_csum_final(u32 crc, u8 *result); 120 118 blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 121 119 enum btrfs_wq_endio_type metadata); 122 120 blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+126 -2377
fs/btrfs/extent-tree.c
··· 28 28 #include "sysfs.h" 29 29 #include "qgroup.h" 30 30 #include "ref-verify.h" 31 + #include "space-info.h" 32 + #include "block-rsv.h" 33 + #include "delalloc-space.h" 31 34 32 35 #undef SCRAMBLE_DELAYED_REFS 33 36 34 - /* 35 - * control flags for do_chunk_alloc's force field 36 - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 37 - * if we really need one. 38 - * 39 - * CHUNK_ALLOC_LIMITED means to only try and allocate one 40 - * if we have very few chunks already allocated. This is 41 - * used as part of the clustering code to help make sure 42 - * we have a good pool of storage to cluster in, without 43 - * filling the FS with empty chunks 44 - * 45 - * CHUNK_ALLOC_FORCE means it must try to allocate one 46 - * 47 - */ 48 - enum { 49 - CHUNK_ALLOC_NO_FORCE = 0, 50 - CHUNK_ALLOC_LIMITED = 1, 51 - CHUNK_ALLOC_FORCE = 2, 52 - }; 53 - 54 - /* 55 - * Declare a helper function to detect underflow of various space info members 56 - */ 57 - #define DECLARE_SPACE_INFO_UPDATE(name) \ 58 - static inline void update_##name(struct btrfs_space_info *sinfo, \ 59 - s64 bytes) \ 60 - { \ 61 - if (bytes < 0 && sinfo->name < -bytes) { \ 62 - WARN_ON(1); \ 63 - sinfo->name = 0; \ 64 - return; \ 65 - } \ 66 - sinfo->name += bytes; \ 67 - } 68 - 69 - DECLARE_SPACE_INFO_UPDATE(bytes_may_use); 70 - DECLARE_SPACE_INFO_UPDATE(bytes_pinned); 71 37 72 38 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 73 39 struct btrfs_delayed_ref_node *node, u64 parent, ··· 50 84 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 51 85 struct btrfs_delayed_ref_node *node, 52 86 struct btrfs_delayed_extent_op *extent_op); 53 - static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 54 - int force); 55 87 static int find_next_key(struct btrfs_path *path, int level, 56 88 struct btrfs_key *key); 57 - static void dump_space_info(struct btrfs_fs_info *fs_info, 58 - struct btrfs_space_info *info, u64 bytes, 59 - int dump_block_groups); 60 - 
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 61 - u64 num_bytes); 62 - static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 63 - struct btrfs_space_info *space_info, 64 - u64 num_bytes); 65 - static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 66 - struct btrfs_space_info *space_info, 67 - u64 num_bytes); 68 89 69 90 static noinline int 70 91 block_group_cache_done(struct btrfs_block_group_cache *cache) ··· 690 737 return block_group_cache_tree_search(info, bytenr, 1); 691 738 } 692 739 693 - static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 694 - u64 flags) 740 + static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) 695 741 { 696 - struct list_head *head = &info->space_info; 697 - struct btrfs_space_info *found; 698 - 699 - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 700 - 701 - rcu_read_lock(); 702 - list_for_each_entry_rcu(found, head, list) { 703 - if (found->flags & flags) { 704 - rcu_read_unlock(); 705 - return found; 706 - } 742 + if (ref->type == BTRFS_REF_METADATA) { 743 + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) 744 + return BTRFS_BLOCK_GROUP_SYSTEM; 745 + else 746 + return BTRFS_BLOCK_GROUP_METADATA; 707 747 } 708 - rcu_read_unlock(); 709 - return NULL; 748 + return BTRFS_BLOCK_GROUP_DATA; 710 749 } 711 750 712 751 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, 713 - struct btrfs_ref *ref, int sign) 752 + struct btrfs_ref *ref) 714 753 { 715 754 struct btrfs_space_info *space_info; 716 - s64 num_bytes; 717 - u64 flags; 755 + u64 flags = generic_ref_to_space_flags(ref); 718 756 719 - ASSERT(sign == 1 || sign == -1); 720 - num_bytes = sign * ref->len; 721 - if (ref->type == BTRFS_REF_METADATA) { 722 - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) 723 - flags = BTRFS_BLOCK_GROUP_SYSTEM; 724 - else 725 - flags = BTRFS_BLOCK_GROUP_METADATA; 726 - } else { 727 - flags = BTRFS_BLOCK_GROUP_DATA; 728 - } 729 - 730 - space_info = 
__find_space_info(fs_info, flags); 757 + space_info = btrfs_find_space_info(fs_info, flags); 731 758 ASSERT(space_info); 732 - percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, 759 + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, 733 760 BTRFS_TOTAL_BYTES_PINNED_BATCH); 734 761 } 735 762 736 - /* 737 - * after adding space to the filesystem, we need to clear the full flags 738 - * on all the space infos. 739 - */ 740 - void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 763 + static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, 764 + struct btrfs_ref *ref) 741 765 { 742 - struct list_head *head = &info->space_info; 743 - struct btrfs_space_info *found; 766 + struct btrfs_space_info *space_info; 767 + u64 flags = generic_ref_to_space_flags(ref); 744 768 745 - rcu_read_lock(); 746 - list_for_each_entry_rcu(found, head, list) 747 - found->full = 0; 748 - rcu_read_unlock(); 769 + space_info = btrfs_find_space_info(fs_info, flags); 770 + ASSERT(space_info); 771 + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, 772 + BTRFS_TOTAL_BYTES_PINNED_BATCH); 749 773 } 750 774 751 775 /* simple helper to search for an existing data extent at a given offset */ ··· 1051 1121 __le64 lenum; 1052 1122 1053 1123 lenum = cpu_to_le64(root_objectid); 1054 - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1124 + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1055 1125 lenum = cpu_to_le64(owner); 1056 - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1126 + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1057 1127 lenum = cpu_to_le64(offset); 1058 - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1128 + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1059 1129 1060 1130 return ((u64)high_crc << 31) ^ (u64)low_crc; 1061 1131 } ··· 1995 2065 btrfs_ref_tree_mod(fs_info, generic_ref); 1996 2066 1997 2067 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) 1998 - 
add_pinned_bytes(fs_info, generic_ref, -1); 2068 + sub_pinned_bytes(fs_info, generic_ref); 1999 2069 2000 2070 return ret; 2001 2071 } ··· 2392 2462 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2393 2463 else 2394 2464 flags = BTRFS_BLOCK_GROUP_METADATA; 2395 - space_info = __find_space_info(fs_info, flags); 2465 + space_info = btrfs_find_space_info(fs_info, flags); 2396 2466 ASSERT(space_info); 2397 2467 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2398 2468 -head->num_bytes, ··· 2752 2822 num_csums += num_csums_per_leaf - 1; 2753 2823 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2754 2824 return num_csums; 2755 - } 2756 - 2757 - bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 2758 - { 2759 - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 2760 - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2761 - bool ret = false; 2762 - u64 reserved; 2763 - 2764 - spin_lock(&global_rsv->lock); 2765 - reserved = global_rsv->reserved; 2766 - spin_unlock(&global_rsv->lock); 2767 - 2768 - /* 2769 - * Since the global reserve is just kind of magic we don't really want 2770 - * to rely on it to save our bacon, so if our size is more than the 2771 - * delayed_refs_rsv and the global rsv then it's time to think about 2772 - * bailing. 
2773 - */ 2774 - spin_lock(&delayed_refs_rsv->lock); 2775 - reserved += delayed_refs_rsv->reserved; 2776 - if (delayed_refs_rsv->size >= reserved) 2777 - ret = true; 2778 - spin_unlock(&delayed_refs_rsv->lock); 2779 - return ret; 2780 - } 2781 - 2782 - int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 2783 - { 2784 - u64 num_entries = 2785 - atomic_read(&trans->transaction->delayed_refs.num_entries); 2786 - u64 avg_runtime; 2787 - u64 val; 2788 - 2789 - smp_mb(); 2790 - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 2791 - val = num_entries * avg_runtime; 2792 - if (val >= NSEC_PER_SEC) 2793 - return 1; 2794 - if (val >= NSEC_PER_SEC / 2) 2795 - return 2; 2796 - 2797 - return btrfs_check_space_for_delayed_refs(trans->fs_info); 2798 2825 } 2799 2826 2800 2827 /* ··· 3721 3834 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3722 3835 } 3723 3836 3724 - static const char *alloc_name(u64 flags) 3725 - { 3726 - switch (flags) { 3727 - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3728 - return "mixed"; 3729 - case BTRFS_BLOCK_GROUP_METADATA: 3730 - return "metadata"; 3731 - case BTRFS_BLOCK_GROUP_DATA: 3732 - return "data"; 3733 - case BTRFS_BLOCK_GROUP_SYSTEM: 3734 - return "system"; 3735 - default: 3736 - WARN_ON(1); 3737 - return "invalid-combination"; 3738 - }; 3739 - } 3740 - 3741 - static int create_space_info(struct btrfs_fs_info *info, u64 flags) 3742 - { 3743 - 3744 - struct btrfs_space_info *space_info; 3745 - int i; 3746 - int ret; 3747 - 3748 - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 3749 - if (!space_info) 3750 - return -ENOMEM; 3751 - 3752 - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 3753 - GFP_KERNEL); 3754 - if (ret) { 3755 - kfree(space_info); 3756 - return ret; 3757 - } 3758 - 3759 - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3760 - INIT_LIST_HEAD(&space_info->block_groups[i]); 3761 - init_rwsem(&space_info->groups_sem); 3762 - 
spin_lock_init(&space_info->lock); 3763 - space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3764 - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3765 - init_waitqueue_head(&space_info->wait); 3766 - INIT_LIST_HEAD(&space_info->ro_bgs); 3767 - INIT_LIST_HEAD(&space_info->tickets); 3768 - INIT_LIST_HEAD(&space_info->priority_tickets); 3769 - 3770 - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, 3771 - info->space_info_kobj, "%s", 3772 - alloc_name(space_info->flags)); 3773 - if (ret) { 3774 - kobject_put(&space_info->kobj); 3775 - return ret; 3776 - } 3777 - 3778 - list_add_rcu(&space_info->list, &info->space_info); 3779 - if (flags & BTRFS_BLOCK_GROUP_DATA) 3780 - info->data_sinfo = space_info; 3781 - 3782 - return ret; 3783 - } 3784 - 3785 - static void update_space_info(struct btrfs_fs_info *info, u64 flags, 3786 - u64 total_bytes, u64 bytes_used, 3787 - u64 bytes_readonly, 3788 - struct btrfs_space_info **space_info) 3789 - { 3790 - struct btrfs_space_info *found; 3791 - int factor; 3792 - 3793 - factor = btrfs_bg_type_to_factor(flags); 3794 - 3795 - found = __find_space_info(info, flags); 3796 - ASSERT(found); 3797 - spin_lock(&found->lock); 3798 - found->total_bytes += total_bytes; 3799 - found->disk_total += total_bytes * factor; 3800 - found->bytes_used += bytes_used; 3801 - found->disk_used += bytes_used * factor; 3802 - found->bytes_readonly += bytes_readonly; 3803 - if (total_bytes > 0) 3804 - found->full = 0; 3805 - space_info_add_new_bytes(info, found, total_bytes - 3806 - bytes_used - bytes_readonly); 3807 - spin_unlock(&found->lock); 3808 - *space_info = found; 3809 - } 3810 - 3811 3837 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3812 3838 { 3813 3839 u64 extra_flags = chunk_to_extended(flags) & ··· 3868 4068 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3869 4069 } 3870 4070 3871 - static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 3872 - bool may_use_included) 
3873 - { 3874 - ASSERT(s_info); 3875 - return s_info->bytes_used + s_info->bytes_reserved + 3876 - s_info->bytes_pinned + s_info->bytes_readonly + 3877 - (may_use_included ? s_info->bytes_may_use : 0); 3878 - } 3879 - 3880 - int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 3881 - { 3882 - struct btrfs_root *root = inode->root; 3883 - struct btrfs_fs_info *fs_info = root->fs_info; 3884 - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 3885 - u64 used; 3886 - int ret = 0; 3887 - int need_commit = 2; 3888 - int have_pinned_space; 3889 - 3890 - /* make sure bytes are sectorsize aligned */ 3891 - bytes = ALIGN(bytes, fs_info->sectorsize); 3892 - 3893 - if (btrfs_is_free_space_inode(inode)) { 3894 - need_commit = 0; 3895 - ASSERT(current->journal_info); 3896 - } 3897 - 3898 - again: 3899 - /* make sure we have enough space to handle the data first */ 3900 - spin_lock(&data_sinfo->lock); 3901 - used = btrfs_space_info_used(data_sinfo, true); 3902 - 3903 - if (used + bytes > data_sinfo->total_bytes) { 3904 - struct btrfs_trans_handle *trans; 3905 - 3906 - /* 3907 - * if we don't have enough free bytes in this space then we need 3908 - * to alloc a new chunk. 3909 - */ 3910 - if (!data_sinfo->full) { 3911 - u64 alloc_target; 3912 - 3913 - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3914 - spin_unlock(&data_sinfo->lock); 3915 - 3916 - alloc_target = btrfs_data_alloc_profile(fs_info); 3917 - /* 3918 - * It is ugly that we don't call nolock join 3919 - * transaction for the free space inode case here. 3920 - * But it is safe because we only do the data space 3921 - * reservation for the free space cache in the 3922 - * transaction context, the common join transaction 3923 - * just increase the counter of the current transaction 3924 - * handler, doesn't try to acquire the trans_lock of 3925 - * the fs. 
3926 - */ 3927 - trans = btrfs_join_transaction(root); 3928 - if (IS_ERR(trans)) 3929 - return PTR_ERR(trans); 3930 - 3931 - ret = do_chunk_alloc(trans, alloc_target, 3932 - CHUNK_ALLOC_NO_FORCE); 3933 - btrfs_end_transaction(trans); 3934 - if (ret < 0) { 3935 - if (ret != -ENOSPC) 3936 - return ret; 3937 - else { 3938 - have_pinned_space = 1; 3939 - goto commit_trans; 3940 - } 3941 - } 3942 - 3943 - goto again; 3944 - } 3945 - 3946 - /* 3947 - * If we don't have enough pinned space to deal with this 3948 - * allocation, and no removed chunk in current transaction, 3949 - * don't bother committing the transaction. 3950 - */ 3951 - have_pinned_space = __percpu_counter_compare( 3952 - &data_sinfo->total_bytes_pinned, 3953 - used + bytes - data_sinfo->total_bytes, 3954 - BTRFS_TOTAL_BYTES_PINNED_BATCH); 3955 - spin_unlock(&data_sinfo->lock); 3956 - 3957 - /* commit the current transaction and try again */ 3958 - commit_trans: 3959 - if (need_commit) { 3960 - need_commit--; 3961 - 3962 - if (need_commit > 0) { 3963 - btrfs_start_delalloc_roots(fs_info, -1); 3964 - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, 3965 - (u64)-1); 3966 - } 3967 - 3968 - trans = btrfs_join_transaction(root); 3969 - if (IS_ERR(trans)) 3970 - return PTR_ERR(trans); 3971 - if (have_pinned_space >= 0 || 3972 - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 3973 - &trans->transaction->flags) || 3974 - need_commit > 0) { 3975 - ret = btrfs_commit_transaction(trans); 3976 - if (ret) 3977 - return ret; 3978 - /* 3979 - * The cleaner kthread might still be doing iput 3980 - * operations. Wait for it to finish so that 3981 - * more space is released. We don't need to 3982 - * explicitly run the delayed iputs here because 3983 - * the commit_transaction would have woken up 3984 - * the cleaner. 
3985 - */ 3986 - ret = btrfs_wait_on_delayed_iputs(fs_info); 3987 - if (ret) 3988 - return ret; 3989 - goto again; 3990 - } else { 3991 - btrfs_end_transaction(trans); 3992 - } 3993 - } 3994 - 3995 - trace_btrfs_space_reservation(fs_info, 3996 - "space_info:enospc", 3997 - data_sinfo->flags, bytes, 1); 3998 - return -ENOSPC; 3999 - } 4000 - update_bytes_may_use(data_sinfo, bytes); 4001 - trace_btrfs_space_reservation(fs_info, "space_info", 4002 - data_sinfo->flags, bytes, 1); 4003 - spin_unlock(&data_sinfo->lock); 4004 - 4005 - return 0; 4006 - } 4007 - 4008 - int btrfs_check_data_free_space(struct inode *inode, 4009 - struct extent_changeset **reserved, u64 start, u64 len) 4010 - { 4011 - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4012 - int ret; 4013 - 4014 - /* align the range */ 4015 - len = round_up(start + len, fs_info->sectorsize) - 4016 - round_down(start, fs_info->sectorsize); 4017 - start = round_down(start, fs_info->sectorsize); 4018 - 4019 - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4020 - if (ret < 0) 4021 - return ret; 4022 - 4023 - /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ 4024 - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); 4025 - if (ret < 0) 4026 - btrfs_free_reserved_data_space_noquota(inode, start, len); 4027 - else 4028 - ret = 0; 4029 - return ret; 4030 - } 4031 - 4032 - /* 4033 - * Called if we need to clear a data reservation for this inode 4034 - * Normally in a error case. 4035 - * 4036 - * This one will *NOT* use accurate qgroup reserved space API, just for case 4037 - * which we can't sleep and is sure it won't affect qgroup reserved space. 4038 - * Like clear_bit_hook(). 
4039 - */ 4040 - void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4041 - u64 len) 4042 - { 4043 - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4044 - struct btrfs_space_info *data_sinfo; 4045 - 4046 - /* Make sure the range is aligned to sectorsize */ 4047 - len = round_up(start + len, fs_info->sectorsize) - 4048 - round_down(start, fs_info->sectorsize); 4049 - start = round_down(start, fs_info->sectorsize); 4050 - 4051 - data_sinfo = fs_info->data_sinfo; 4052 - spin_lock(&data_sinfo->lock); 4053 - update_bytes_may_use(data_sinfo, -len); 4054 - trace_btrfs_space_reservation(fs_info, "space_info", 4055 - data_sinfo->flags, len, 0); 4056 - spin_unlock(&data_sinfo->lock); 4057 - } 4058 - 4059 - /* 4060 - * Called if we need to clear a data reservation for this inode 4061 - * Normally in a error case. 4062 - * 4063 - * This one will handle the per-inode data rsv map for accurate reserved 4064 - * space framework. 4065 - */ 4066 - void btrfs_free_reserved_data_space(struct inode *inode, 4067 - struct extent_changeset *reserved, u64 start, u64 len) 4068 - { 4069 - struct btrfs_root *root = BTRFS_I(inode)->root; 4070 - 4071 - /* Make sure the range is aligned to sectorsize */ 4072 - len = round_up(start + len, root->fs_info->sectorsize) - 4073 - round_down(start, root->fs_info->sectorsize); 4074 - start = round_down(start, root->fs_info->sectorsize); 4075 - 4076 - btrfs_free_reserved_data_space_noquota(inode, start, len); 4077 - btrfs_qgroup_free_data(inode, reserved, start, len); 4078 - } 4079 - 4080 4071 static void force_metadata_allocation(struct btrfs_fs_info *info) 4081 4072 { 4082 4073 struct list_head *head = &info->space_info; ··· 3879 4288 found->force_alloc = CHUNK_ALLOC_FORCE; 3880 4289 } 3881 4290 rcu_read_unlock(); 3882 - } 3883 - 3884 - static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3885 - { 3886 - return (global->size << 1); 3887 4291 } 3888 4292 3889 4293 static int 
should_alloc_chunk(struct btrfs_fs_info *fs_info, ··· 3911 4325 { 3912 4326 u64 num_dev; 3913 4327 3914 - if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3915 - BTRFS_BLOCK_GROUP_RAID0 | 3916 - BTRFS_BLOCK_GROUP_RAID5 | 3917 - BTRFS_BLOCK_GROUP_RAID6)) 4328 + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 4329 + if (!num_dev) 3918 4330 num_dev = fs_info->fs_devices->rw_devices; 3919 - else if (type & BTRFS_BLOCK_GROUP_RAID1) 3920 - num_dev = 2; 3921 - else 3922 - num_dev = 1; /* DUP or single */ 3923 4331 3924 4332 return num_dev; 3925 4333 } ··· 3938 4358 */ 3939 4359 lockdep_assert_held(&fs_info->chunk_mutex); 3940 4360 3941 - info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4361 + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3942 4362 spin_lock(&info->lock); 3943 4363 left = info->total_bytes - btrfs_space_info_used(info, true); 3944 4364 spin_unlock(&info->lock); ··· 3952 4372 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3953 4373 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3954 4374 left, thresh, type); 3955 - dump_space_info(fs_info, info, 0, 0); 4375 + btrfs_dump_space_info(fs_info, info, 0, 0); 3956 4376 } 3957 4377 3958 4378 if (left < thresh) { ··· 3985 4405 * - return 1 if it successfully allocates a chunk, 3986 4406 * - return errors including -ENOSPC otherwise. 
3987 4407 */ 3988 - static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 3989 - int force) 4408 + int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 4409 + enum btrfs_chunk_alloc_enum force) 3990 4410 { 3991 4411 struct btrfs_fs_info *fs_info = trans->fs_info; 3992 4412 struct btrfs_space_info *space_info; ··· 3998 4418 if (trans->allocating_chunk) 3999 4419 return -ENOSPC; 4000 4420 4001 - space_info = __find_space_info(fs_info, flags); 4421 + space_info = btrfs_find_space_info(fs_info, flags); 4002 4422 ASSERT(space_info); 4003 4423 4004 4424 do { ··· 4105 4525 return ret; 4106 4526 } 4107 4527 4108 - static int can_overcommit(struct btrfs_fs_info *fs_info, 4109 - struct btrfs_space_info *space_info, u64 bytes, 4110 - enum btrfs_reserve_flush_enum flush, 4111 - bool system_chunk) 4112 - { 4113 - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4114 - u64 profile; 4115 - u64 space_size; 4116 - u64 avail; 4117 - u64 used; 4118 - int factor; 4119 - 4120 - /* Don't overcommit when in mixed mode. */ 4121 - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4122 - return 0; 4123 - 4124 - if (system_chunk) 4125 - profile = btrfs_system_alloc_profile(fs_info); 4126 - else 4127 - profile = btrfs_metadata_alloc_profile(fs_info); 4128 - 4129 - used = btrfs_space_info_used(space_info, false); 4130 - 4131 - /* 4132 - * We only want to allow over committing if we have lots of actual space 4133 - * free, but if we don't have enough space to handle the global reserve 4134 - * space then we could end up having a real enospc problem when trying 4135 - * to allocate a chunk or some other such important allocation. 
4136 - */ 4137 - spin_lock(&global_rsv->lock); 4138 - space_size = calc_global_rsv_need_space(global_rsv); 4139 - spin_unlock(&global_rsv->lock); 4140 - if (used + space_size >= space_info->total_bytes) 4141 - return 0; 4142 - 4143 - used += space_info->bytes_may_use; 4144 - 4145 - avail = atomic64_read(&fs_info->free_chunk_space); 4146 - 4147 - /* 4148 - * If we have dup, raid1 or raid10 then only half of the free 4149 - * space is actually usable. For raid56, the space info used 4150 - * doesn't include the parity drive, so we don't have to 4151 - * change the math 4152 - */ 4153 - factor = btrfs_bg_type_to_factor(profile); 4154 - avail = div_u64(avail, factor); 4155 - 4156 - /* 4157 - * If we aren't flushing all things, let us overcommit up to 4158 - * 1/2th of the space. If we can flush, don't let us overcommit 4159 - * too much, let it overcommit up to 1/8 of the space. 4160 - */ 4161 - if (flush == BTRFS_RESERVE_FLUSH_ALL) 4162 - avail >>= 3; 4163 - else 4164 - avail >>= 1; 4165 - 4166 - if (used + bytes < space_info->total_bytes + avail) 4167 - return 1; 4168 - return 0; 4169 - } 4170 - 4171 - static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4172 - unsigned long nr_pages, int nr_items) 4173 - { 4174 - struct super_block *sb = fs_info->sb; 4175 - 4176 - if (down_read_trylock(&sb->s_umount)) { 4177 - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4178 - up_read(&sb->s_umount); 4179 - } else { 4180 - /* 4181 - * We needn't worry the filesystem going from r/w to r/o though 4182 - * we don't acquire ->s_umount mutex, because the filesystem 4183 - * should guarantee the delalloc inodes list be empty after 4184 - * the filesystem is readonly(all dirty pages are written to 4185 - * the disk). 
4186 - */ 4187 - btrfs_start_delalloc_roots(fs_info, nr_items); 4188 - if (!current->journal_info) 4189 - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4190 - } 4191 - } 4192 - 4193 - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4194 - u64 to_reclaim) 4195 - { 4196 - u64 bytes; 4197 - u64 nr; 4198 - 4199 - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4200 - nr = div64_u64(to_reclaim, bytes); 4201 - if (!nr) 4202 - nr = 1; 4203 - return nr; 4204 - } 4205 - 4206 - #define EXTENT_SIZE_PER_ITEM SZ_256K 4207 - 4208 - /* 4209 - * shrink metadata reservation for delalloc 4210 - */ 4211 - static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4212 - u64 orig, bool wait_ordered) 4213 - { 4214 - struct btrfs_space_info *space_info; 4215 - struct btrfs_trans_handle *trans; 4216 - u64 delalloc_bytes; 4217 - u64 dio_bytes; 4218 - u64 async_pages; 4219 - u64 items; 4220 - long time_left; 4221 - unsigned long nr_pages; 4222 - int loops; 4223 - 4224 - /* Calc the number of the pages we need flush for space reservation */ 4225 - items = calc_reclaim_items_nr(fs_info, to_reclaim); 4226 - to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4227 - 4228 - trans = (struct btrfs_trans_handle *)current->journal_info; 4229 - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4230 - 4231 - delalloc_bytes = percpu_counter_sum_positive( 4232 - &fs_info->delalloc_bytes); 4233 - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 4234 - if (delalloc_bytes == 0 && dio_bytes == 0) { 4235 - if (trans) 4236 - return; 4237 - if (wait_ordered) 4238 - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4239 - return; 4240 - } 4241 - 4242 - /* 4243 - * If we are doing more ordered than delalloc we need to just wait on 4244 - * ordered extents, otherwise we'll waste time trying to flush delalloc 4245 - * that likely won't give us the space back we need. 
4246 - */ 4247 - if (dio_bytes > delalloc_bytes) 4248 - wait_ordered = true; 4249 - 4250 - loops = 0; 4251 - while ((delalloc_bytes || dio_bytes) && loops < 3) { 4252 - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 4253 - 4254 - /* 4255 - * Triggers inode writeback for up to nr_pages. This will invoke 4256 - * ->writepages callback and trigger delalloc filling 4257 - * (btrfs_run_delalloc_range()). 4258 - */ 4259 - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4260 - 4261 - /* 4262 - * We need to wait for the compressed pages to start before 4263 - * we continue. 4264 - */ 4265 - async_pages = atomic_read(&fs_info->async_delalloc_pages); 4266 - if (!async_pages) 4267 - goto skip_async; 4268 - 4269 - /* 4270 - * Calculate how many compressed pages we want to be written 4271 - * before we continue. I.e if there are more async pages than we 4272 - * require wait_event will wait until nr_pages are written. 4273 - */ 4274 - if (async_pages <= nr_pages) 4275 - async_pages = 0; 4276 - else 4277 - async_pages -= nr_pages; 4278 - 4279 - wait_event(fs_info->async_submit_wait, 4280 - atomic_read(&fs_info->async_delalloc_pages) <= 4281 - (int)async_pages); 4282 - skip_async: 4283 - spin_lock(&space_info->lock); 4284 - if (list_empty(&space_info->tickets) && 4285 - list_empty(&space_info->priority_tickets)) { 4286 - spin_unlock(&space_info->lock); 4287 - break; 4288 - } 4289 - spin_unlock(&space_info->lock); 4290 - 4291 - loops++; 4292 - if (wait_ordered && !trans) { 4293 - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4294 - } else { 4295 - time_left = schedule_timeout_killable(1); 4296 - if (time_left) 4297 - break; 4298 - } 4299 - delalloc_bytes = percpu_counter_sum_positive( 4300 - &fs_info->delalloc_bytes); 4301 - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 4302 - } 4303 - } 4304 - 4305 - struct reserve_ticket { 4306 - u64 orig_bytes; 4307 - u64 bytes; 4308 - int error; 4309 - struct list_head list; 4310 - wait_queue_head_t 
wait; 4311 - }; 4312 - 4313 - /** 4314 - * maybe_commit_transaction - possibly commit the transaction if its ok to 4315 - * @root - the root we're allocating for 4316 - * @bytes - the number of bytes we want to reserve 4317 - * @force - force the commit 4318 - * 4319 - * This will check to make sure that committing the transaction will actually 4320 - * get us somewhere and then commit the transaction if it does. Otherwise it 4321 - * will return -ENOSPC. 4322 - */ 4323 - static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4324 - struct btrfs_space_info *space_info) 4325 - { 4326 - struct reserve_ticket *ticket = NULL; 4327 - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4328 - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 4329 - struct btrfs_trans_handle *trans; 4330 - u64 bytes_needed; 4331 - u64 reclaim_bytes = 0; 4332 - 4333 - trans = (struct btrfs_trans_handle *)current->journal_info; 4334 - if (trans) 4335 - return -EAGAIN; 4336 - 4337 - spin_lock(&space_info->lock); 4338 - if (!list_empty(&space_info->priority_tickets)) 4339 - ticket = list_first_entry(&space_info->priority_tickets, 4340 - struct reserve_ticket, list); 4341 - else if (!list_empty(&space_info->tickets)) 4342 - ticket = list_first_entry(&space_info->tickets, 4343 - struct reserve_ticket, list); 4344 - bytes_needed = (ticket) ? ticket->bytes : 0; 4345 - spin_unlock(&space_info->lock); 4346 - 4347 - if (!bytes_needed) 4348 - return 0; 4349 - 4350 - trans = btrfs_join_transaction(fs_info->extent_root); 4351 - if (IS_ERR(trans)) 4352 - return PTR_ERR(trans); 4353 - 4354 - /* 4355 - * See if there is enough pinned space to make this reservation, or if 4356 - * we have block groups that are going to be freed, allowing us to 4357 - * possibly do a chunk allocation the next loop through. 
4358 - */ 4359 - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 4360 - __percpu_counter_compare(&space_info->total_bytes_pinned, 4361 - bytes_needed, 4362 - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 4363 - goto commit; 4364 - 4365 - /* 4366 - * See if there is some space in the delayed insertion reservation for 4367 - * this reservation. 4368 - */ 4369 - if (space_info != delayed_rsv->space_info) 4370 - goto enospc; 4371 - 4372 - spin_lock(&delayed_rsv->lock); 4373 - reclaim_bytes += delayed_rsv->reserved; 4374 - spin_unlock(&delayed_rsv->lock); 4375 - 4376 - spin_lock(&delayed_refs_rsv->lock); 4377 - reclaim_bytes += delayed_refs_rsv->reserved; 4378 - spin_unlock(&delayed_refs_rsv->lock); 4379 - if (reclaim_bytes >= bytes_needed) 4380 - goto commit; 4381 - bytes_needed -= reclaim_bytes; 4382 - 4383 - if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4384 - bytes_needed, 4385 - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 4386 - goto enospc; 4387 - 4388 - commit: 4389 - return btrfs_commit_transaction(trans); 4390 - enospc: 4391 - btrfs_end_transaction(trans); 4392 - return -ENOSPC; 4393 - } 4394 - 4395 - /* 4396 - * Try to flush some data based on policy set by @state. This is only advisory 4397 - * and may fail for various reasons. The caller is supposed to examine the 4398 - * state of @space_info to detect the outcome. 
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	/* Each flush state is one reclaim tactic; callers walk the states. */
	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;	/* -1: no item limit — confirm btrfs_run_delayed_items_nr */

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = do_chunk_alloc(trans,
				     btrfs_metadata_alloc_profile(fs_info),
				     (state == ALLOC_CHUNK) ?
				     CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		/* >0 (allocated) and -ENOSPC are both "nothing fatal" here. */
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case COMMIT_TRANS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);

		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

/*
 * Estimate how many bytes to reclaim for the given space_info: the sum of all
 * pending ticket bytes if any, otherwise a heuristic based on how far usage
 * is over the 90/95% watermark, clamped to bytes_may_use + bytes_reserved.
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

/*
 * Decide whether preemptive background reclaim is worth kicking off: usage is
 * above 98% of total, there is something reclaimable, and the fs is neither
 * closing nor remounting.
 */
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * Fail every pending ticket with -ENOSPC and wake its waiter.  Returns true
 * as soon as a ticket shows partial progress (bytes != orig_bytes), i.e. the
 * caller may want to keep flushing instead of giving up entirely.
 */
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		/*
		 * tickets_id unchanged means no ticket was satisfied by the
		 * last flush — escalate to the next state.  Otherwise restart
		 * from the cheapest state.
		 */
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim. We would rather use that than possibly create a
		 * underutilized metadata chunk. So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction. If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

/* Cheap-only flush states used for priority (non-blocking-ALL) reservations. */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

/*
 * Synchronously run the limited priority_flush_states on behalf of a single
 * ticket, stopping early once the ticket is fully satisfied (bytes == 0).
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim,
			    priority_flush_states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < ARRAY_SIZE(priority_flush_states));
}

/*
 * Sleep (killable) until the ticket is satisfied or failed.  On partial
 * progress the already-granted delta is handed back to the space_info via
 * space_info_add_old_bytes().  Returns 0, ticket->error, or -EINTR.
 */
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	u64 reclaim_bytes = 0;
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
	return ret;
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	u64 reclaim_bytes = 0;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * If we have enough space then hooray, make our reservation and carry
	 * on. If not see if we can overcommit, and if we can, hooray carry on.
	 * If not things get more complicated.
	 */
	if (used + orig_bytes <= space_info->total_bytes) {
		update_bytes_may_use(space_info, orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
				  system_chunk)) {
		update_bytes_may_use(space_info, orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		/* ticket lives on our stack; waiters below outlive this scope. */
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket);

	/* Priority path: flush synchronously, then see what the ticket got. */
	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		if (ticket.bytes < orig_bytes)
			reclaim_bytes = orig_bytes - ticket.bytes;
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
	ASSERT(list_empty(&ticket.list));
	return ret;
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int reserve_metadata_bytes(struct btrfs_root *root,
				  struct btrfs_block_rsv *block_rsv,
				  u64 orig_bytes,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	/* During orphan cleanup we may dip into the global rsv as a fallback. */
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			dump_space_info(fs_info, block_rsv->space_info,
					orig_bytes, 0);
	}
	return ret;
}

static struct
btrfs_block_rsv *get_block_rsv(
					const struct btrfs_trans_handle *trans,
					const struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv = NULL;

	/*
	 * Pick the reserve to charge: the transaction's rsv for ref-counted
	 * (COW) roots, csum-root while adding csums, and the uuid root; else
	 * fall back to the root's own rsv, then the always-empty rsv.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
	    (root == fs_info->csum_root && trans->adding_csums) ||
	    (root == fs_info->uuid_root))
		block_rsv = trans->block_rsv;

	if (!block_rsv)
		block_rsv = root->block_rsv;

	if (!block_rsv)
		block_rsv = &fs_info->empty_block_rsv;

	return block_rsv;
}

/* Consume num_bytes from the rsv; -ENOSPC if it holds less than that. */
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes)
{
	int ret = -ENOSPC;
	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved >= num_bytes) {
		block_rsv->reserved -= num_bytes;
		if (block_rsv->reserved < block_rsv->size)
			block_rsv->full = 0;
		ret = 0;
	}
	spin_unlock(&block_rsv->lock);
	return ret;
}

/*
 * Add num_bytes of reserved space to the rsv; if update_size the target size
 * grows with it, otherwise only the full flag is re-evaluated.
 */
static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
				u64 num_bytes, bool update_size)
{
	spin_lock(&block_rsv->lock);
	block_rsv->reserved += num_bytes;
	if (update_size)
		block_rsv->size += num_bytes;
	else if (block_rsv->reserved >= block_rsv->size)
		block_rsv->full = 1;
	spin_unlock(&block_rsv->lock);
}

/*
 * Move num_bytes from the global rsv into dest, but only if the global rsv
 * would still retain at least min_factor/10 of its size afterwards.
 */
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *dest, u64 num_bytes,
			     int min_factor)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 min_bytes;

	if (global_rsv->space_info != dest->space_info)
		return -ENOSPC;

	spin_lock(&global_rsv->lock);
	min_bytes = div_factor(global_rsv->size, min_factor);
	if (global_rsv->reserved < min_bytes + num_bytes) {
		spin_unlock(&global_rsv->lock);
		return -ENOSPC;
	}
	global_rsv->reserved -= num_bytes;
	if (global_rsv->reserved < global_rsv->size)
		global_rsv->full = 0;
	spin_unlock(&global_rsv->lock);

	block_rsv_add_bytes(dest, num_bytes, true);
	return 0;
}

/**
 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
 * @fs_info - the fs info for our fs.
 * @src - the source block rsv to transfer from.
 * @num_bytes - the number of bytes to transfer.
 *
 * This transfers up to the num_bytes amount from the src rsv to the
 * delayed_refs_rsv. Any extra bytes are returned to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *src,
				       u64 num_bytes)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	u64 to_free = 0;

	spin_lock(&src->lock);
	src->reserved -= num_bytes;
	src->size -= num_bytes;
	spin_unlock(&src->lock);

	spin_lock(&delayed_refs_rsv->lock);
	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
		u64 delta = delayed_refs_rsv->size -
			delayed_refs_rsv->reserved;
		/* Only take what the delayed refs rsv is short by. */
		if (num_bytes > delta) {
			to_free = num_bytes - delta;
			num_bytes = delta;
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}

	if (num_bytes)
		delayed_refs_rsv->reserved += num_bytes;
	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
		delayed_refs_rsv->full = 1;
	spin_unlock(&delayed_refs_rsv->lock);

	if (num_bytes)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, num_bytes, 1);
	if (to_free)
		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
					 to_free);
}

/**
 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs
usage. 4993 - * @fs_info - the fs_info for our fs. 4994 - * @flush - control how we can flush for this reservation. 4995 - * 4996 - * This will refill the delayed block_rsv up to 1 items size worth of space and 4997 - * will return -ENOSPC if we can't make the reservation. 4998 - */ 4999 - int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 5000 - enum btrfs_reserve_flush_enum flush) 5001 - { 5002 - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5003 - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); 5004 - u64 num_bytes = 0; 5005 - int ret = -ENOSPC; 5006 - 5007 - spin_lock(&block_rsv->lock); 5008 - if (block_rsv->reserved < block_rsv->size) { 5009 - num_bytes = block_rsv->size - block_rsv->reserved; 5010 - num_bytes = min(num_bytes, limit); 5011 - } 5012 - spin_unlock(&block_rsv->lock); 5013 - 5014 - if (!num_bytes) 5015 - return 0; 5016 - 5017 - ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, 5018 - num_bytes, flush); 5019 - if (ret) 5020 - return ret; 5021 - block_rsv_add_bytes(block_rsv, num_bytes, 0); 5022 - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5023 - 0, num_bytes, 1); 5024 - return 0; 5025 - } 5026 - 5027 - /* 5028 - * This is for space we already have accounted in space_info->bytes_may_use, so 5029 - * basically when we're returning space from block_rsv's. 
5030 - */ 5031 - static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5032 - struct btrfs_space_info *space_info, 5033 - u64 num_bytes) 5034 - { 5035 - struct reserve_ticket *ticket; 5036 - struct list_head *head; 5037 - u64 used; 5038 - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5039 - bool check_overcommit = false; 5040 - 5041 - spin_lock(&space_info->lock); 5042 - head = &space_info->priority_tickets; 5043 - 5044 - /* 5045 - * If we are over our limit then we need to check and see if we can 5046 - * overcommit, and if we can't then we just need to free up our space 5047 - * and not satisfy any requests. 5048 - */ 5049 - used = btrfs_space_info_used(space_info, true); 5050 - if (used - num_bytes >= space_info->total_bytes) 5051 - check_overcommit = true; 5052 - again: 5053 - while (!list_empty(head) && num_bytes) { 5054 - ticket = list_first_entry(head, struct reserve_ticket, 5055 - list); 5056 - /* 5057 - * We use 0 bytes because this space is already reserved, so 5058 - * adding the ticket space would be a double count. 5059 - */ 5060 - if (check_overcommit && 5061 - !can_overcommit(fs_info, space_info, 0, flush, false)) 5062 - break; 5063 - if (num_bytes >= ticket->bytes) { 5064 - list_del_init(&ticket->list); 5065 - num_bytes -= ticket->bytes; 5066 - ticket->bytes = 0; 5067 - space_info->tickets_id++; 5068 - wake_up(&ticket->wait); 5069 - } else { 5070 - ticket->bytes -= num_bytes; 5071 - num_bytes = 0; 5072 - } 5073 - } 5074 - 5075 - if (num_bytes && head == &space_info->priority_tickets) { 5076 - head = &space_info->tickets; 5077 - flush = BTRFS_RESERVE_FLUSH_ALL; 5078 - goto again; 5079 - } 5080 - update_bytes_may_use(space_info, -num_bytes); 5081 - trace_btrfs_space_reservation(fs_info, "space_info", 5082 - space_info->flags, num_bytes, 0); 5083 - spin_unlock(&space_info->lock); 5084 - } 5085 - 5086 - /* 5087 - * This is for newly allocated space that isn't accounted in 5088 - * space_info->bytes_may_use yet. 
So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes)
{
	struct reserve_ticket *ticket;
	/* Priority tickets first, then the normal ticket list. */
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			update_bytes_may_use(space_info, ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			update_bytes_may_use(space_info, num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

/*
 * Shrink block_rsv by num_bytes ((u64)-1 means "everything"), moving any
 * now-excess reserved bytes into dest (if given and not full) and returning
 * the remainder to the space_info.  Returns the number of bytes released;
 * optionally reports how much qgroup reservation became excess.
 */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_rsv *block_rsv,
				   struct btrfs_block_rsv *dest, u64 num_bytes,
				   u64 *qgroup_to_release_ret)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 qgroup_to_release = 0;
	u64 ret;

	spin_lock(&block_rsv->lock);
	if (num_bytes == (u64)-1) {
		num_bytes = block_rsv->size;
		qgroup_to_release = block_rsv->qgroup_rsv_size;
	}
	block_rsv->size -= num_bytes;
	if (block_rsv->reserved >= block_rsv->size) {
		/* num_bytes now becomes the excess actually freed. */
		num_bytes = block_rsv->reserved - block_rsv->size;
		block_rsv->reserved = block_rsv->size;
		block_rsv->full = 1;
	} else {
		num_bytes = 0;
	}
	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
				    block_rsv->qgroup_rsv_size;
		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
	} else {
		qgroup_to_release = 0;
	}
	spin_unlock(&block_rsv->lock);

	ret = num_bytes;
	if (num_bytes > 0) {
		if (dest) {
			spin_lock(&dest->lock);
			if (!dest->full) {
				u64 bytes_to_add;

				bytes_to_add = dest->size - dest->reserved;
				bytes_to_add = min(num_bytes, bytes_to_add);
				dest->reserved += bytes_to_add;
				if (dest->reserved >= dest->size)
					dest->full = 1;
				num_bytes -= bytes_to_add;
			}
			spin_unlock(&dest->lock);
		}
		if (num_bytes)
			space_info_add_old_bytes(fs_info, space_info,
						 num_bytes);
	}
	if (qgroup_to_release_ret)
		*qgroup_to_release_ret = qgroup_to_release;
	return ret;
}

/* Move num_bytes of reserved space from src to dst; -ENOSPC if src is short. */
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
			    struct btrfs_block_rsv *dst, u64 num_bytes,
			    bool update_size)
{
	int ret;

	ret = block_rsv_use_bytes(src, num_bytes);
	if (ret)
		return ret;

	block_rsv_add_bytes(dst, num_bytes, update_size);
	return 0;
}

void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
	memset(rsv, 0, sizeof(*rsv));
	spin_lock_init(&rsv->lock);
	rsv->type = type;
}

void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_rsv *rsv,
				   unsigned short type)
{
	btrfs_init_block_rsv(rsv, type);
	rsv->space_info = __find_space_info(fs_info,
					    BTRFS_BLOCK_GROUP_METADATA);
}

/* Allocate and initialize a metadata block rsv; NULL on allocation failure. */
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
					      unsigned short type)
{
	struct btrfs_block_rsv *block_rsv;

	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
	if (!block_rsv)
		return NULL;

	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
	return block_rsv;
}

/* Release all space held by rsv ((u64)-1) and free it; NULL is a no-op. */
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_rsv *rsv)
{
	if (!rsv)
		return;
	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
	kfree(rsv);
}

/* Reserve num_bytes and grow the rsv's size by the same amount on success. */
int btrfs_block_rsv_add(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			enum btrfs_reserve_flush_enum flush)
{
	int ret;

	if (num_bytes == 0)
		return 0;

	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
	if (!ret)
		block_rsv_add_bytes(block_rsv, num_bytes, true);

	return ret;
}

/*
 * 0 if the rsv holds at least min_factor/10 of its target size, else -ENOSPC.
 * A NULL rsv counts as OK.
 */
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
{
	u64 num_bytes = 0;
	int ret = -ENOSPC;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	num_bytes = div_factor(block_rsv->size, min_factor);
	if (block_rsv->reserved >= num_bytes)
		ret = 0;
	spin_unlock(&block_rsv->lock);

	return ret;
}

/* Top the rsv up to min_reserved bytes, reserving only the shortfall. */
int btrfs_block_rsv_refill(struct btrfs_root *root,
			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
			   enum btrfs_reserve_flush_enum flush)
{
	u64 num_bytes = 0;
	int ret = -ENOSPC;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	num_bytes = min_reserved;
	if (block_rsv->reserved >= num_bytes)
		ret = 0;
	else
		num_bytes -= block_rsv->reserved;
	spin_unlock(&block_rsv->lock);

	if (!ret)
		return 0;

	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
	if (!ret) {
		block_rsv_add_bytes(block_rsv, num_bytes, false);
		return 0;
	}

	return ret;
}

/*
 * Release num_bytes, preferring to route freed excess into the delayed refs
 * rsv, falling back to the global rsv (and to the space_info when the target
 * is in a different space_info).
 */
static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
				     struct btrfs_block_rsv *block_rsv,
				     u64 num_bytes, u64 *qgroup_to_release)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *target = delayed_rsv;

	if (target->full || target == block_rsv)
		target = global_rsv;

	if (block_rsv->space_info != target->space_info)
		target = NULL;

	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
				       qgroup_to_release);
}

void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *block_rsv,
			     u64 num_bytes)
{
	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
}

/**
 * btrfs_inode_rsv_release - release any excessive reservation.
 * @inode - the inode we need to release from.
 * @qgroup_free - free or convert qgroup meta.
 * Unlike normal operation, qgroup meta reservation needs to know if we are
 * freeing qgroup reservation or just converting it into per-trans. Normally
 * @qgroup_free is true for error handling, and false for normal release.
 *
 * This is the same as btrfs_block_rsv_release, except that it handles the
 * tracepoint for the reservation.
 */
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 released = 0;
	u64 qgroup_to_release = 0;

	/*
	 * Since we statically set the block_rsv->size we just want to say we
	 * are releasing 0 bytes, and then we'll just get the reservation over
	 * the size free'd.
	 */
	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
					     &qgroup_to_release);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), released, 0);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
	else
		btrfs_qgroup_convert_reserved_meta(inode->root,
						   qgroup_to_release);
}

/**
 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
 * @fs_info - the fs_info for our fs.
 * @nr - the number of items to drop.
 *
 * This drops the delayed ref head's count from the delayed refs rsv and frees
 * any excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
	u64 released = 0;

	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
					   num_bytes, NULL);
	if (released)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Recompute the global rsv's target size (clamped to [16M, 512M]) and top up
 * or trim its reserved bytes accordingly, keeping bytes_may_use in sync.
 */
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
	struct btrfs_space_info *sinfo = block_rsv->space_info;
	u64 num_bytes;

	/*
	 * The global block rsv is based on the size of the extent tree, the
	 * checksum tree and the root tree. If the fs is empty we want to set
	 * it to a minimal amount for safety.
	 */
	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
		btrfs_root_used(&fs_info->csum_root->root_item) +
		btrfs_root_used(&fs_info->tree_root->root_item);
	num_bytes = max_t(u64, num_bytes, SZ_16M);

	/* Lock order: space_info before its block rsv. */
	spin_lock(&sinfo->lock);
	spin_lock(&block_rsv->lock);

	block_rsv->size = min_t(u64, num_bytes, SZ_512M);

	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = btrfs_space_info_used(sinfo, true);
		if (sinfo->total_bytes > num_bytes) {
			num_bytes = sinfo->total_bytes - num_bytes;
			num_bytes = min(num_bytes,
					block_rsv->size - block_rsv->reserved);
			block_rsv->reserved += num_bytes;
			update_bytes_may_use(sinfo, num_bytes);
			trace_btrfs_space_reservation(fs_info, "space_info",
						      sinfo->flags, num_bytes,
						      1);
		}
	} else if (block_rsv->reserved > block_rsv->size) {
		num_bytes = block_rsv->reserved - block_rsv->size;
		update_bytes_may_use(sinfo, -num_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      sinfo->flags, num_bytes, 0);
		block_rsv->reserved = block_rsv->size;
	}

	if (block_rsv->reserved == block_rsv->size)
		block_rsv->full = 1;
	else
		block_rsv->full = 0;

	spin_unlock(&block_rsv->lock);
	spin_unlock(&sinfo->lock);
}

/* Wire every fs-wide rsv and per-root rsv pointer to its space_info at mount. */
static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;

	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
	fs_info->chunk_block_rsv.space_info = space_info;

	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	fs_info->global_block_rsv.space_info = space_info;
	fs_info->trans_block_rsv.space_info = space_info;
	fs_info->empty_block_rsv.space_info = space_info;
	fs_info->delayed_block_rsv.space_info = space_info;
	fs_info->delayed_refs_rsv.space_info = space_info;

	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
	if (fs_info->quota_root)
		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;

	update_global_block_rsv(fs_info);
}

/* Teardown counterpart: drain the global rsv and warn on any leaked rsv space. */
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				(u64)-1, NULL);
	WARN_ON(fs_info->trans_block_rsv.size > 0);
	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
	WARN_ON(fs_info->chunk_block_rsv.size > 0);
	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_block_rsv.size > 0);
	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}

/*
 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
 * @trans - the trans that may have generated delayed refs
 *
 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
 * it'll calculate the additional size and add it to the delayed_refs_rsv.
5476 - */ 5477 - void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 5478 - { 5479 - struct btrfs_fs_info *fs_info = trans->fs_info; 5480 - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 5481 - u64 num_bytes; 5482 - 5483 - if (!trans->delayed_ref_updates) 5484 - return; 5485 - 5486 - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 5487 - trans->delayed_ref_updates); 5488 - spin_lock(&delayed_rsv->lock); 5489 - delayed_rsv->size += num_bytes; 5490 - delayed_rsv->full = 0; 5491 - spin_unlock(&delayed_rsv->lock); 5492 - trans->delayed_ref_updates = 0; 5493 - } 5494 - 5495 - /* 5496 - * To be called after all the new block groups attached to the transaction 5497 - * handle have been created (btrfs_create_pending_block_groups()). 5498 - */ 5499 - void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5500 - { 5501 - struct btrfs_fs_info *fs_info = trans->fs_info; 5502 - 5503 - if (!trans->chunk_bytes_reserved) 5504 - return; 5505 - 5506 - WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5507 - 5508 - block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5509 - trans->chunk_bytes_reserved, NULL); 5510 - trans->chunk_bytes_reserved = 0; 5511 - } 5512 - 5513 - /* 5514 - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5515 - * root: the root of the parent directory 5516 - * rsv: block reservation 5517 - * items: the number of items that we need do reservation 5518 - * use_global_rsv: allow fallback to the global block reservation 5519 - * 5520 - * This function is used to reserve the space for snapshot/subvolume 5521 - * creation and deletion. Those operations are different with the 5522 - * common file/directory operations, they change two fs/file trees 5523 - * and root tree, the number of items that the qgroup reserves is 5524 - * different with the free space reservation. So we can not use 5525 - * the space reservation mechanism in start_transaction(). 
5526 - */ 5527 - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5528 - struct btrfs_block_rsv *rsv, int items, 5529 - bool use_global_rsv) 5530 - { 5531 - u64 qgroup_num_bytes = 0; 5532 - u64 num_bytes; 5533 - int ret; 5534 - struct btrfs_fs_info *fs_info = root->fs_info; 5535 - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5536 - 5537 - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5538 - /* One for parent inode, two for dir entries */ 5539 - qgroup_num_bytes = 3 * fs_info->nodesize; 5540 - ret = btrfs_qgroup_reserve_meta_prealloc(root, 5541 - qgroup_num_bytes, true); 5542 - if (ret) 5543 - return ret; 5544 - } 5545 - 5546 - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 5547 - rsv->space_info = __find_space_info(fs_info, 5548 - BTRFS_BLOCK_GROUP_METADATA); 5549 - ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5550 - BTRFS_RESERVE_FLUSH_ALL); 5551 - 5552 - if (ret == -ENOSPC && use_global_rsv) 5553 - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); 5554 - 5555 - if (ret && qgroup_num_bytes) 5556 - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 5557 - 5558 - return ret; 5559 - } 5560 - 5561 - void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 5562 - struct btrfs_block_rsv *rsv) 5563 - { 5564 - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5565 - } 5566 - 5567 - static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, 5568 - struct btrfs_inode *inode) 5569 - { 5570 - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5571 - u64 reserve_size = 0; 5572 - u64 qgroup_rsv_size = 0; 5573 - u64 csum_leaves; 5574 - unsigned outstanding_extents; 5575 - 5576 - lockdep_assert_held(&inode->lock); 5577 - outstanding_extents = inode->outstanding_extents; 5578 - if (outstanding_extents) 5579 - reserve_size = btrfs_calc_trans_metadata_size(fs_info, 5580 - outstanding_extents + 1); 5581 - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, 5582 - 
inode->csum_bytes); 5583 - reserve_size += btrfs_calc_trans_metadata_size(fs_info, 5584 - csum_leaves); 5585 - /* 5586 - * For qgroup rsv, the calculation is very simple: 5587 - * account one nodesize for each outstanding extent 5588 - * 5589 - * This is overestimating in most cases. 5590 - */ 5591 - qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; 5592 - 5593 - spin_lock(&block_rsv->lock); 5594 - block_rsv->size = reserve_size; 5595 - block_rsv->qgroup_rsv_size = qgroup_rsv_size; 5596 - spin_unlock(&block_rsv->lock); 5597 - } 5598 - 5599 - static void calc_inode_reservations(struct btrfs_fs_info *fs_info, 5600 - u64 num_bytes, u64 *meta_reserve, 5601 - u64 *qgroup_reserve) 5602 - { 5603 - u64 nr_extents = count_max_extents(num_bytes); 5604 - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); 5605 - 5606 - /* We add one for the inode update at finish ordered time */ 5607 - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, 5608 - nr_extents + csum_leaves + 1); 5609 - *qgroup_reserve = nr_extents * fs_info->nodesize; 5610 - } 5611 - 5612 - int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 5613 - { 5614 - struct btrfs_root *root = inode->root; 5615 - struct btrfs_fs_info *fs_info = root->fs_info; 5616 - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5617 - u64 meta_reserve, qgroup_reserve; 5618 - unsigned nr_extents; 5619 - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5620 - int ret = 0; 5621 - bool delalloc_lock = true; 5622 - 5623 - /* If we are a free space inode we need to not flush since we will be in 5624 - * the middle of a transaction commit. We also don't need the delalloc 5625 - * mutex since we won't race with anybody. We need this mostly to make 5626 - * lockdep shut its filthy mouth. 5627 - * 5628 - * If we have a transaction open (can happen if we call truncate_block 5629 - * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
5630 - */ 5631 - if (btrfs_is_free_space_inode(inode)) { 5632 - flush = BTRFS_RESERVE_NO_FLUSH; 5633 - delalloc_lock = false; 5634 - } else { 5635 - if (current->journal_info) 5636 - flush = BTRFS_RESERVE_FLUSH_LIMIT; 5637 - 5638 - if (btrfs_transaction_in_commit(fs_info)) 5639 - schedule_timeout(1); 5640 - } 5641 - 5642 - if (delalloc_lock) 5643 - mutex_lock(&inode->delalloc_mutex); 5644 - 5645 - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5646 - 5647 - /* 5648 - * We always want to do it this way, every other way is wrong and ends 5649 - * in tears. Pre-reserving the amount we are going to add will always 5650 - * be the right way, because otherwise if we have enough parallelism we 5651 - * could end up with thousands of inodes all holding little bits of 5652 - * reservations they were able to make previously and the only way to 5653 - * reclaim that space is to ENOSPC out the operations and clear 5654 - * everything out and try again, which is bad. This way we just 5655 - * over-reserve slightly, and clean up the mess when we are done. 5656 - */ 5657 - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, 5658 - &qgroup_reserve); 5659 - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); 5660 - if (ret) 5661 - goto out_fail; 5662 - ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); 5663 - if (ret) 5664 - goto out_qgroup; 5665 - 5666 - /* 5667 - * Now we need to update our outstanding extents and csum bytes _first_ 5668 - * and then add the reservation to the block_rsv. This keeps us from 5669 - * racing with an ordered completion or some such that would think it 5670 - * needs to free the reservation we just made. 
5671 - */ 5672 - spin_lock(&inode->lock); 5673 - nr_extents = count_max_extents(num_bytes); 5674 - btrfs_mod_outstanding_extents(inode, nr_extents); 5675 - inode->csum_bytes += num_bytes; 5676 - btrfs_calculate_inode_block_rsv_size(fs_info, inode); 5677 - spin_unlock(&inode->lock); 5678 - 5679 - /* Now we can safely add our space to our block rsv */ 5680 - block_rsv_add_bytes(block_rsv, meta_reserve, false); 5681 - trace_btrfs_space_reservation(root->fs_info, "delalloc", 5682 - btrfs_ino(inode), meta_reserve, 1); 5683 - 5684 - spin_lock(&block_rsv->lock); 5685 - block_rsv->qgroup_rsv_reserved += qgroup_reserve; 5686 - spin_unlock(&block_rsv->lock); 5687 - 5688 - if (delalloc_lock) 5689 - mutex_unlock(&inode->delalloc_mutex); 5690 - return 0; 5691 - out_qgroup: 5692 - btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); 5693 - out_fail: 5694 - btrfs_inode_rsv_release(inode, true); 5695 - if (delalloc_lock) 5696 - mutex_unlock(&inode->delalloc_mutex); 5697 - return ret; 5698 - } 5699 - 5700 - /** 5701 - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5702 - * @inode: the inode to release the reservation for. 5703 - * @num_bytes: the number of bytes we are releasing. 5704 - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation 5705 - * 5706 - * This will release the metadata reservation for an inode. This can be called 5707 - * once we complete IO for a given set of bytes to release their metadata 5708 - * reservations, or on error for the same reason. 
5709 - */ 5710 - void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 5711 - bool qgroup_free) 5712 - { 5713 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 5714 - 5715 - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5716 - spin_lock(&inode->lock); 5717 - inode->csum_bytes -= num_bytes; 5718 - btrfs_calculate_inode_block_rsv_size(fs_info, inode); 5719 - spin_unlock(&inode->lock); 5720 - 5721 - if (btrfs_is_testing(fs_info)) 5722 - return; 5723 - 5724 - btrfs_inode_rsv_release(inode, qgroup_free); 5725 - } 5726 - 5727 - /** 5728 - * btrfs_delalloc_release_extents - release our outstanding_extents 5729 - * @inode: the inode to balance the reservation for. 5730 - * @num_bytes: the number of bytes we originally reserved with 5731 - * @qgroup_free: do we need to free qgroup meta reservation or convert them. 5732 - * 5733 - * When we reserve space we increase outstanding_extents for the extents we may 5734 - * add. Once we've set the range as delalloc or created our ordered extents we 5735 - * have outstanding_extents to track the real usage, so we use this to free our 5736 - * temporarily tracked outstanding_extents. This _must_ be used in conjunction 5737 - * with btrfs_delalloc_reserve_metadata. 
5738 - */ 5739 - void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, 5740 - bool qgroup_free) 5741 - { 5742 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 5743 - unsigned num_extents; 5744 - 5745 - spin_lock(&inode->lock); 5746 - num_extents = count_max_extents(num_bytes); 5747 - btrfs_mod_outstanding_extents(inode, -num_extents); 5748 - btrfs_calculate_inode_block_rsv_size(fs_info, inode); 5749 - spin_unlock(&inode->lock); 5750 - 5751 - if (btrfs_is_testing(fs_info)) 5752 - return; 5753 - 5754 - btrfs_inode_rsv_release(inode, qgroup_free); 5755 - } 5756 - 5757 - /** 5758 - * btrfs_delalloc_reserve_space - reserve data and metadata space for 5759 - * delalloc 5760 - * @inode: inode we're writing to 5761 - * @start: start range we are writing to 5762 - * @len: how long the range we are writing to 5763 - * @reserved: mandatory parameter, record actually reserved qgroup ranges of 5764 - * current reservation. 5765 - * 5766 - * This will do the following things 5767 - * 5768 - * o reserve space in data space info for num bytes 5769 - * and reserve precious corresponding qgroup space 5770 - * (Done in check_data_free_space) 5771 - * 5772 - * o reserve space for metadata space, based on the number of outstanding 5773 - * extents and how much csums will be needed 5774 - * also reserve metadata space in a per root over-reserve method. 5775 - * o add to the inodes->delalloc_bytes 5776 - * o add it to the fs_info's delalloc inodes list. 
5777 - * (Above 3 all done in delalloc_reserve_metadata) 5778 - * 5779 - * Return 0 for success 5780 - * Return <0 for error(-ENOSPC or -EQUOT) 5781 - */ 5782 - int btrfs_delalloc_reserve_space(struct inode *inode, 5783 - struct extent_changeset **reserved, u64 start, u64 len) 5784 - { 5785 - int ret; 5786 - 5787 - ret = btrfs_check_data_free_space(inode, reserved, start, len); 5788 - if (ret < 0) 5789 - return ret; 5790 - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 5791 - if (ret < 0) 5792 - btrfs_free_reserved_data_space(inode, *reserved, start, len); 5793 - return ret; 5794 - } 5795 - 5796 - /** 5797 - * btrfs_delalloc_release_space - release data and metadata space for delalloc 5798 - * @inode: inode we're releasing space for 5799 - * @start: start position of the space already reserved 5800 - * @len: the len of the space already reserved 5801 - * @release_bytes: the len of the space we consumed or didn't use 5802 - * 5803 - * This function will release the metadata space that was not used and will 5804 - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5805 - * list if there are no delalloc bytes left. 5806 - * Also it will handle the qgroup reserved space. 
5807 - */ 5808 - void btrfs_delalloc_release_space(struct inode *inode, 5809 - struct extent_changeset *reserved, 5810 - u64 start, u64 len, bool qgroup_free) 5811 - { 5812 - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); 5813 - btrfs_free_reserved_data_space(inode, reserved, start, len); 5814 - } 5815 - 5816 4528 static int update_block_group(struct btrfs_trans_handle *trans, 5817 4529 u64 bytenr, u64 num_bytes, int alloc) 5818 4530 { ··· 4168 6296 old_val -= num_bytes; 4169 6297 btrfs_set_block_group_used(&cache->item, old_val); 4170 6298 cache->pinned += num_bytes; 4171 - update_bytes_pinned(cache->space_info, num_bytes); 6299 + btrfs_space_info_update_bytes_pinned(info, 6300 + cache->space_info, num_bytes); 4172 6301 cache->space_info->bytes_used -= num_bytes; 4173 6302 cache->space_info->disk_used -= num_bytes * factor; 4174 6303 spin_unlock(&cache->lock); ··· 4244 6371 spin_lock(&cache->space_info->lock); 4245 6372 spin_lock(&cache->lock); 4246 6373 cache->pinned += num_bytes; 4247 - update_bytes_pinned(cache->space_info, num_bytes); 6374 + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, 6375 + num_bytes); 4248 6376 if (reserved) { 4249 6377 cache->reserved -= num_bytes; 4250 6378 cache->space_info->bytes_reserved -= num_bytes; ··· 4454 6580 } else { 4455 6581 cache->reserved += num_bytes; 4456 6582 space_info->bytes_reserved += num_bytes; 4457 - update_bytes_may_use(space_info, -ram_bytes); 6583 + btrfs_space_info_update_bytes_may_use(cache->fs_info, 6584 + space_info, -ram_bytes); 4458 6585 if (delalloc) 4459 6586 cache->delalloc_bytes += num_bytes; 4460 6587 } ··· 4521 6646 4522 6647 up_write(&fs_info->commit_root_sem); 4523 6648 4524 - update_global_block_rsv(fs_info); 6649 + btrfs_update_global_block_rsv(fs_info); 4525 6650 } 4526 6651 4527 6652 /* ··· 4611 6736 spin_lock(&space_info->lock); 4612 6737 spin_lock(&cache->lock); 4613 6738 cache->pinned -= len; 4614 - update_bytes_pinned(space_info, -len); 6739 + 
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 4615 6740 4616 6741 trace_btrfs_space_reservation(fs_info, "pinned", 4617 6742 space_info->flags, len, 0); ··· 4632 6757 to_add = min(len, global_rsv->size - 4633 6758 global_rsv->reserved); 4634 6759 global_rsv->reserved += to_add; 4635 - update_bytes_may_use(space_info, to_add); 6760 + btrfs_space_info_update_bytes_may_use(fs_info, 6761 + space_info, to_add); 4636 6762 if (global_rsv->reserved >= global_rsv->size) 4637 6763 global_rsv->full = 1; 4638 6764 trace_btrfs_space_reservation(fs_info, ··· 4645 6769 spin_unlock(&global_rsv->lock); 4646 6770 /* Add to any tickets we may have */ 4647 6771 if (len) 4648 - space_info_add_new_bytes(fs_info, space_info, 4649 - len); 6772 + btrfs_space_info_add_new_bytes(fs_info, 6773 + space_info, len); 4650 6774 } 4651 6775 spin_unlock(&space_info->lock); 4652 6776 } ··· 5067 7191 } 5068 7192 out: 5069 7193 if (pin) 5070 - add_pinned_bytes(fs_info, &generic_ref, 1); 7194 + add_pinned_bytes(fs_info, &generic_ref); 5071 7195 5072 7196 if (last_ref) { 5073 7197 /* ··· 5115 7239 btrfs_ref_tree_mod(fs_info, ref); 5116 7240 5117 7241 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) 5118 - add_pinned_bytes(fs_info, ref, 1); 7242 + add_pinned_bytes(fs_info, ref); 5119 7243 5120 7244 return ret; 5121 7245 } ··· 5168 7292 } 5169 7293 5170 7294 enum btrfs_loop_type { 5171 - LOOP_CACHING_NOWAIT = 0, 5172 - LOOP_CACHING_WAIT = 1, 5173 - LOOP_ALLOC_CHUNK = 2, 5174 - LOOP_NO_EMPTY_SIZE = 3, 7295 + LOOP_CACHING_NOWAIT, 7296 + LOOP_CACHING_WAIT, 7297 + LOOP_ALLOC_CHUNK, 7298 + LOOP_NO_EMPTY_SIZE, 5175 7299 }; 5176 7300 5177 7301 static inline void ··· 5537 7661 return ret; 5538 7662 } 5539 7663 5540 - ret = do_chunk_alloc(trans, ffe_ctl->flags, 5541 - CHUNK_ALLOC_FORCE); 7664 + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, 7665 + CHUNK_ALLOC_FORCE); 5542 7666 5543 7667 /* 5544 7668 * If we can't allocate a new chunk we've already looped ··· 5634 7758 5635 7759 
trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 5636 7760 5637 - space_info = __find_space_info(fs_info, flags); 7761 + space_info = btrfs_find_space_info(fs_info, flags); 5638 7762 if (!space_info) { 5639 7763 btrfs_err(fs_info, "No space info for %llu", flags); 5640 7764 return -ENOSPC; ··· 5739 7863 */ 5740 7864 if (!block_group_bits(block_group, flags)) { 5741 7865 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5742 - BTRFS_BLOCK_GROUP_RAID1 | 5743 - BTRFS_BLOCK_GROUP_RAID5 | 5744 - BTRFS_BLOCK_GROUP_RAID6 | 7866 + BTRFS_BLOCK_GROUP_RAID1_MASK | 7867 + BTRFS_BLOCK_GROUP_RAID56_MASK | 5745 7868 BTRFS_BLOCK_GROUP_RAID10; 5746 7869 5747 7870 /* ··· 5859 7984 return ret; 5860 7985 } 5861 7986 5862 - #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 5863 - do { \ 5864 - struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 5865 - spin_lock(&__rsv->lock); \ 5866 - btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 5867 - __rsv->size, __rsv->reserved); \ 5868 - spin_unlock(&__rsv->lock); \ 5869 - } while (0) 5870 - 5871 - static void dump_space_info(struct btrfs_fs_info *fs_info, 5872 - struct btrfs_space_info *info, u64 bytes, 5873 - int dump_block_groups) 5874 - { 5875 - struct btrfs_block_group_cache *cache; 5876 - int index = 0; 5877 - 5878 - spin_lock(&info->lock); 5879 - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 5880 - info->flags, 5881 - info->total_bytes - btrfs_space_info_used(info, true), 5882 - info->full ? 
"" : "not "); 5883 - btrfs_info(fs_info, 5884 - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 5885 - info->total_bytes, info->bytes_used, info->bytes_pinned, 5886 - info->bytes_reserved, info->bytes_may_use, 5887 - info->bytes_readonly); 5888 - spin_unlock(&info->lock); 5889 - 5890 - DUMP_BLOCK_RSV(fs_info, global_block_rsv); 5891 - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 5892 - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 5893 - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 5894 - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 5895 - 5896 - if (!dump_block_groups) 5897 - return; 5898 - 5899 - down_read(&info->groups_sem); 5900 - again: 5901 - list_for_each_entry(cache, &info->block_groups[index], list) { 5902 - spin_lock(&cache->lock); 5903 - btrfs_info(fs_info, 5904 - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 5905 - cache->key.objectid, cache->key.offset, 5906 - btrfs_block_group_used(&cache->item), cache->pinned, 5907 - cache->reserved, cache->ro ? "[readonly]" : ""); 5908 - btrfs_dump_free_space(cache, bytes); 5909 - spin_unlock(&cache->lock); 5910 - } 5911 - if (++index < BTRFS_NR_RAID_TYPES) 5912 - goto again; 5913 - up_read(&info->groups_sem); 5914 - } 5915 - 5916 7987 /* 5917 7988 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 5918 7989 * hole that is at least as big as @num_bytes. 
··· 5934 8113 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 5935 8114 struct btrfs_space_info *sinfo; 5936 8115 5937 - sinfo = __find_space_info(fs_info, flags); 8116 + sinfo = btrfs_find_space_info(fs_info, flags); 5938 8117 btrfs_err(fs_info, 5939 8118 "allocation failed flags %llu, wanted %llu", 5940 8119 flags, num_bytes); 5941 8120 if (sinfo) 5942 - dump_space_info(fs_info, sinfo, num_bytes, 1); 8121 + btrfs_dump_space_info(fs_info, sinfo, 8122 + num_bytes, 1); 5943 8123 } 5944 8124 } 5945 8125 ··· 6278 8456 return buf; 6279 8457 } 6280 8458 6281 - static struct btrfs_block_rsv * 6282 - use_block_rsv(struct btrfs_trans_handle *trans, 6283 - struct btrfs_root *root, u32 blocksize) 6284 - { 6285 - struct btrfs_fs_info *fs_info = root->fs_info; 6286 - struct btrfs_block_rsv *block_rsv; 6287 - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6288 - int ret; 6289 - bool global_updated = false; 6290 - 6291 - block_rsv = get_block_rsv(trans, root); 6292 - 6293 - if (unlikely(block_rsv->size == 0)) 6294 - goto try_reserve; 6295 - again: 6296 - ret = block_rsv_use_bytes(block_rsv, blocksize); 6297 - if (!ret) 6298 - return block_rsv; 6299 - 6300 - if (block_rsv->failfast) 6301 - return ERR_PTR(ret); 6302 - 6303 - if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 6304 - global_updated = true; 6305 - update_global_block_rsv(fs_info); 6306 - goto again; 6307 - } 6308 - 6309 - /* 6310 - * The global reserve still exists to save us from ourselves, so don't 6311 - * warn_on if we are short on our delayed refs reserve. 
6312 - */ 6313 - if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && 6314 - btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 6315 - static DEFINE_RATELIMIT_STATE(_rs, 6316 - DEFAULT_RATELIMIT_INTERVAL * 10, 6317 - /*DEFAULT_RATELIMIT_BURST*/ 1); 6318 - if (__ratelimit(&_rs)) 6319 - WARN(1, KERN_DEBUG 6320 - "BTRFS: block rsv returned %d\n", ret); 6321 - } 6322 - try_reserve: 6323 - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6324 - BTRFS_RESERVE_NO_FLUSH); 6325 - if (!ret) 6326 - return block_rsv; 6327 - /* 6328 - * If we couldn't reserve metadata bytes try and use some from 6329 - * the global reserve if its space type is the same as the global 6330 - * reservation. 6331 - */ 6332 - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 6333 - block_rsv->space_info == global_rsv->space_info) { 6334 - ret = block_rsv_use_bytes(global_rsv, blocksize); 6335 - if (!ret) 6336 - return global_rsv; 6337 - } 6338 - return ERR_PTR(ret); 6339 - } 6340 - 6341 - static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 6342 - struct btrfs_block_rsv *block_rsv, u32 blocksize) 6343 - { 6344 - block_rsv_add_bytes(block_rsv, blocksize, false); 6345 - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); 6346 - } 6347 - 6348 8459 /* 6349 8460 * finds a free extent and does all the dirty work required for allocation 6350 8461 * returns the tree buffer or an ERR_PTR on error. 
··· 6310 8555 } 6311 8556 #endif 6312 8557 6313 - block_rsv = use_block_rsv(trans, root, blocksize); 8558 + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 6314 8559 if (IS_ERR(block_rsv)) 6315 8560 return ERR_CAST(block_rsv); 6316 8561 ··· 6368 8613 out_free_reserved: 6369 8614 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 6370 8615 out_unuse: 6371 - unuse_block_rsv(fs_info, block_rsv, blocksize); 8616 + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 6372 8617 return ERR_PTR(ret); 6373 8618 } 6374 8619 ··· 7307 9552 7308 9553 num_devices = fs_info->fs_devices->rw_devices; 7309 9554 7310 - stripped = BTRFS_BLOCK_GROUP_RAID0 | 7311 - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7312 - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9555 + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | 9556 + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; 7313 9557 7314 9558 if (num_devices == 1) { 7315 9559 stripped |= BTRFS_BLOCK_GROUP_DUP; ··· 7319 9565 return stripped; 7320 9566 7321 9567 /* turn mirroring into duplication */ 7322 - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9568 + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | 7323 9569 BTRFS_BLOCK_GROUP_RAID10)) 7324 9570 return stripped | BTRFS_BLOCK_GROUP_DUP; 7325 9571 } else { ··· 7390 9636 btrfs_info(cache->fs_info, 7391 9637 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 7392 9638 sinfo_used, num_bytes, min_allocable_bytes); 7393 - dump_space_info(cache->fs_info, cache->space_info, 0, 0); 9639 + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 7394 9640 } 7395 9641 return ret; 7396 9642 } ··· 7432 9678 */ 7433 9679 alloc_flags = update_block_group_flags(fs_info, cache->flags); 7434 9680 if (alloc_flags != cache->flags) { 7435 - ret = do_chunk_alloc(trans, alloc_flags, 7436 - CHUNK_ALLOC_FORCE); 9681 + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7437 9682 /* 7438 9683 * ENOSPC is allowed here, we may have 
enough space 7439 9684 * already allocated at the new raid level to ··· 7448 9695 if (!ret) 7449 9696 goto out; 7450 9697 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 7451 - ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9698 + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7452 9699 if (ret < 0) 7453 9700 goto out; 7454 9701 ret = inc_block_group_ro(cache, 0); ··· 7469 9716 { 7470 9717 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 7471 9718 7472 - return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9719 + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7473 9720 } 7474 9721 7475 9722 /* ··· 7702 9949 struct extent_map_tree *em_tree; 7703 9950 struct extent_map *em; 7704 9951 7705 - em_tree = &root->fs_info->mapping_tree.map_tree; 9952 + em_tree = &root->fs_info->mapping_tree; 7706 9953 read_lock(&em_tree->lock); 7707 9954 em = lookup_extent_mapping(em_tree, found_key.objectid, 7708 9955 found_key.offset); ··· 7855 10102 */ 7856 10103 synchronize_rcu(); 7857 10104 7858 - release_global_block_rsv(info); 10105 + btrfs_release_global_block_rsv(info); 7859 10106 7860 10107 while (!list_empty(&info->space_info)) { 7861 10108 int i; ··· 7871 10118 if (WARN_ON(space_info->bytes_pinned > 0 || 7872 10119 space_info->bytes_reserved > 0 || 7873 10120 space_info->bytes_may_use > 0)) 7874 - dump_space_info(info, space_info, 0, 0); 10121 + btrfs_dump_space_info(info, space_info, 0, 0); 7875 10122 list_del(&space_info->list); 7876 10123 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 7877 10124 struct kobject *kobj; ··· 7894 10141 struct btrfs_space_info *space_info; 7895 10142 struct raid_kobject *rkobj; 7896 10143 LIST_HEAD(list); 7897 - int index; 7898 10144 int ret = 0; 7899 10145 7900 10146 spin_lock(&fs_info->pending_raid_kobjs_lock); ··· 7901 10149 spin_unlock(&fs_info->pending_raid_kobjs_lock); 7902 10150 7903 10151 list_for_each_entry(rkobj, &list, list) { 7904 - space_info = 
__find_space_info(fs_info, rkobj->flags); 7905 - index = btrfs_bg_flags_to_raid_index(rkobj->flags); 10152 + space_info = btrfs_find_space_info(fs_info, rkobj->flags); 7906 10153 7907 10154 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 7908 - "%s", get_raid_name(index)); 10155 + "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); 7909 10156 if (ret) { 7910 10157 kobject_put(&rkobj->kobj); 7911 10158 break; ··· 7994 10243 */ 7995 10244 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 7996 10245 { 7997 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 10246 + struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7998 10247 struct extent_map *em; 7999 10248 struct btrfs_block_group_cache *bg; 8000 10249 u64 start = 0; 8001 10250 int ret = 0; 8002 10251 8003 10252 while (1) { 8004 - read_lock(&map_tree->map_tree.lock); 10253 + read_lock(&map_tree->lock); 8005 10254 /* 8006 10255 * lookup_extent_mapping will return the first extent map 8007 10256 * intersecting the range, so setting @len to 1 is enough to 8008 10257 * get the first chunk. 
8009 10258 */ 8010 - em = lookup_extent_mapping(&map_tree->map_tree, start, 1); 8011 - read_unlock(&map_tree->map_tree.lock); 10259 + em = lookup_extent_mapping(map_tree, start, 1); 10260 + read_unlock(&map_tree->lock); 8012 10261 if (!em) 8013 10262 break; 8014 10263 ··· 8168 10417 } 8169 10418 8170 10419 trace_btrfs_add_block_group(info, cache, 0); 8171 - update_space_info(info, cache->flags, found_key.offset, 8172 - btrfs_block_group_used(&cache->item), 8173 - cache->bytes_super, &space_info); 10420 + btrfs_update_space_info(info, cache->flags, found_key.offset, 10421 + btrfs_block_group_used(&cache->item), 10422 + cache->bytes_super, &space_info); 8174 10423 8175 10424 cache->space_info = space_info; 8176 10425 ··· 8188 10437 list_for_each_entry_rcu(space_info, &info->space_info, list) { 8189 10438 if (!(get_alloc_profile(info, space_info->flags) & 8190 10439 (BTRFS_BLOCK_GROUP_RAID10 | 8191 - BTRFS_BLOCK_GROUP_RAID1 | 8192 - BTRFS_BLOCK_GROUP_RAID5 | 8193 - BTRFS_BLOCK_GROUP_RAID6 | 10440 + BTRFS_BLOCK_GROUP_RAID1_MASK | 10441 + BTRFS_BLOCK_GROUP_RAID56_MASK | 8194 10442 BTRFS_BLOCK_GROUP_DUP))) 8195 10443 continue; 8196 10444 /* ··· 8207 10457 } 8208 10458 8209 10459 btrfs_add_raid_kobjects(info); 8210 - init_global_block_rsv(info); 10460 + btrfs_init_global_block_rsv(info); 8211 10461 ret = check_chunk_block_group_mappings(info); 8212 10462 error: 8213 10463 btrfs_free_path(path); ··· 8304 10554 * assigned to our block group. We want our bg to be added to the rbtree 8305 10555 * with its ->space_info set. 8306 10556 */ 8307 - cache->space_info = __find_space_info(fs_info, cache->flags); 10557 + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 8308 10558 ASSERT(cache->space_info); 8309 10559 8310 10560 ret = btrfs_add_block_group_cache(fs_info, cache); ··· 8319 10569 * the rbtree, update the space info's counters. 
8320 10570 */ 8321 10571 trace_btrfs_add_block_group(fs_info, cache, 1); 8322 - update_space_info(fs_info, cache->flags, size, bytes_used, 10572 + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, 8323 10573 cache->bytes_super, &cache->space_info); 8324 - update_global_block_rsv(fs_info); 10574 + btrfs_update_global_block_rsv(fs_info); 8325 10575 8326 10576 link_block_group(cache); 8327 10577 ··· 8346 10596 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8347 10597 fs_info->avail_system_alloc_bits &= ~extra_flags; 8348 10598 write_sequnlock(&fs_info->profiles_lock); 10599 + } 10600 + 10601 + /* 10602 + * Clear incompat bits for the following feature(s): 10603 + * 10604 + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group 10605 + * in the whole filesystem 10606 + */ 10607 + static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) 10608 + { 10609 + if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { 10610 + struct list_head *head = &fs_info->space_info; 10611 + struct btrfs_space_info *sinfo; 10612 + 10613 + list_for_each_entry_rcu(sinfo, head, list) { 10614 + bool found = false; 10615 + 10616 + down_read(&sinfo->groups_sem); 10617 + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) 10618 + found = true; 10619 + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) 10620 + found = true; 10621 + up_read(&sinfo->groups_sem); 10622 + 10623 + if (found) 10624 + return; 10625 + } 10626 + btrfs_clear_fs_incompat(fs_info, RAID56); 10627 + } 8349 10628 } 8350 10629 8351 10630 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ··· 8523 10744 clear_avail_alloc_bits(fs_info, block_group->flags); 8524 10745 } 8525 10746 up_write(&block_group->space_info->groups_sem); 10747 + clear_incompat_bg_bits(fs_info, block_group->flags); 8526 10748 if (kobj) { 8527 10749 kobject_del(kobj); 8528 10750 kobject_put(kobj); ··· 8633 10853 if (remove_em) { 8634 10854 struct extent_map_tree *em_tree; 8635 10855 8636 - em_tree = 
&fs_info->mapping_tree.map_tree; 10856 + em_tree = &fs_info->mapping_tree; 8637 10857 write_lock(&em_tree->lock); 8638 10858 remove_extent_mapping(em_tree, em); 8639 10859 write_unlock(&em_tree->lock); ··· 8651 10871 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 8652 10872 const u64 chunk_offset) 8653 10873 { 8654 - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10874 + struct extent_map_tree *em_tree = &fs_info->mapping_tree; 8655 10875 struct extent_map *em; 8656 10876 struct map_lookup *map; 8657 10877 unsigned int num_items; ··· 8800 11020 spin_lock(&space_info->lock); 8801 11021 spin_lock(&block_group->lock); 8802 11022 8803 - update_bytes_pinned(space_info, -block_group->pinned); 11023 + btrfs_space_info_update_bytes_pinned(fs_info, space_info, 11024 + -block_group->pinned); 8804 11025 space_info->bytes_readonly += block_group->pinned; 8805 11026 percpu_counter_add_batch(&space_info->total_bytes_pinned, 8806 11027 -block_group->pinned, ··· 8855 11074 spin_lock(&fs_info->unused_bgs_lock); 8856 11075 } 8857 11076 spin_unlock(&fs_info->unused_bgs_lock); 8858 - } 8859 - 8860 - int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 8861 - { 8862 - struct btrfs_super_block *disk_super; 8863 - u64 features; 8864 - u64 flags; 8865 - int mixed = 0; 8866 - int ret; 8867 - 8868 - disk_super = fs_info->super_copy; 8869 - if (!btrfs_super_root(disk_super)) 8870 - return -EINVAL; 8871 - 8872 - features = btrfs_super_incompat_flags(disk_super); 8873 - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 8874 - mixed = 1; 8875 - 8876 - flags = BTRFS_BLOCK_GROUP_SYSTEM; 8877 - ret = create_space_info(fs_info, flags); 8878 - if (ret) 8879 - goto out; 8880 - 8881 - if (mixed) { 8882 - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 8883 - ret = create_space_info(fs_info, flags); 8884 - } else { 8885 - flags = BTRFS_BLOCK_GROUP_METADATA; 8886 - ret = create_space_info(fs_info, flags); 8887 - if (ret) 8888 - goto out; 8889 - 
8890 - flags = BTRFS_BLOCK_GROUP_DATA; 8891 - ret = create_space_info(fs_info, flags); 8892 - } 8893 - out: 8894 - return ret; 8895 11077 } 8896 11078 8897 11079 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, ··· 8915 11171 find_first_clear_extent_bit(&device->alloc_state, start, 8916 11172 &start, &end, 8917 11173 CHUNK_TRIMMED | CHUNK_ALLOCATED); 11174 + 11175 + /* Ensure we skip the reserved area in the first 1M */ 11176 + start = max_t(u64, start, SZ_1M); 11177 + 8918 11178 /* 8919 11179 * If find_first_clear_extent_bit find a range that spans the 8920 11180 * end of the device it will set end to -1, in this case it's up 8921 11181 * to the caller to trim the value to the size of the device. 8922 11182 */ 8923 11183 end = min(end, device->total_bytes - 1); 11184 + 8924 11185 len = end - start + 1; 8925 11186 8926 11187 /* We didn't find any extents */
+99 -50
fs/btrfs/extent_io.c
··· 359 359 return NULL; 360 360 } 361 361 362 + /** 363 + * __etree_search - searche @tree for an entry that contains @offset. Such 364 + * entry would have entry->start <= offset && entry->end >= offset. 365 + * 366 + * @tree - the tree to search 367 + * @offset - offset that should fall within an entry in @tree 368 + * @next_ret - pointer to the first entry whose range ends after @offset 369 + * @prev - pointer to the first entry whose range begins before @offset 370 + * @p_ret - pointer where new node should be anchored (used when inserting an 371 + * entry in the tree) 372 + * @parent_ret - points to entry which would have been the parent of the entry, 373 + * containing @offset 374 + * 375 + * This function returns a pointer to the entry that contains @offset byte 376 + * address. If no such entry exists, then NULL is returned and the other 377 + * pointer arguments to the function are filled, otherwise the found entry is 378 + * returned and other pointers are left untouched. 379 + */ 362 380 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 363 381 struct rb_node **next_ret, 364 382 struct rb_node **prev_ret, ··· 522 504 { 523 505 struct rb_node *node; 524 506 525 - if (end < start) 526 - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 527 - end, start); 507 + if (end < start) { 508 + btrfs_err(tree->fs_info, 509 + "insert state: end < start %llu %llu", end, start); 510 + WARN_ON(1); 511 + } 528 512 state->start = start; 529 513 state->end = end; 530 514 ··· 536 516 if (node) { 537 517 struct extent_state *found; 538 518 found = rb_entry(node, struct extent_state, rb_node); 539 - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 519 + btrfs_err(tree->fs_info, 520 + "found node %llu %llu on insert of %llu %llu", 540 521 found->start, found->end, start, end); 541 522 return -EEXIST; 542 523 } ··· 1558 1537 } 1559 1538 1560 1539 /** 1561 - * find_first_clear_extent_bit - finds the first range that has @bits not set 
1562 - * and that starts after @start 1540 + * find_first_clear_extent_bit - find the first range that has @bits not set. 1541 + * This range could start before @start. 1563 1542 * 1564 1543 * @tree - the tree to search 1565 1544 * @start - the offset at/after which the found extent should start ··· 1599 1578 goto out; 1600 1579 } 1601 1580 } 1581 + /* 1582 + * At this point 'node' either contains 'start' or start is 1583 + * before 'node' 1584 + */ 1602 1585 state = rb_entry(node, struct extent_state, rb_node); 1603 - if (in_range(start, state->start, state->end - state->start + 1) && 1604 - (state->state & bits)) { 1605 - start = state->end + 1; 1586 + 1587 + if (in_range(start, state->start, state->end - state->start + 1)) { 1588 + if (state->state & bits) { 1589 + /* 1590 + * |--range with bits sets--| 1591 + * | 1592 + * start 1593 + */ 1594 + start = state->end + 1; 1595 + } else { 1596 + /* 1597 + * 'start' falls within a range that doesn't 1598 + * have the bits set, so take its start as 1599 + * the beginning of the desired range 1600 + * 1601 + * |--range with bits cleared----| 1602 + * | 1603 + * start 1604 + */ 1605 + *start_ret = state->start; 1606 + break; 1607 + } 1606 1608 } else { 1607 - *start_ret = start; 1609 + /* 1610 + * |---prev range---|---hole/unset---|---node range---| 1611 + * | 1612 + * start 1613 + * 1614 + * or 1615 + * 1616 + * |---hole/unset--||--first node--| 1617 + * 0 | 1618 + * start 1619 + */ 1620 + if (prev) { 1621 + state = rb_entry(prev, struct extent_state, 1622 + rb_node); 1623 + *start_ret = state->end + 1; 1624 + } else { 1625 + *start_ret = 0; 1626 + } 1608 1627 break; 1609 1628 } 1610 1629 } ··· 1780 1719 */ 1781 1720 EXPORT_FOR_TESTS 1782 1721 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 1783 - struct extent_io_tree *tree, 1784 1722 struct page *locked_page, u64 *start, 1785 1723 u64 *end) 1786 1724 { 1725 + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1787 1726 u64 max_bytes = 
BTRFS_MAX_EXTENT_SIZE; 1788 1727 u64 delalloc_start; 1789 1728 u64 delalloc_end; ··· 2861 2800 * never fail. We're returning a bio right now but you can call btrfs_io_bio 2862 2801 * for the appropriate container_of magic 2863 2802 */ 2864 - struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) 2803 + struct bio *btrfs_bio_alloc(u64 first_byte) 2865 2804 { 2866 2805 struct bio *bio; 2867 2806 2868 2807 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); 2869 - bio_set_dev(bio, bdev); 2870 2808 bio->bi_iter.bi_sector = first_byte >> 9; 2871 2809 btrfs_io_bio_init(btrfs_io_bio(bio)); 2872 2810 return bio; ··· 2976 2916 } 2977 2917 } 2978 2918 2979 - bio = btrfs_bio_alloc(bdev, offset); 2919 + bio = btrfs_bio_alloc(offset); 2920 + bio_set_dev(bio, bdev); 2980 2921 bio_add_page(bio, page, page_size, pg_offset); 2981 2922 bio->bi_end_io = end_io_func; 2982 2923 bio->bi_private = tree; ··· 3265 3204 unsigned long *bio_flags, 3266 3205 u64 *prev_em_start) 3267 3206 { 3268 - struct inode *inode; 3269 - struct btrfs_ordered_extent *ordered; 3207 + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3270 3208 int index; 3271 3209 3272 - inode = pages[0]->mapping->host; 3273 - while (1) { 3274 - lock_extent(tree, start, end); 3275 - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3276 - end - start + 1); 3277 - if (!ordered) 3278 - break; 3279 - unlock_extent(tree, start, end); 3280 - btrfs_start_ordered_extent(inode, ordered, 1); 3281 - btrfs_put_ordered_extent(ordered); 3282 - } 3210 + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3283 3211 3284 3212 for (index = 0; index < nr_pages; index++) { 3285 3213 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, ··· 3284 3234 unsigned long *bio_flags, 3285 3235 unsigned int read_flags) 3286 3236 { 3287 - struct inode *inode = page->mapping->host; 3288 - struct btrfs_ordered_extent *ordered; 3237 + struct btrfs_inode *inode = 
BTRFS_I(page->mapping->host); 3289 3238 u64 start = page_offset(page); 3290 3239 u64 end = start + PAGE_SIZE - 1; 3291 3240 int ret; 3292 3241 3293 - while (1) { 3294 - lock_extent(tree, start, end); 3295 - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3296 - PAGE_SIZE); 3297 - if (!ordered) 3298 - break; 3299 - unlock_extent(tree, start, end); 3300 - btrfs_start_ordered_extent(inode, ordered, 1); 3301 - btrfs_put_ordered_extent(ordered); 3302 - } 3242 + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3303 3243 3304 3244 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3305 3245 bio_flags, read_flags, NULL); ··· 3330 3290 struct page *page, struct writeback_control *wbc, 3331 3291 u64 delalloc_start, unsigned long *nr_written) 3332 3292 { 3333 - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 3334 3293 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3335 3294 bool found; 3336 3295 u64 delalloc_to_write = 0; ··· 3339 3300 3340 3301 3341 3302 while (delalloc_end < page_end) { 3342 - found = find_lock_delalloc_range(inode, tree, 3343 - page, 3303 + found = find_lock_delalloc_range(inode, page, 3344 3304 &delalloc_start, 3345 3305 &delalloc_end); 3346 3306 if (!found) { ··· 3348 3310 } 3349 3311 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3350 3312 delalloc_end, &page_started, nr_written, wbc); 3351 - /* File system has been set read-only */ 3352 3313 if (ret) { 3353 3314 SetPageError(page); 3354 3315 /* ··· 4579 4542 struct btrfs_path *path; 4580 4543 struct btrfs_root *root = BTRFS_I(inode)->root; 4581 4544 struct fiemap_cache cache = { 0 }; 4545 + struct ulist *roots; 4546 + struct ulist *tmp_ulist; 4582 4547 int end = 0; 4583 4548 u64 em_start = 0; 4584 4549 u64 em_len = 0; ··· 4594 4555 return -ENOMEM; 4595 4556 path->leave_spinning = 1; 4596 4557 4558 + roots = ulist_alloc(GFP_KERNEL); 4559 + tmp_ulist = ulist_alloc(GFP_KERNEL); 4560 + if (!roots || !tmp_ulist) { 4561 + ret = -ENOMEM; 4562 
+ goto out_free_ulist; 4563 + } 4564 + 4597 4565 start = round_down(start, btrfs_inode_sectorsize(inode)); 4598 4566 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4599 4567 ··· 4611 4565 ret = btrfs_lookup_file_extent(NULL, root, path, 4612 4566 btrfs_ino(BTRFS_I(inode)), -1, 0); 4613 4567 if (ret < 0) { 4614 - btrfs_free_path(path); 4615 - return ret; 4568 + goto out_free_ulist; 4616 4569 } else { 4617 4570 WARN_ON(!ret); 4618 4571 if (ret == 1) ··· 4720 4675 */ 4721 4676 ret = btrfs_check_shared(root, 4722 4677 btrfs_ino(BTRFS_I(inode)), 4723 - bytenr); 4678 + bytenr, roots, tmp_ulist); 4724 4679 if (ret < 0) 4725 4680 goto out_free; 4726 4681 if (ret) ··· 4763 4718 ret = emit_last_fiemap_cache(fieinfo, &cache); 4764 4719 free_extent_map(em); 4765 4720 out: 4766 - btrfs_free_path(path); 4767 4721 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4768 4722 &cached_state); 4723 + 4724 + out_free_ulist: 4725 + btrfs_free_path(path); 4726 + ulist_free(roots); 4727 + ulist_free(tmp_ulist); 4769 4728 return ret; 4770 4729 } 4771 4730 ··· 4857 4808 eb->bflags = 0; 4858 4809 rwlock_init(&eb->lock); 4859 4810 atomic_set(&eb->blocking_readers, 0); 4860 - atomic_set(&eb->blocking_writers, 0); 4811 + eb->blocking_writers = 0; 4861 4812 eb->lock_nested = false; 4862 4813 init_waitqueue_head(&eb->write_lock_wq); 4863 4814 init_waitqueue_head(&eb->read_lock_wq); ··· 4876 4827 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4877 4828 4878 4829 #ifdef CONFIG_BTRFS_DEBUG 4879 - atomic_set(&eb->spinning_writers, 0); 4830 + eb->spinning_writers = 0; 4880 4831 atomic_set(&eb->spinning_readers, 0); 4881 4832 atomic_set(&eb->read_locks, 0); 4882 - atomic_set(&eb->write_locks, 0); 4833 + eb->write_locks = 0; 4883 4834 #endif 4884 4835 4885 4836 return eb;
+5 -5
fs/btrfs/extent_io.h
··· 167 167 struct rcu_head rcu_head; 168 168 pid_t lock_owner; 169 169 170 - atomic_t blocking_writers; 170 + int blocking_writers; 171 171 atomic_t blocking_readers; 172 172 bool lock_nested; 173 173 /* >= 0 if eb belongs to a log tree, -1 otherwise */ ··· 187 187 wait_queue_head_t read_lock_wq; 188 188 struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; 189 189 #ifdef CONFIG_BTRFS_DEBUG 190 - atomic_t spinning_writers; 190 + int spinning_writers; 191 191 atomic_t spinning_readers; 192 192 atomic_t read_locks; 193 - atomic_t write_locks; 193 + int write_locks; 194 194 struct list_head leak_list; 195 195 #endif 196 196 }; ··· 497 497 u64 delalloc_end, struct page *locked_page, 498 498 unsigned bits_to_clear, 499 499 unsigned long page_ops); 500 - struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); 500 + struct bio *btrfs_bio_alloc(u64 first_byte); 501 501 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); 502 502 struct bio *btrfs_bio_clone(struct bio *bio); 503 503 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); ··· 549 549 struct extent_io_tree *io_tree, 550 550 struct io_failure_record *rec); 551 551 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 552 - bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, 552 + bool find_lock_delalloc_range(struct inode *inode, 553 553 struct page *locked_page, u64 *start, 554 554 u64 *end); 555 555 #endif
+25 -18
fs/btrfs/file-item.c
··· 8 8 #include <linux/pagemap.h> 9 9 #include <linux/highmem.h> 10 10 #include <linux/sched/mm.h> 11 + #include <crypto/hash.h> 11 12 #include "ctree.h" 12 13 #include "disk-io.h" 13 14 #include "transaction.h" ··· 23 22 #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ 24 23 PAGE_SIZE)) 25 24 26 - #define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ 27 - sizeof(struct btrfs_ordered_sum)) / \ 28 - sizeof(u32) * (fs_info)->sectorsize) 25 + static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, 26 + u16 csum_size) 27 + { 28 + u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; 29 + 30 + return ncsums * fs_info->sectorsize; 31 + } 29 32 30 33 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 31 34 struct btrfs_root *root, ··· 149 144 } 150 145 151 146 static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, 152 - u64 logical_offset, u32 *dst, int dio) 147 + u64 logical_offset, u8 *dst, int dio) 153 148 { 154 149 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 155 150 struct bio_vec bvec; ··· 187 182 } 188 183 csum = btrfs_bio->csum; 189 184 } else { 190 - csum = (u8 *)dst; 185 + csum = dst; 191 186 } 192 187 193 188 if (bio->bi_iter.bi_size > PAGE_SIZE * 8) ··· 216 211 if (!dio) 217 212 offset = page_offset(bvec.bv_page) + bvec.bv_offset; 218 213 count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, 219 - (u32 *)csum, nblocks); 214 + csum, nblocks); 220 215 if (count) 221 216 goto found; 222 217 ··· 288 283 return 0; 289 284 } 290 285 291 - blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) 286 + blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, 287 + u8 *dst) 292 288 { 293 289 return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); 294 290 } ··· 380 374 struct btrfs_csum_item); 381 375 while (start < csum_end) { 382 376 size = min_t(size_t, csum_end - start, 383 - MAX_ORDERED_SUM_BYTES(fs_info)); 377 + 
max_ordered_sum_bytes(fs_info, csum_size)); 384 378 sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), 385 379 GFP_NOFS); 386 380 if (!sums) { ··· 433 427 u64 file_start, int contig) 434 428 { 435 429 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 430 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 436 431 struct btrfs_ordered_sum *sums; 437 432 struct btrfs_ordered_extent *ordered = NULL; 438 433 char *data; ··· 446 439 int i; 447 440 u64 offset; 448 441 unsigned nofs_flag; 442 + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 449 443 450 444 nofs_flag = memalloc_nofs_save(); 451 445 sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), ··· 466 458 467 459 sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; 468 460 index = 0; 461 + 462 + shash->tfm = fs_info->csum_shash; 469 463 470 464 bio_for_each_segment(bvec, bio, iter) { 471 465 if (!contig) ··· 508 498 index = 0; 509 499 } 510 500 511 - sums->sums[index] = ~(u32)0; 501 + crypto_shash_init(shash); 512 502 data = kmap_atomic(bvec.bv_page); 513 - sums->sums[index] 514 - = btrfs_csum_data(data + bvec.bv_offset 515 - + (i * fs_info->sectorsize), 516 - sums->sums[index], 517 - fs_info->sectorsize); 503 + crypto_shash_update(shash, data + bvec.bv_offset 504 + + (i * fs_info->sectorsize), 505 + fs_info->sectorsize); 518 506 kunmap_atomic(data); 519 - btrfs_csum_final(sums->sums[index], 520 - (char *)(sums->sums + index)); 521 - index++; 507 + crypto_shash_final(shash, (char *)(sums->sums + index)); 508 + index += csum_size; 522 509 offset += fs_info->sectorsize; 523 510 this_sum_bytes += fs_info->sectorsize; 524 511 total_bytes += fs_info->sectorsize; ··· 911 904 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, 912 905 ins_size); 913 906 907 + index += ins_size; 914 908 ins_size /= csum_size; 915 909 total_bytes += ins_size * fs_info->sectorsize; 916 - index += ins_size; 917 910 918 911 btrfs_mark_buffer_dirty(path->nodes[0]); 919 912 if (total_bytes < 
sums->len) {
+12 -16
fs/btrfs/file.c
··· 26 26 #include "volumes.h" 27 27 #include "qgroup.h" 28 28 #include "compression.h" 29 + #include "delalloc-space.h" 29 30 30 31 static struct kmem_cache *btrfs_inode_defrag_cachep; 31 32 /* ··· 1551 1550 { 1552 1551 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1553 1552 struct btrfs_root *root = inode->root; 1554 - struct btrfs_ordered_extent *ordered; 1555 1553 u64 lockstart, lockend; 1556 1554 u64 num_bytes; 1557 1555 int ret; 1558 1556 1559 1557 ret = btrfs_start_write_no_snapshotting(root); 1560 1558 if (!ret) 1561 - return -ENOSPC; 1559 + return -EAGAIN; 1562 1560 1563 1561 lockstart = round_down(pos, fs_info->sectorsize); 1564 1562 lockend = round_up(pos + *write_bytes, 1565 1563 fs_info->sectorsize) - 1; 1566 1564 1567 - while (1) { 1568 - lock_extent(&inode->io_tree, lockstart, lockend); 1569 - ordered = btrfs_lookup_ordered_range(inode, lockstart, 1570 - lockend - lockstart + 1); 1571 - if (!ordered) { 1572 - break; 1573 - } 1574 - unlock_extent(&inode->io_tree, lockstart, lockend); 1575 - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); 1576 - btrfs_put_ordered_extent(ordered); 1577 - } 1565 + btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, 1566 + lockend, NULL); 1578 1567 1579 1568 num_bytes = lockend - lockstart + 1; 1580 1569 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, ··· 2712 2721 * for detecting, at fsync time, if the inode isn't yet in the 2713 2722 * log tree or it's there but not up to date. 
2714 2723 */ 2724 + struct timespec64 now = current_time(inode); 2725 + 2726 + inode_inc_iversion(inode); 2727 + inode->i_mtime = now; 2728 + inode->i_ctime = now; 2715 2729 trans = btrfs_start_transaction(root, 1); 2716 2730 if (IS_ERR(trans)) { 2717 2731 err = PTR_ERR(trans); ··· 2797 2801 } 2798 2802 2799 2803 enum { 2800 - RANGE_BOUNDARY_WRITTEN_EXTENT = 0, 2801 - RANGE_BOUNDARY_PREALLOC_EXTENT = 1, 2802 - RANGE_BOUNDARY_HOLE = 2, 2804 + RANGE_BOUNDARY_WRITTEN_EXTENT, 2805 + RANGE_BOUNDARY_PREALLOC_EXTENT, 2806 + RANGE_BOUNDARY_HOLE, 2803 2807 }; 2804 2808 2805 2809 static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+8 -8
fs/btrfs/free-space-cache.c
··· 18 18 #include "extent_io.h" 19 19 #include "inode-map.h" 20 20 #include "volumes.h" 21 + #include "space-info.h" 22 + #include "delalloc-space.h" 21 23 22 24 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 23 25 #define MAX_CACHE_BYTES_PER_GIG SZ_32K ··· 467 465 if (index == 0) 468 466 offset = sizeof(u32) * io_ctl->num_pages; 469 467 470 - crc = btrfs_csum_data(io_ctl->orig + offset, crc, 471 - PAGE_SIZE - offset); 472 - btrfs_csum_final(crc, (u8 *)&crc); 468 + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 469 + btrfs_crc32c_final(crc, (u8 *)&crc); 473 470 io_ctl_unmap_page(io_ctl); 474 471 tmp = page_address(io_ctl->pages[0]); 475 472 tmp += index; ··· 494 493 val = *tmp; 495 494 496 495 io_ctl_map_page(io_ctl, 0); 497 - crc = btrfs_csum_data(io_ctl->orig + offset, crc, 498 - PAGE_SIZE - offset); 499 - btrfs_csum_final(crc, (u8 *)&crc); 496 + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 497 + btrfs_crc32c_final(crc, (u8 *)&crc); 500 498 if (val != crc) { 501 499 btrfs_err_rl(io_ctl->fs_info, 502 500 "csum mismatch on free space cache"); ··· 3166 3166 space_info->bytes_readonly += reserved_bytes; 3167 3167 block_group->reserved -= reserved_bytes; 3168 3168 space_info->bytes_reserved -= reserved_bytes; 3169 - spin_unlock(&space_info->lock); 3170 3169 spin_unlock(&block_group->lock); 3170 + spin_unlock(&space_info->lock); 3171 3171 } 3172 3172 3173 3173 return ret; ··· 3358 3358 3359 3359 if (cleanup) { 3360 3360 mutex_lock(&fs_info->chunk_mutex); 3361 - em_tree = &fs_info->mapping_tree.map_tree; 3361 + em_tree = &fs_info->mapping_tree; 3362 3362 write_lock(&em_tree->lock); 3363 3363 em = lookup_extent_mapping(em_tree, block_group->key.objectid, 3364 3364 1);
+1
fs/btrfs/inode-map.c
··· 11 11 #include "free-space-cache.h" 12 12 #include "inode-map.h" 13 13 #include "transaction.h" 14 + #include "delalloc-space.h" 14 15 15 16 static int caching_kthread(void *data) 16 17 {
+64 -45
fs/btrfs/inode.c
··· 47 47 #include "props.h" 48 48 #include "qgroup.h" 49 49 #include "dedupe.h" 50 + #include "delalloc-space.h" 50 51 51 52 struct btrfs_iget_args { 52 53 struct btrfs_key *location; ··· 1933 1932 u64 length = 0; 1934 1933 u64 map_length; 1935 1934 int ret; 1935 + struct btrfs_io_geometry geom; 1936 1936 1937 1937 if (bio_flags & EXTENT_BIO_COMPRESSED) 1938 1938 return 0; 1939 1939 1940 1940 length = bio->bi_iter.bi_size; 1941 1941 map_length = length; 1942 - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 1943 - NULL, 0); 1942 + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, 1943 + &geom); 1944 1944 if (ret < 0) 1945 1945 return ret; 1946 - if (map_length < length + size) 1946 + 1947 + if (geom.len < length + size) 1947 1948 return 1; 1948 1949 return 0; 1949 1950 } ··· 3206 3203 int icsum, struct page *page, 3207 3204 int pgoff, u64 start, size_t len) 3208 3205 { 3206 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3207 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3209 3208 char *kaddr; 3210 - u32 csum_expected; 3211 - u32 csum = ~(u32)0; 3209 + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 3210 + u8 *csum_expected; 3211 + u8 csum[BTRFS_CSUM_SIZE]; 3212 3212 3213 - csum_expected = *(((u32 *)io_bio->csum) + icsum); 3213 + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; 3214 3214 3215 3215 kaddr = kmap_atomic(page); 3216 - csum = btrfs_csum_data(kaddr + pgoff, csum, len); 3217 - btrfs_csum_final(csum, (u8 *)&csum); 3218 - if (csum != csum_expected) 3216 + shash->tfm = fs_info->csum_shash; 3217 + 3218 + crypto_shash_init(shash); 3219 + crypto_shash_update(shash, kaddr + pgoff, len); 3220 + crypto_shash_final(shash, csum); 3221 + 3222 + if (memcmp(csum, csum_expected, csum_size)) 3219 3223 goto zeroit; 3220 3224 3221 3225 kunmap_atomic(kaddr); ··· 3296 3286 wake_up_process(fs_info->cleaner_kthread); 3297 3287 } 3298 3288 3289 + static void run_delayed_iput_locked(struct 
btrfs_fs_info *fs_info, 3290 + struct btrfs_inode *inode) 3291 + { 3292 + list_del_init(&inode->delayed_iput); 3293 + spin_unlock(&fs_info->delayed_iput_lock); 3294 + iput(&inode->vfs_inode); 3295 + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3296 + wake_up(&fs_info->delayed_iputs_wait); 3297 + spin_lock(&fs_info->delayed_iput_lock); 3298 + } 3299 + 3300 + static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3301 + struct btrfs_inode *inode) 3302 + { 3303 + if (!list_empty(&inode->delayed_iput)) { 3304 + spin_lock(&fs_info->delayed_iput_lock); 3305 + if (!list_empty(&inode->delayed_iput)) 3306 + run_delayed_iput_locked(fs_info, inode); 3307 + spin_unlock(&fs_info->delayed_iput_lock); 3308 + } 3309 + } 3310 + 3299 3311 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3300 3312 { 3301 3313 ··· 3327 3295 3328 3296 inode = list_first_entry(&fs_info->delayed_iputs, 3329 3297 struct btrfs_inode, delayed_iput); 3330 - list_del_init(&inode->delayed_iput); 3331 - spin_unlock(&fs_info->delayed_iput_lock); 3332 - iput(&inode->vfs_inode); 3333 - if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3334 - wake_up(&fs_info->delayed_iputs_wait); 3335 - spin_lock(&fs_info->delayed_iput_lock); 3298 + run_delayed_iput_locked(fs_info, inode); 3336 3299 } 3337 3300 spin_unlock(&fs_info->delayed_iput_lock); 3338 3301 } ··· 3962 3935 struct btrfs_fs_info *fs_info = root->fs_info; 3963 3936 struct btrfs_path *path; 3964 3937 int ret = 0; 3965 - struct extent_buffer *leaf; 3966 3938 struct btrfs_dir_item *di; 3967 - struct btrfs_key key; 3968 3939 u64 index; 3969 3940 u64 ino = btrfs_ino(inode); 3970 3941 u64 dir_ino = btrfs_ino(dir); ··· 3980 3955 ret = di ? 
PTR_ERR(di) : -ENOENT; 3981 3956 goto err; 3982 3957 } 3983 - leaf = path->nodes[0]; 3984 - btrfs_dir_item_key_to_cpu(leaf, di, &key); 3985 3958 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3986 3959 if (ret) 3987 3960 goto err; ··· 4032 4009 ret = 0; 4033 4010 else if (ret) 4034 4011 btrfs_abort_transaction(trans, ret); 4012 + 4013 + /* 4014 + * If we have a pending delayed iput we could end up with the final iput 4015 + * being run in btrfs-cleaner context. If we have enough of these built 4016 + * up we can end up burning a lot of time in btrfs-cleaner without any 4017 + * way to throttle the unlinks. Since we're currently holding a ref on 4018 + * the inode we can run the delayed iput here without any issues as the 4019 + * final iput won't be done until after we drop the ref we're currently 4020 + * holding. 4021 + */ 4022 + btrfs_run_delayed_iput(fs_info, inode); 4035 4023 err: 4036 4024 btrfs_free_path(path); 4037 4025 if (ret) ··· 5042 5008 if (size <= hole_start) 5043 5009 return 0; 5044 5010 5045 - while (1) { 5046 - struct btrfs_ordered_extent *ordered; 5047 - 5048 - lock_extent_bits(io_tree, hole_start, block_end - 1, 5049 - &cached_state); 5050 - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, 5051 - block_end - hole_start); 5052 - if (!ordered) 5053 - break; 5054 - unlock_extent_cached(io_tree, hole_start, block_end - 1, 5055 - &cached_state); 5056 - btrfs_start_ordered_extent(inode, ordered, 1); 5057 - btrfs_put_ordered_extent(ordered); 5058 - } 5059 - 5011 + btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, 5012 + block_end - 1, &cached_state); 5060 5013 cur_offset = hole_start; 5061 5014 while (1) { 5062 5015 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, ··· 8339 8318 struct bio *orig_bio = dip->orig_bio; 8340 8319 u64 start_sector = orig_bio->bi_iter.bi_sector; 8341 8320 u64 file_offset = dip->logical_offset; 8342 - u64 map_length; 8343 8321 int async_submit = 0; 8344 8322 u64 
submit_len; 8345 8323 int clone_offset = 0; 8346 8324 int clone_len; 8347 8325 int ret; 8348 8326 blk_status_t status; 8327 + struct btrfs_io_geometry geom; 8349 8328 8350 - map_length = orig_bio->bi_iter.bi_size; 8351 - submit_len = map_length; 8352 - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, 8353 - &map_length, NULL, 0); 8329 + submit_len = orig_bio->bi_iter.bi_size; 8330 + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), 8331 + start_sector << 9, submit_len, &geom); 8354 8332 if (ret) 8355 8333 return -EIO; 8356 8334 8357 - if (map_length >= submit_len) { 8335 + if (geom.len >= submit_len) { 8358 8336 bio = orig_bio; 8359 8337 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 8360 8338 goto submit; ··· 8366 8346 async_submit = 1; 8367 8347 8368 8348 /* bio split */ 8369 - ASSERT(map_length <= INT_MAX); 8349 + ASSERT(geom.len <= INT_MAX); 8370 8350 atomic_inc(&dip->pending_bios); 8371 8351 do { 8372 - clone_len = min_t(int, submit_len, map_length); 8352 + clone_len = min_t(int, submit_len, geom.len); 8373 8353 8374 8354 /* 8375 8355 * This will never fail as it's passing GPF_NOFS and ··· 8406 8386 start_sector += clone_len >> 9; 8407 8387 file_offset += clone_len; 8408 8388 8409 - map_length = submit_len; 8410 - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), 8411 - start_sector << 9, &map_length, NULL, 0); 8389 + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), 8390 + start_sector << 9, submit_len, &geom); 8412 8391 if (ret) 8413 8392 goto out_err; 8414 8393 } while (submit_len > 0);
+23
fs/btrfs/ioctl.c
··· 43 43 #include "qgroup.h" 44 44 #include "tree-log.h" 45 45 #include "compression.h" 46 + #include "space-info.h" 47 + #include "delalloc-space.h" 46 48 47 49 #ifdef CONFIG_64BIT 48 50 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI ··· 3994 3992 inode_dio_wait(inode_in); 3995 3993 if (!same_inode) 3996 3994 inode_dio_wait(inode_out); 3995 + 3996 + /* 3997 + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. 3998 + * 3999 + * Btrfs' back references do not have a block level granularity, they 4000 + * work at the whole extent level. 4001 + * NOCOW buffered write without data space reserved may not be able 4002 + * to fall back to CoW due to lack of data space, thus could cause 4003 + * data loss. 4004 + * 4005 + * Here we take a shortcut by flushing the whole inode, so that all 4006 + * nocow write should reach disk as nocow before we increase the 4007 + * reference of the extent. We could do better by only flushing NOCOW 4008 + * data, but that needs extra accounting. 4009 + * 4010 + * Also we don't need to check ASYNC_EXTENT, as async extent will be 4011 + * CoWed anyway, not affecting nocow part. 4012 + */ 4013 + ret = filemap_flush(inode_in->i_mapping); 4014 + if (ret < 0) 4015 + return ret; 3997 4016 3998 4017 ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), 3999 4018 wb_len);
+28 -34
fs/btrfs/locking.c
··· 15 15 #ifdef CONFIG_BTRFS_DEBUG 16 16 static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) 17 17 { 18 - WARN_ON(atomic_read(&eb->spinning_writers)); 19 - atomic_inc(&eb->spinning_writers); 18 + WARN_ON(eb->spinning_writers); 19 + eb->spinning_writers++; 20 20 } 21 21 22 22 static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) 23 23 { 24 - WARN_ON(atomic_read(&eb->spinning_writers) != 1); 25 - atomic_dec(&eb->spinning_writers); 24 + WARN_ON(eb->spinning_writers != 1); 25 + eb->spinning_writers--; 26 26 } 27 27 28 28 static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) 29 29 { 30 - WARN_ON(atomic_read(&eb->spinning_writers)); 30 + WARN_ON(eb->spinning_writers); 31 31 } 32 32 33 33 static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) ··· 58 58 59 59 static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) 60 60 { 61 - atomic_inc(&eb->write_locks); 61 + eb->write_locks++; 62 62 } 63 63 64 64 static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) 65 65 { 66 - atomic_dec(&eb->write_locks); 66 + eb->write_locks--; 67 67 } 68 68 69 69 void btrfs_assert_tree_locked(struct extent_buffer *eb) 70 70 { 71 - BUG_ON(!atomic_read(&eb->write_locks)); 71 + BUG_ON(!eb->write_locks); 72 72 } 73 73 74 74 #else ··· 111 111 */ 112 112 if (eb->lock_nested && current->pid == eb->lock_owner) 113 113 return; 114 - if (atomic_read(&eb->blocking_writers) == 0) { 114 + if (eb->blocking_writers == 0) { 115 115 btrfs_assert_spinning_writers_put(eb); 116 116 btrfs_assert_tree_locked(eb); 117 - atomic_inc(&eb->blocking_writers); 117 + eb->blocking_writers++; 118 118 write_unlock(&eb->lock); 119 119 } 120 120 } ··· 148 148 */ 149 149 if (eb->lock_nested && current->pid == eb->lock_owner) 150 150 return; 151 - BUG_ON(atomic_read(&eb->blocking_writers) != 1); 152 151 write_lock(&eb->lock); 152 + BUG_ON(eb->blocking_writers != 1); 153 153 btrfs_assert_spinning_writers_get(eb); 154 - /* 
atomic_dec_and_test implies a barrier */ 155 - if (atomic_dec_and_test(&eb->blocking_writers)) 156 - cond_wake_up_nomb(&eb->write_lock_wq); 154 + if (--eb->blocking_writers == 0) 155 + cond_wake_up(&eb->write_lock_wq); 157 156 } 158 157 159 158 /* ··· 166 167 if (trace_btrfs_tree_read_lock_enabled()) 167 168 start_ns = ktime_get_ns(); 168 169 again: 169 - BUG_ON(!atomic_read(&eb->blocking_writers) && 170 - current->pid == eb->lock_owner); 171 - 172 170 read_lock(&eb->lock); 173 - if (atomic_read(&eb->blocking_writers) && 174 - current->pid == eb->lock_owner) { 171 + BUG_ON(eb->blocking_writers == 0 && 172 + current->pid == eb->lock_owner); 173 + if (eb->blocking_writers && current->pid == eb->lock_owner) { 175 174 /* 176 175 * This extent is already write-locked by our thread. We allow 177 176 * an additional read lock to be added because it's for the same ··· 182 185 trace_btrfs_tree_read_lock(eb, start_ns); 183 186 return; 184 187 } 185 - if (atomic_read(&eb->blocking_writers)) { 188 + if (eb->blocking_writers) { 186 189 read_unlock(&eb->lock); 187 190 wait_event(eb->write_lock_wq, 188 - atomic_read(&eb->blocking_writers) == 0); 191 + eb->blocking_writers == 0); 189 192 goto again; 190 193 } 191 194 btrfs_assert_tree_read_locks_get(eb); ··· 200 203 */ 201 204 int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) 202 205 { 203 - if (atomic_read(&eb->blocking_writers)) 206 + if (eb->blocking_writers) 204 207 return 0; 205 208 206 209 read_lock(&eb->lock); 207 - if (atomic_read(&eb->blocking_writers)) { 210 + if (eb->blocking_writers) { 208 211 read_unlock(&eb->lock); 209 212 return 0; 210 213 } ··· 220 223 */ 221 224 int btrfs_try_tree_read_lock(struct extent_buffer *eb) 222 225 { 223 - if (atomic_read(&eb->blocking_writers)) 226 + if (eb->blocking_writers) 224 227 return 0; 225 228 226 229 if (!read_trylock(&eb->lock)) 227 230 return 0; 228 231 229 - if (atomic_read(&eb->blocking_writers)) { 232 + if (eb->blocking_writers) { 230 233 read_unlock(&eb->lock); 231 
234 return 0; 232 235 } ··· 242 245 */ 243 246 int btrfs_try_tree_write_lock(struct extent_buffer *eb) 244 247 { 245 - if (atomic_read(&eb->blocking_writers) || 246 - atomic_read(&eb->blocking_readers)) 248 + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) 247 249 return 0; 248 250 249 251 write_lock(&eb->lock); 250 - if (atomic_read(&eb->blocking_writers) || 251 - atomic_read(&eb->blocking_readers)) { 252 + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { 252 253 write_unlock(&eb->lock); 253 254 return 0; 254 255 } ··· 317 322 WARN_ON(eb->lock_owner == current->pid); 318 323 again: 319 324 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); 320 - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 325 + wait_event(eb->write_lock_wq, eb->blocking_writers == 0); 321 326 write_lock(&eb->lock); 322 - if (atomic_read(&eb->blocking_readers) || 323 - atomic_read(&eb->blocking_writers)) { 327 + if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { 324 328 write_unlock(&eb->lock); 325 329 goto again; 326 330 } ··· 334 340 */ 335 341 void btrfs_tree_unlock(struct extent_buffer *eb) 336 342 { 337 - int blockers = atomic_read(&eb->blocking_writers); 343 + int blockers = eb->blocking_writers; 338 344 339 345 BUG_ON(blockers > 1); 340 346 ··· 345 351 346 352 if (blockers) { 347 353 btrfs_assert_no_spinning_writers(eb); 348 - atomic_dec(&eb->blocking_writers); 354 + eb->blocking_writers--; 349 355 /* Use the lighter barrier after atomic */ 350 356 smp_mb__after_atomic(); 351 357 cond_wake_up_nomb(&eb->write_lock_wq);
+52 -4
fs/btrfs/ordered-data.c
··· 13 13 #include "extent_io.h" 14 14 #include "disk-io.h" 15 15 #include "compression.h" 16 + #include "delalloc-space.h" 16 17 17 18 static struct kmem_cache *btrfs_ordered_extent_cache; 18 19 ··· 925 924 * be reclaimed before their checksum is actually put into the btree 926 925 */ 927 926 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 928 - u32 *sum, int len) 927 + u8 *sum, int len) 929 928 { 929 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 930 930 struct btrfs_ordered_sum *ordered_sum; 931 931 struct btrfs_ordered_extent *ordered; 932 932 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 933 933 unsigned long num_sectors; 934 934 unsigned long i; 935 935 u32 sectorsize = btrfs_inode_sectorsize(inode); 936 + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 936 937 int index = 0; 937 938 938 939 ordered = btrfs_lookup_ordered_extent(inode, offset); ··· 950 947 num_sectors = ordered_sum->len >> 951 948 inode->i_sb->s_blocksize_bits; 952 949 num_sectors = min_t(int, len - index, num_sectors - i); 953 - memcpy(sum + index, ordered_sum->sums + i, 954 - num_sectors); 950 + memcpy(sum + index, ordered_sum->sums + i * csum_size, 951 + num_sectors * csum_size); 955 952 956 - index += (int)num_sectors; 953 + index += (int)num_sectors * csum_size; 957 954 if (index == len) 958 955 goto out; 959 956 disk_bytenr += num_sectors * sectorsize; ··· 963 960 spin_unlock_irq(&tree->lock); 964 961 btrfs_put_ordered_extent(ordered); 965 962 return index; 963 + } 964 + 965 + /* 966 + * btrfs_flush_ordered_range - Lock the passed range and ensures all pending 967 + * ordered extents in it are run to completion. 
968 + * 969 + * @tree: IO tree used for locking out other users of the range 970 + * @inode: Inode whose ordered tree is to be searched 971 + * @start: Beginning of range to flush 972 + * @end: Last byte of range to lock 973 + * @cached_state: If passed, will return the extent state responsible for the 974 + * locked range. It's the caller's responsibility to free the cached state. 975 + * 976 + * This function always returns with the given range locked, ensuring after it's 977 + * called no order extent can be pending. 978 + */ 979 + void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, 980 + struct btrfs_inode *inode, u64 start, 981 + u64 end, 982 + struct extent_state **cached_state) 983 + { 984 + struct btrfs_ordered_extent *ordered; 985 + struct extent_state *cachedp = NULL; 986 + 987 + if (cached_state) 988 + cachedp = *cached_state; 989 + 990 + while (1) { 991 + lock_extent_bits(tree, start, end, &cachedp); 992 + ordered = btrfs_lookup_ordered_range(inode, start, 993 + end - start + 1); 994 + if (!ordered) { 995 + /* 996 + * If no external cached_state has been passed then 997 + * decrement the extra ref taken for cachedp since we 998 + * aren't exposing it outside of this function 999 + */ 1000 + if (!cached_state) 1001 + refcount_dec(&cachedp->refs); 1002 + break; 1003 + } 1004 + unlock_extent_cached(tree, start, end, &cachedp); 1005 + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); 1006 + btrfs_put_ordered_extent(ordered); 1007 + } 966 1008 } 967 1009 968 1010 int __init ordered_data_init(void)
+6 -2
fs/btrfs/ordered-data.h
··· 23 23 int len; 24 24 struct list_head list; 25 25 /* last field is a variable length array of csums */ 26 - u32 sums[]; 26 + u8 sums[]; 27 27 }; 28 28 29 29 /* ··· 183 183 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 184 184 struct btrfs_ordered_extent *ordered); 185 185 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 186 - u32 *sum, int len); 186 + u8 *sum, int len); 187 187 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, 188 188 const u64 range_start, const u64 range_len); 189 189 u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, 190 190 const u64 range_start, const u64 range_len); 191 + void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, 192 + struct btrfs_inode *inode, u64 start, 193 + u64 end, 194 + struct extent_state **cached_state); 191 195 int __init ordered_data_init(void); 192 196 void __cold ordered_data_exit(void); 193 197
+3 -3
fs/btrfs/print-tree.c
··· 153 153 #ifdef CONFIG_BTRFS_DEBUG 154 154 btrfs_info(eb->fs_info, 155 155 "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", 156 - atomic_read(&eb->refs), atomic_read(&eb->write_locks), 156 + atomic_read(&eb->refs), eb->write_locks, 157 157 atomic_read(&eb->read_locks), 158 - atomic_read(&eb->blocking_writers), 158 + eb->blocking_writers, 159 159 atomic_read(&eb->blocking_readers), 160 - atomic_read(&eb->spinning_writers), 160 + eb->spinning_writers, 161 161 atomic_read(&eb->spinning_readers), 162 162 eb->lock_owner, current->pid); 163 163 #endif
+2 -6
fs/btrfs/props.c
··· 257 257 if (!value) 258 258 return 0; 259 259 260 - if (!strncmp("lzo", value, 3)) 261 - return 0; 262 - else if (!strncmp("zlib", value, 4)) 263 - return 0; 264 - else if (!strncmp("zstd", value, 4)) 260 + if (btrfs_compress_is_valid_type(value, len)) 265 261 return 0; 266 262 267 263 return -EINVAL; ··· 337 341 for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { 338 342 const struct prop_handler *h = &prop_handlers[i]; 339 343 const char *value; 340 - u64 num_bytes; 344 + u64 num_bytes = 0; 341 345 342 346 if (!h->inheritable) 343 347 continue;
+22 -2
fs/btrfs/qgroup.c
··· 2614 2614 int ret = 0; 2615 2615 int i; 2616 2616 u64 *i_qgroups; 2617 + bool committing = false; 2617 2618 struct btrfs_fs_info *fs_info = trans->fs_info; 2618 2619 struct btrfs_root *quota_root; 2619 2620 struct btrfs_qgroup *srcgroup; ··· 2622 2621 u32 level_size = 0; 2623 2622 u64 nums; 2624 2623 2625 - mutex_lock(&fs_info->qgroup_ioctl_lock); 2624 + /* 2625 + * There are only two callers of this function. 2626 + * 2627 + * One in create_subvol() in the ioctl context, which needs to hold 2628 + * the qgroup_ioctl_lock. 2629 + * 2630 + * The other one in create_pending_snapshot() where no other qgroup 2631 + * code can modify the fs as they all need to either start a new trans 2632 + * or hold a trans handler, thus we don't need to hold 2633 + * qgroup_ioctl_lock. 2634 + * This would avoid long and complex lock chain and make lockdep happy. 2635 + */ 2636 + spin_lock(&fs_info->trans_lock); 2637 + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2638 + committing = true; 2639 + spin_unlock(&fs_info->trans_lock); 2640 + 2641 + if (!committing) 2642 + mutex_lock(&fs_info->qgroup_ioctl_lock); 2626 2643 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2627 2644 goto out; 2628 2645 ··· 2804 2785 unlock: 2805 2786 spin_unlock(&fs_info->qgroup_lock); 2806 2787 out: 2807 - mutex_unlock(&fs_info->qgroup_ioctl_lock); 2788 + if (!committing) 2789 + mutex_unlock(&fs_info->qgroup_ioctl_lock); 2808 2790 return ret; 2809 2791 } 2810 2792
+2 -2
fs/btrfs/raid56.h
··· 7 7 #ifndef BTRFS_RAID56_H 8 8 #define BTRFS_RAID56_H 9 9 10 - static inline int nr_parity_stripes(struct map_lookup *map) 10 + static inline int nr_parity_stripes(const struct map_lookup *map) 11 11 { 12 12 if (map->type & BTRFS_BLOCK_GROUP_RAID5) 13 13 return 1; ··· 17 17 return 0; 18 18 } 19 19 20 - static inline int nr_data_stripes(struct map_lookup *map) 20 + static inline int nr_data_stripes(const struct map_lookup *map) 21 21 { 22 22 return map->num_stripes - nr_parity_stripes(map); 23 23 }
+1
fs/btrfs/relocation.c
··· 20 20 #include "inode-map.h" 21 21 #include "qgroup.h" 22 22 #include "print-tree.h" 23 + #include "delalloc-space.h" 23 24 24 25 /* 25 26 * backref_node, mapping_node and tree_block start with this
+56
fs/btrfs/root-tree.c
··· 9 9 #include "transaction.h" 10 10 #include "disk-io.h" 11 11 #include "print-tree.h" 12 + #include "qgroup.h" 13 + #include "space-info.h" 12 14 13 15 /* 14 16 * Read a root item from the tree. In case we detect a root item smaller then ··· 498 496 btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); 499 497 btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); 500 498 spin_unlock(&root->root_item_lock); 499 + } 500 + 501 + /* 502 + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 503 + * root: the root of the parent directory 504 + * rsv: block reservation 505 + * items: the number of items that we need do reservation 506 + * use_global_rsv: allow fallback to the global block reservation 507 + * 508 + * This function is used to reserve the space for snapshot/subvolume 509 + * creation and deletion. Those operations are different with the 510 + * common file/directory operations, they change two fs/file trees 511 + * and root tree, the number of items that the qgroup reserves is 512 + * different with the free space reservation. So we can not use 513 + * the space reservation mechanism in start_transaction(). 
514 + */ 515 + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 516 + struct btrfs_block_rsv *rsv, int items, 517 + bool use_global_rsv) 518 + { 519 + u64 qgroup_num_bytes = 0; 520 + u64 num_bytes; 521 + int ret; 522 + struct btrfs_fs_info *fs_info = root->fs_info; 523 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 524 + 525 + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 526 + /* One for parent inode, two for dir entries */ 527 + qgroup_num_bytes = 3 * fs_info->nodesize; 528 + ret = btrfs_qgroup_reserve_meta_prealloc(root, 529 + qgroup_num_bytes, true); 530 + if (ret) 531 + return ret; 532 + } 533 + 534 + num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 535 + rsv->space_info = btrfs_find_space_info(fs_info, 536 + BTRFS_BLOCK_GROUP_METADATA); 537 + ret = btrfs_block_rsv_add(root, rsv, num_bytes, 538 + BTRFS_RESERVE_FLUSH_ALL); 539 + 540 + if (ret == -ENOSPC && use_global_rsv) 541 + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); 542 + 543 + if (ret && qgroup_num_bytes) 544 + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 545 + 546 + return ret; 547 + } 548 + 549 + void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 550 + struct btrfs_block_rsv *rsv) 551 + { 552 + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 501 553 }
+31 -19
fs/btrfs/scrub.c
··· 6 6 #include <linux/blkdev.h> 7 7 #include <linux/ratelimit.h> 8 8 #include <linux/sched/mm.h> 9 + #include <crypto/hash.h> 9 10 #include "ctree.h" 10 11 #include "volumes.h" 11 12 #include "disk-io.h" ··· 1788 1787 static int scrub_checksum_data(struct scrub_block *sblock) 1789 1788 { 1790 1789 struct scrub_ctx *sctx = sblock->sctx; 1790 + struct btrfs_fs_info *fs_info = sctx->fs_info; 1791 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1791 1792 u8 csum[BTRFS_CSUM_SIZE]; 1792 1793 u8 *on_disk_csum; 1793 1794 struct page *page; 1794 1795 void *buffer; 1795 - u32 crc = ~(u32)0; 1796 1796 u64 len; 1797 1797 int index; 1798 1798 1799 1799 BUG_ON(sblock->page_count < 1); 1800 1800 if (!sblock->pagev[0]->have_csum) 1801 1801 return 0; 1802 + 1803 + shash->tfm = fs_info->csum_shash; 1804 + crypto_shash_init(shash); 1802 1805 1803 1806 on_disk_csum = sblock->pagev[0]->csum; 1804 1807 page = sblock->pagev[0]->page; ··· 1813 1808 for (;;) { 1814 1809 u64 l = min_t(u64, len, PAGE_SIZE); 1815 1810 1816 - crc = btrfs_csum_data(buffer, crc, l); 1811 + crypto_shash_update(shash, buffer, l); 1817 1812 kunmap_atomic(buffer); 1818 1813 len -= l; 1819 1814 if (len == 0) ··· 1825 1820 buffer = kmap_atomic(page); 1826 1821 } 1827 1822 1828 - btrfs_csum_final(crc, csum); 1823 + crypto_shash_final(shash, csum); 1829 1824 if (memcmp(csum, on_disk_csum, sctx->csum_size)) 1830 1825 sblock->checksum_error = 1; 1831 1826 ··· 1837 1832 struct scrub_ctx *sctx = sblock->sctx; 1838 1833 struct btrfs_header *h; 1839 1834 struct btrfs_fs_info *fs_info = sctx->fs_info; 1835 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1840 1836 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1841 1837 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1842 1838 struct page *page; 1843 1839 void *mapped_buffer; 1844 1840 u64 mapped_size; 1845 1841 void *p; 1846 - u32 crc = ~(u32)0; 1847 1842 u64 len; 1848 1843 int index; 1844 + 1845 + shash->tfm = fs_info->csum_shash; 1846 + crypto_shash_init(shash); 1849 1847 1850 1848 
BUG_ON(sblock->page_count < 1); 1851 1849 page = sblock->pagev[0]->page; ··· 1883 1875 for (;;) { 1884 1876 u64 l = min_t(u64, len, mapped_size); 1885 1877 1886 - crc = btrfs_csum_data(p, crc, l); 1878 + crypto_shash_update(shash, p, l); 1887 1879 kunmap_atomic(mapped_buffer); 1888 1880 len -= l; 1889 1881 if (len == 0) ··· 1897 1889 p = mapped_buffer; 1898 1890 } 1899 1891 1900 - btrfs_csum_final(crc, calculated_csum); 1892 + crypto_shash_final(shash, calculated_csum); 1901 1893 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1902 1894 sblock->checksum_error = 1; 1903 1895 ··· 1908 1900 { 1909 1901 struct btrfs_super_block *s; 1910 1902 struct scrub_ctx *sctx = sblock->sctx; 1903 + struct btrfs_fs_info *fs_info = sctx->fs_info; 1904 + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1911 1905 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1912 1906 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1913 1907 struct page *page; 1914 1908 void *mapped_buffer; 1915 1909 u64 mapped_size; 1916 1910 void *p; 1917 - u32 crc = ~(u32)0; 1918 1911 int fail_gen = 0; 1919 1912 int fail_cor = 0; 1920 1913 u64 len; 1921 1914 int index; 1915 + 1916 + shash->tfm = fs_info->csum_shash; 1917 + crypto_shash_init(shash); 1922 1918 1923 1919 BUG_ON(sblock->page_count < 1); 1924 1920 page = sblock->pagev[0]->page; ··· 1946 1934 for (;;) { 1947 1935 u64 l = min_t(u64, len, mapped_size); 1948 1936 1949 - crc = btrfs_csum_data(p, crc, l); 1937 + crypto_shash_update(shash, p, l); 1950 1938 kunmap_atomic(mapped_buffer); 1951 1939 len -= l; 1952 1940 if (len == 0) ··· 1960 1948 p = mapped_buffer; 1961 1949 } 1962 1950 1963 - btrfs_csum_final(crc, calculated_csum); 1951 + crypto_shash_final(shash, calculated_csum); 1964 1952 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1965 1953 ++fail_cor; 1966 1954 ··· 2460 2448 ASSERT(index < UINT_MAX); 2461 2449 2462 2450 num_sectors = sum->len / sctx->fs_info->sectorsize; 2463 - memcpy(csum, sum->sums + index, sctx->csum_size); 2451 + memcpy(csum, 
sum->sums + index * sctx->csum_size, sctx->csum_size); 2464 2452 if (index == num_sectors - 1) { 2465 2453 list_del(&sum->list); 2466 2454 kfree(sum); ··· 2672 2660 u64 last_offset; 2673 2661 u32 stripe_index; 2674 2662 u32 rot; 2663 + const int data_stripes = nr_data_stripes(map); 2675 2664 2676 - last_offset = (physical - map->stripes[num].physical) * 2677 - nr_data_stripes(map); 2665 + last_offset = (physical - map->stripes[num].physical) * data_stripes; 2678 2666 if (stripe_start) 2679 2667 *stripe_start = last_offset; 2680 2668 2681 2669 *offset = last_offset; 2682 - for (i = 0; i < nr_data_stripes(map); i++) { 2670 + for (i = 0; i < data_stripes; i++) { 2683 2671 *offset = last_offset + i * map->stripe_len; 2684 2672 2685 2673 stripe_nr = div64_u64(*offset, map->stripe_len); 2686 - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); 2674 + stripe_nr = div_u64(stripe_nr, data_stripes); 2687 2675 2688 2676 /* Work out the disk rotation on this stripe-set */ 2689 2677 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); ··· 3091 3079 offset = map->stripe_len * (num / map->sub_stripes); 3092 3080 increment = map->stripe_len * factor; 3093 3081 mirror_num = num % map->sub_stripes + 1; 3094 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3082 + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 3095 3083 increment = map->stripe_len; 3096 3084 mirror_num = num % map->num_stripes + 1; 3097 3085 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { ··· 3422 3410 struct btrfs_block_group_cache *cache) 3423 3411 { 3424 3412 struct btrfs_fs_info *fs_info = sctx->fs_info; 3425 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3413 + struct extent_map_tree *map_tree = &fs_info->mapping_tree; 3426 3414 struct map_lookup *map; 3427 3415 struct extent_map *em; 3428 3416 int i; 3429 3417 int ret = 0; 3430 3418 3431 - read_lock(&map_tree->map_tree.lock); 3432 - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3433 - 
read_unlock(&map_tree->map_tree.lock); 3419 + read_lock(&map_tree->lock); 3420 + em = lookup_extent_mapping(map_tree, chunk_offset, 1); 3421 + read_unlock(&map_tree->lock); 3434 3422 3435 3423 if (!em) { 3436 3424 /*
+15 -1
fs/btrfs/send.c
··· 686 686 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); 687 687 hdr->crc = 0; 688 688 689 - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 689 + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 690 690 hdr->crc = cpu_to_le32(crc); 691 691 692 692 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, ··· 6929 6929 if (ret) 6930 6930 goto out; 6931 6931 6932 + mutex_lock(&fs_info->balance_mutex); 6933 + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 6934 + mutex_unlock(&fs_info->balance_mutex); 6935 + btrfs_warn_rl(fs_info, 6936 + "cannot run send because a balance operation is in progress"); 6937 + ret = -EAGAIN; 6938 + goto out; 6939 + } 6940 + fs_info->send_in_progress++; 6941 + mutex_unlock(&fs_info->balance_mutex); 6942 + 6932 6943 current->journal_info = BTRFS_SEND_TRANS_STUB; 6933 6944 ret = send_subvol(sctx); 6934 6945 current->journal_info = NULL; 6946 + mutex_lock(&fs_info->balance_mutex); 6947 + fs_info->send_in_progress--; 6948 + mutex_unlock(&fs_info->balance_mutex); 6935 6949 if (ret < 0) 6936 6950 goto out; 6937 6951
+1094
fs/btrfs/space-info.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "ctree.h" 4 + #include "space-info.h" 5 + #include "sysfs.h" 6 + #include "volumes.h" 7 + #include "free-space-cache.h" 8 + #include "ordered-data.h" 9 + #include "transaction.h" 10 + #include "math.h" 11 + 12 + u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 13 + bool may_use_included) 14 + { 15 + ASSERT(s_info); 16 + return s_info->bytes_used + s_info->bytes_reserved + 17 + s_info->bytes_pinned + s_info->bytes_readonly + 18 + (may_use_included ? s_info->bytes_may_use : 0); 19 + } 20 + 21 + /* 22 + * after adding space to the filesystem, we need to clear the full flags 23 + * on all the space infos. 24 + */ 25 + void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 26 + { 27 + struct list_head *head = &info->space_info; 28 + struct btrfs_space_info *found; 29 + 30 + rcu_read_lock(); 31 + list_for_each_entry_rcu(found, head, list) 32 + found->full = 0; 33 + rcu_read_unlock(); 34 + } 35 + 36 + static const char *alloc_name(u64 flags) 37 + { 38 + switch (flags) { 39 + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 40 + return "mixed"; 41 + case BTRFS_BLOCK_GROUP_METADATA: 42 + return "metadata"; 43 + case BTRFS_BLOCK_GROUP_DATA: 44 + return "data"; 45 + case BTRFS_BLOCK_GROUP_SYSTEM: 46 + return "system"; 47 + default: 48 + WARN_ON(1); 49 + return "invalid-combination"; 50 + }; 51 + } 52 + 53 + static int create_space_info(struct btrfs_fs_info *info, u64 flags) 54 + { 55 + 56 + struct btrfs_space_info *space_info; 57 + int i; 58 + int ret; 59 + 60 + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 61 + if (!space_info) 62 + return -ENOMEM; 63 + 64 + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 65 + GFP_KERNEL); 66 + if (ret) { 67 + kfree(space_info); 68 + return ret; 69 + } 70 + 71 + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 72 + INIT_LIST_HEAD(&space_info->block_groups[i]); 73 + init_rwsem(&space_info->groups_sem); 74 + spin_lock_init(&space_info->lock); 75 + 
space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 76 + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 77 + init_waitqueue_head(&space_info->wait); 78 + INIT_LIST_HEAD(&space_info->ro_bgs); 79 + INIT_LIST_HEAD(&space_info->tickets); 80 + INIT_LIST_HEAD(&space_info->priority_tickets); 81 + 82 + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, 83 + info->space_info_kobj, "%s", 84 + alloc_name(space_info->flags)); 85 + if (ret) { 86 + kobject_put(&space_info->kobj); 87 + return ret; 88 + } 89 + 90 + list_add_rcu(&space_info->list, &info->space_info); 91 + if (flags & BTRFS_BLOCK_GROUP_DATA) 92 + info->data_sinfo = space_info; 93 + 94 + return ret; 95 + } 96 + 97 + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 98 + { 99 + struct btrfs_super_block *disk_super; 100 + u64 features; 101 + u64 flags; 102 + int mixed = 0; 103 + int ret; 104 + 105 + disk_super = fs_info->super_copy; 106 + if (!btrfs_super_root(disk_super)) 107 + return -EINVAL; 108 + 109 + features = btrfs_super_incompat_flags(disk_super); 110 + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 111 + mixed = 1; 112 + 113 + flags = BTRFS_BLOCK_GROUP_SYSTEM; 114 + ret = create_space_info(fs_info, flags); 115 + if (ret) 116 + goto out; 117 + 118 + if (mixed) { 119 + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 120 + ret = create_space_info(fs_info, flags); 121 + } else { 122 + flags = BTRFS_BLOCK_GROUP_METADATA; 123 + ret = create_space_info(fs_info, flags); 124 + if (ret) 125 + goto out; 126 + 127 + flags = BTRFS_BLOCK_GROUP_DATA; 128 + ret = create_space_info(fs_info, flags); 129 + } 130 + out: 131 + return ret; 132 + } 133 + 134 + void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, 135 + u64 total_bytes, u64 bytes_used, 136 + u64 bytes_readonly, 137 + struct btrfs_space_info **space_info) 138 + { 139 + struct btrfs_space_info *found; 140 + int factor; 141 + 142 + factor = btrfs_bg_type_to_factor(flags); 143 + 144 + found = 
btrfs_find_space_info(info, flags); 145 + ASSERT(found); 146 + spin_lock(&found->lock); 147 + found->total_bytes += total_bytes; 148 + found->disk_total += total_bytes * factor; 149 + found->bytes_used += bytes_used; 150 + found->disk_used += bytes_used * factor; 151 + found->bytes_readonly += bytes_readonly; 152 + if (total_bytes > 0) 153 + found->full = 0; 154 + btrfs_space_info_add_new_bytes(info, found, 155 + total_bytes - bytes_used - 156 + bytes_readonly); 157 + spin_unlock(&found->lock); 158 + *space_info = found; 159 + } 160 + 161 + struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 162 + u64 flags) 163 + { 164 + struct list_head *head = &info->space_info; 165 + struct btrfs_space_info *found; 166 + 167 + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 168 + 169 + rcu_read_lock(); 170 + list_for_each_entry_rcu(found, head, list) { 171 + if (found->flags & flags) { 172 + rcu_read_unlock(); 173 + return found; 174 + } 175 + } 176 + rcu_read_unlock(); 177 + return NULL; 178 + } 179 + 180 + static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 181 + { 182 + return (global->size << 1); 183 + } 184 + 185 + static int can_overcommit(struct btrfs_fs_info *fs_info, 186 + struct btrfs_space_info *space_info, u64 bytes, 187 + enum btrfs_reserve_flush_enum flush, 188 + bool system_chunk) 189 + { 190 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 191 + u64 profile; 192 + u64 space_size; 193 + u64 avail; 194 + u64 used; 195 + int factor; 196 + 197 + /* Don't overcommit when in mixed mode. 
*/ 198 + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 199 + return 0; 200 + 201 + if (system_chunk) 202 + profile = btrfs_system_alloc_profile(fs_info); 203 + else 204 + profile = btrfs_metadata_alloc_profile(fs_info); 205 + 206 + used = btrfs_space_info_used(space_info, false); 207 + 208 + /* 209 + * We only want to allow over committing if we have lots of actual space 210 + * free, but if we don't have enough space to handle the global reserve 211 + * space then we could end up having a real enospc problem when trying 212 + * to allocate a chunk or some other such important allocation. 213 + */ 214 + spin_lock(&global_rsv->lock); 215 + space_size = calc_global_rsv_need_space(global_rsv); 216 + spin_unlock(&global_rsv->lock); 217 + if (used + space_size >= space_info->total_bytes) 218 + return 0; 219 + 220 + used += space_info->bytes_may_use; 221 + 222 + avail = atomic64_read(&fs_info->free_chunk_space); 223 + 224 + /* 225 + * If we have dup, raid1 or raid10 then only half of the free 226 + * space is actually usable. For raid56, the space info used 227 + * doesn't include the parity drive, so we don't have to 228 + * change the math 229 + */ 230 + factor = btrfs_bg_type_to_factor(profile); 231 + avail = div_u64(avail, factor); 232 + 233 + /* 234 + * If we aren't flushing all things, let us overcommit up to 235 + * 1/2th of the space. If we can flush, don't let us overcommit 236 + * too much, let it overcommit up to 1/8 of the space. 237 + */ 238 + if (flush == BTRFS_RESERVE_FLUSH_ALL) 239 + avail >>= 3; 240 + else 241 + avail >>= 1; 242 + 243 + if (used + bytes < space_info->total_bytes + avail) 244 + return 1; 245 + return 0; 246 + } 247 + 248 + /* 249 + * This is for space we already have accounted in space_info->bytes_may_use, so 250 + * basically when we're returning space from block_rsv's. 
251 + */ 252 + void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 253 + struct btrfs_space_info *space_info, 254 + u64 num_bytes) 255 + { 256 + struct reserve_ticket *ticket; 257 + struct list_head *head; 258 + u64 used; 259 + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 260 + bool check_overcommit = false; 261 + 262 + spin_lock(&space_info->lock); 263 + head = &space_info->priority_tickets; 264 + 265 + /* 266 + * If we are over our limit then we need to check and see if we can 267 + * overcommit, and if we can't then we just need to free up our space 268 + * and not satisfy any requests. 269 + */ 270 + used = btrfs_space_info_used(space_info, true); 271 + if (used - num_bytes >= space_info->total_bytes) 272 + check_overcommit = true; 273 + again: 274 + while (!list_empty(head) && num_bytes) { 275 + ticket = list_first_entry(head, struct reserve_ticket, 276 + list); 277 + /* 278 + * We use 0 bytes because this space is already reserved, so 279 + * adding the ticket space would be a double count. 280 + */ 281 + if (check_overcommit && 282 + !can_overcommit(fs_info, space_info, 0, flush, false)) 283 + break; 284 + if (num_bytes >= ticket->bytes) { 285 + list_del_init(&ticket->list); 286 + num_bytes -= ticket->bytes; 287 + ticket->bytes = 0; 288 + space_info->tickets_id++; 289 + wake_up(&ticket->wait); 290 + } else { 291 + ticket->bytes -= num_bytes; 292 + num_bytes = 0; 293 + } 294 + } 295 + 296 + if (num_bytes && head == &space_info->priority_tickets) { 297 + head = &space_info->tickets; 298 + flush = BTRFS_RESERVE_FLUSH_ALL; 299 + goto again; 300 + } 301 + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); 302 + trace_btrfs_space_reservation(fs_info, "space_info", 303 + space_info->flags, num_bytes, 0); 304 + spin_unlock(&space_info->lock); 305 + } 306 + 307 + /* 308 + * This is for newly allocated space that isn't accounted in 309 + * space_info->bytes_may_use yet. 
So if we allocate a chunk or unpin an extent 310 + * we use this helper. 311 + */ 312 + void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 313 + struct btrfs_space_info *space_info, 314 + u64 num_bytes) 315 + { 316 + struct reserve_ticket *ticket; 317 + struct list_head *head = &space_info->priority_tickets; 318 + 319 + again: 320 + while (!list_empty(head) && num_bytes) { 321 + ticket = list_first_entry(head, struct reserve_ticket, 322 + list); 323 + if (num_bytes >= ticket->bytes) { 324 + trace_btrfs_space_reservation(fs_info, "space_info", 325 + space_info->flags, 326 + ticket->bytes, 1); 327 + list_del_init(&ticket->list); 328 + num_bytes -= ticket->bytes; 329 + btrfs_space_info_update_bytes_may_use(fs_info, 330 + space_info, 331 + ticket->bytes); 332 + ticket->bytes = 0; 333 + space_info->tickets_id++; 334 + wake_up(&ticket->wait); 335 + } else { 336 + trace_btrfs_space_reservation(fs_info, "space_info", 337 + space_info->flags, 338 + num_bytes, 1); 339 + btrfs_space_info_update_bytes_may_use(fs_info, 340 + space_info, 341 + num_bytes); 342 + ticket->bytes -= num_bytes; 343 + num_bytes = 0; 344 + } 345 + } 346 + 347 + if (num_bytes && head == &space_info->priority_tickets) { 348 + head = &space_info->tickets; 349 + goto again; 350 + } 351 + } 352 + 353 + #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 354 + do { \ 355 + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 356 + spin_lock(&__rsv->lock); \ 357 + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 358 + __rsv->size, __rsv->reserved); \ 359 + spin_unlock(&__rsv->lock); \ 360 + } while (0) 361 + 362 + void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 363 + struct btrfs_space_info *info, u64 bytes, 364 + int dump_block_groups) 365 + { 366 + struct btrfs_block_group_cache *cache; 367 + int index = 0; 368 + 369 + spin_lock(&info->lock); 370 + btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 371 + info->flags, 372 + info->total_bytes - 
btrfs_space_info_used(info, true), 373 + info->full ? "" : "not "); 374 + btrfs_info(fs_info, 375 + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 376 + info->total_bytes, info->bytes_used, info->bytes_pinned, 377 + info->bytes_reserved, info->bytes_may_use, 378 + info->bytes_readonly); 379 + spin_unlock(&info->lock); 380 + 381 + DUMP_BLOCK_RSV(fs_info, global_block_rsv); 382 + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 383 + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 384 + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 385 + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 386 + 387 + if (!dump_block_groups) 388 + return; 389 + 390 + down_read(&info->groups_sem); 391 + again: 392 + list_for_each_entry(cache, &info->block_groups[index], list) { 393 + spin_lock(&cache->lock); 394 + btrfs_info(fs_info, 395 + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 396 + cache->key.objectid, cache->key.offset, 397 + btrfs_block_group_used(&cache->item), cache->pinned, 398 + cache->reserved, cache->ro ? "[readonly]" : ""); 399 + btrfs_dump_free_space(cache, bytes); 400 + spin_unlock(&cache->lock); 401 + } 402 + if (++index < BTRFS_NR_RAID_TYPES) 403 + goto again; 404 + up_read(&info->groups_sem); 405 + } 406 + 407 + static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 408 + unsigned long nr_pages, int nr_items) 409 + { 410 + struct super_block *sb = fs_info->sb; 411 + 412 + if (down_read_trylock(&sb->s_umount)) { 413 + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 414 + up_read(&sb->s_umount); 415 + } else { 416 + /* 417 + * We needn't worry the filesystem going from r/w to r/o though 418 + * we don't acquire ->s_umount mutex, because the filesystem 419 + * should guarantee the delalloc inodes list be empty after 420 + * the filesystem is readonly(all dirty pages are written to 421 + * the disk). 
422 + */ 423 + btrfs_start_delalloc_roots(fs_info, nr_items); 424 + if (!current->journal_info) 425 + btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 426 + } 427 + } 428 + 429 + static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 430 + u64 to_reclaim) 431 + { 432 + u64 bytes; 433 + u64 nr; 434 + 435 + bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 436 + nr = div64_u64(to_reclaim, bytes); 437 + if (!nr) 438 + nr = 1; 439 + return nr; 440 + } 441 + 442 + #define EXTENT_SIZE_PER_ITEM SZ_256K 443 + 444 + /* 445 + * shrink metadata reservation for delalloc 446 + */ 447 + static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 448 + u64 orig, bool wait_ordered) 449 + { 450 + struct btrfs_space_info *space_info; 451 + struct btrfs_trans_handle *trans; 452 + u64 delalloc_bytes; 453 + u64 dio_bytes; 454 + u64 async_pages; 455 + u64 items; 456 + long time_left; 457 + unsigned long nr_pages; 458 + int loops; 459 + 460 + /* Calc the number of the pages we need flush for space reservation */ 461 + items = calc_reclaim_items_nr(fs_info, to_reclaim); 462 + to_reclaim = items * EXTENT_SIZE_PER_ITEM; 463 + 464 + trans = (struct btrfs_trans_handle *)current->journal_info; 465 + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 466 + 467 + delalloc_bytes = percpu_counter_sum_positive( 468 + &fs_info->delalloc_bytes); 469 + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 470 + if (delalloc_bytes == 0 && dio_bytes == 0) { 471 + if (trans) 472 + return; 473 + if (wait_ordered) 474 + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 475 + return; 476 + } 477 + 478 + /* 479 + * If we are doing more ordered than delalloc we need to just wait on 480 + * ordered extents, otherwise we'll waste time trying to flush delalloc 481 + * that likely won't give us the space back we need. 
482 + */ 483 + if (dio_bytes > delalloc_bytes) 484 + wait_ordered = true; 485 + 486 + loops = 0; 487 + while ((delalloc_bytes || dio_bytes) && loops < 3) { 488 + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 489 + 490 + /* 491 + * Triggers inode writeback for up to nr_pages. This will invoke 492 + * ->writepages callback and trigger delalloc filling 493 + * (btrfs_run_delalloc_range()). 494 + */ 495 + btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 496 + 497 + /* 498 + * We need to wait for the compressed pages to start before 499 + * we continue. 500 + */ 501 + async_pages = atomic_read(&fs_info->async_delalloc_pages); 502 + if (!async_pages) 503 + goto skip_async; 504 + 505 + /* 506 + * Calculate how many compressed pages we want to be written 507 + * before we continue. I.e if there are more async pages than we 508 + * require wait_event will wait until nr_pages are written. 509 + */ 510 + if (async_pages <= nr_pages) 511 + async_pages = 0; 512 + else 513 + async_pages -= nr_pages; 514 + 515 + wait_event(fs_info->async_submit_wait, 516 + atomic_read(&fs_info->async_delalloc_pages) <= 517 + (int)async_pages); 518 + skip_async: 519 + spin_lock(&space_info->lock); 520 + if (list_empty(&space_info->tickets) && 521 + list_empty(&space_info->priority_tickets)) { 522 + spin_unlock(&space_info->lock); 523 + break; 524 + } 525 + spin_unlock(&space_info->lock); 526 + 527 + loops++; 528 + if (wait_ordered && !trans) { 529 + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 530 + } else { 531 + time_left = schedule_timeout_killable(1); 532 + if (time_left) 533 + break; 534 + } 535 + delalloc_bytes = percpu_counter_sum_positive( 536 + &fs_info->delalloc_bytes); 537 + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 538 + } 539 + } 540 + 541 + /** 542 + * maybe_commit_transaction - possibly commit the transaction if its ok to 543 + * @root - the root we're allocating for 544 + * @bytes - the number of bytes we want to reserve 545 + * 
@force - force the commit 546 + * 547 + * This will check to make sure that committing the transaction will actually 548 + * get us somewhere and then commit the transaction if it does. Otherwise it 549 + * will return -ENOSPC. 550 + */ 551 + static int may_commit_transaction(struct btrfs_fs_info *fs_info, 552 + struct btrfs_space_info *space_info) 553 + { 554 + struct reserve_ticket *ticket = NULL; 555 + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 556 + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 557 + struct btrfs_trans_handle *trans; 558 + u64 bytes_needed; 559 + u64 reclaim_bytes = 0; 560 + 561 + trans = (struct btrfs_trans_handle *)current->journal_info; 562 + if (trans) 563 + return -EAGAIN; 564 + 565 + spin_lock(&space_info->lock); 566 + if (!list_empty(&space_info->priority_tickets)) 567 + ticket = list_first_entry(&space_info->priority_tickets, 568 + struct reserve_ticket, list); 569 + else if (!list_empty(&space_info->tickets)) 570 + ticket = list_first_entry(&space_info->tickets, 571 + struct reserve_ticket, list); 572 + bytes_needed = (ticket) ? ticket->bytes : 0; 573 + spin_unlock(&space_info->lock); 574 + 575 + if (!bytes_needed) 576 + return 0; 577 + 578 + trans = btrfs_join_transaction(fs_info->extent_root); 579 + if (IS_ERR(trans)) 580 + return PTR_ERR(trans); 581 + 582 + /* 583 + * See if there is enough pinned space to make this reservation, or if 584 + * we have block groups that are going to be freed, allowing us to 585 + * possibly do a chunk allocation the next loop through. 586 + */ 587 + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 588 + __percpu_counter_compare(&space_info->total_bytes_pinned, 589 + bytes_needed, 590 + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 591 + goto commit; 592 + 593 + /* 594 + * See if there is some space in the delayed insertion reservation for 595 + * this reservation. 
596 + */ 597 + if (space_info != delayed_rsv->space_info) 598 + goto enospc; 599 + 600 + spin_lock(&delayed_rsv->lock); 601 + reclaim_bytes += delayed_rsv->reserved; 602 + spin_unlock(&delayed_rsv->lock); 603 + 604 + spin_lock(&delayed_refs_rsv->lock); 605 + reclaim_bytes += delayed_refs_rsv->reserved; 606 + spin_unlock(&delayed_refs_rsv->lock); 607 + if (reclaim_bytes >= bytes_needed) 608 + goto commit; 609 + bytes_needed -= reclaim_bytes; 610 + 611 + if (__percpu_counter_compare(&space_info->total_bytes_pinned, 612 + bytes_needed, 613 + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 614 + goto enospc; 615 + 616 + commit: 617 + return btrfs_commit_transaction(trans); 618 + enospc: 619 + btrfs_end_transaction(trans); 620 + return -ENOSPC; 621 + } 622 + 623 + /* 624 + * Try to flush some data based on policy set by @state. This is only advisory 625 + * and may fail for various reasons. The caller is supposed to examine the 626 + * state of @space_info to detect the outcome. 627 + */ 628 + static void flush_space(struct btrfs_fs_info *fs_info, 629 + struct btrfs_space_info *space_info, u64 num_bytes, 630 + int state) 631 + { 632 + struct btrfs_root *root = fs_info->extent_root; 633 + struct btrfs_trans_handle *trans; 634 + int nr; 635 + int ret = 0; 636 + 637 + switch (state) { 638 + case FLUSH_DELAYED_ITEMS_NR: 639 + case FLUSH_DELAYED_ITEMS: 640 + if (state == FLUSH_DELAYED_ITEMS_NR) 641 + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 642 + else 643 + nr = -1; 644 + 645 + trans = btrfs_join_transaction(root); 646 + if (IS_ERR(trans)) { 647 + ret = PTR_ERR(trans); 648 + break; 649 + } 650 + ret = btrfs_run_delayed_items_nr(trans, nr); 651 + btrfs_end_transaction(trans); 652 + break; 653 + case FLUSH_DELALLOC: 654 + case FLUSH_DELALLOC_WAIT: 655 + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 656 + state == FLUSH_DELALLOC_WAIT); 657 + break; 658 + case FLUSH_DELAYED_REFS_NR: 659 + case FLUSH_DELAYED_REFS: 660 + trans = btrfs_join_transaction(root); 661 + if 
(IS_ERR(trans)) { 662 + ret = PTR_ERR(trans); 663 + break; 664 + } 665 + if (state == FLUSH_DELAYED_REFS_NR) 666 + nr = calc_reclaim_items_nr(fs_info, num_bytes); 667 + else 668 + nr = 0; 669 + btrfs_run_delayed_refs(trans, nr); 670 + btrfs_end_transaction(trans); 671 + break; 672 + case ALLOC_CHUNK: 673 + case ALLOC_CHUNK_FORCE: 674 + trans = btrfs_join_transaction(root); 675 + if (IS_ERR(trans)) { 676 + ret = PTR_ERR(trans); 677 + break; 678 + } 679 + ret = btrfs_chunk_alloc(trans, 680 + btrfs_metadata_alloc_profile(fs_info), 681 + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : 682 + CHUNK_ALLOC_FORCE); 683 + btrfs_end_transaction(trans); 684 + if (ret > 0 || ret == -ENOSPC) 685 + ret = 0; 686 + break; 687 + case COMMIT_TRANS: 688 + /* 689 + * If we have pending delayed iputs then we could free up a 690 + * bunch of pinned space, so make sure we run the iputs before 691 + * we do our pinned bytes check below. 692 + */ 693 + btrfs_run_delayed_iputs(fs_info); 694 + btrfs_wait_on_delayed_iputs(fs_info); 695 + 696 + ret = may_commit_transaction(fs_info, space_info); 697 + break; 698 + default: 699 + ret = -ENOSPC; 700 + break; 701 + } 702 + 703 + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 704 + ret); 705 + return; 706 + } 707 + 708 + static inline u64 709 + btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 710 + struct btrfs_space_info *space_info, 711 + bool system_chunk) 712 + { 713 + struct reserve_ticket *ticket; 714 + u64 used; 715 + u64 expected; 716 + u64 to_reclaim = 0; 717 + 718 + list_for_each_entry(ticket, &space_info->tickets, list) 719 + to_reclaim += ticket->bytes; 720 + list_for_each_entry(ticket, &space_info->priority_tickets, list) 721 + to_reclaim += ticket->bytes; 722 + if (to_reclaim) 723 + return to_reclaim; 724 + 725 + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 726 + if (can_overcommit(fs_info, space_info, to_reclaim, 727 + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 728 + return 0; 729 + 
730 + used = btrfs_space_info_used(space_info, true); 731 + 732 + if (can_overcommit(fs_info, space_info, SZ_1M, 733 + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 734 + expected = div_factor_fine(space_info->total_bytes, 95); 735 + else 736 + expected = div_factor_fine(space_info->total_bytes, 90); 737 + 738 + if (used > expected) 739 + to_reclaim = used - expected; 740 + else 741 + to_reclaim = 0; 742 + to_reclaim = min(to_reclaim, space_info->bytes_may_use + 743 + space_info->bytes_reserved); 744 + return to_reclaim; 745 + } 746 + 747 + static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 748 + struct btrfs_space_info *space_info, 749 + u64 used, bool system_chunk) 750 + { 751 + u64 thresh = div_factor_fine(space_info->total_bytes, 98); 752 + 753 + /* If we're just plain full then async reclaim just slows us down. */ 754 + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 755 + return 0; 756 + 757 + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 758 + system_chunk)) 759 + return 0; 760 + 761 + return (used >= thresh && !btrfs_fs_closing(fs_info) && 762 + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 763 + } 764 + 765 + static bool wake_all_tickets(struct list_head *head) 766 + { 767 + struct reserve_ticket *ticket; 768 + 769 + while (!list_empty(head)) { 770 + ticket = list_first_entry(head, struct reserve_ticket, list); 771 + list_del_init(&ticket->list); 772 + ticket->error = -ENOSPC; 773 + wake_up(&ticket->wait); 774 + if (ticket->bytes != ticket->orig_bytes) 775 + return true; 776 + } 777 + return false; 778 + } 779 + 780 + /* 781 + * This is for normal flushers, we can wait all goddamned day if we want to. We 782 + * will loop and continuously try to flush as long as we are making progress. 783 + * We count progress as clearing off tickets each time we have to loop. 
784 + */ 785 + static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 786 + { 787 + struct btrfs_fs_info *fs_info; 788 + struct btrfs_space_info *space_info; 789 + u64 to_reclaim; 790 + int flush_state; 791 + int commit_cycles = 0; 792 + u64 last_tickets_id; 793 + 794 + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 795 + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 796 + 797 + spin_lock(&space_info->lock); 798 + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 799 + false); 800 + if (!to_reclaim) { 801 + space_info->flush = 0; 802 + spin_unlock(&space_info->lock); 803 + return; 804 + } 805 + last_tickets_id = space_info->tickets_id; 806 + spin_unlock(&space_info->lock); 807 + 808 + flush_state = FLUSH_DELAYED_ITEMS_NR; 809 + do { 810 + flush_space(fs_info, space_info, to_reclaim, flush_state); 811 + spin_lock(&space_info->lock); 812 + if (list_empty(&space_info->tickets)) { 813 + space_info->flush = 0; 814 + spin_unlock(&space_info->lock); 815 + return; 816 + } 817 + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 818 + space_info, 819 + false); 820 + if (last_tickets_id == space_info->tickets_id) { 821 + flush_state++; 822 + } else { 823 + last_tickets_id = space_info->tickets_id; 824 + flush_state = FLUSH_DELAYED_ITEMS_NR; 825 + if (commit_cycles) 826 + commit_cycles--; 827 + } 828 + 829 + /* 830 + * We don't want to force a chunk allocation until we've tried 831 + * pretty hard to reclaim space. Think of the case where we 832 + * freed up a bunch of space and so have a lot of pinned space 833 + * to reclaim. We would rather use that than possibly create a 834 + * underutilized metadata chunk. So if this is our first run 835 + * through the flushing state machine skip ALLOC_CHUNK_FORCE and 836 + * commit the transaction. If nothing has changed the next go 837 + * around then we can force a chunk allocation. 
838 + */ 839 + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 840 + flush_state++; 841 + 842 + if (flush_state > COMMIT_TRANS) { 843 + commit_cycles++; 844 + if (commit_cycles > 2) { 845 + if (wake_all_tickets(&space_info->tickets)) { 846 + flush_state = FLUSH_DELAYED_ITEMS_NR; 847 + commit_cycles--; 848 + } else { 849 + space_info->flush = 0; 850 + } 851 + } else { 852 + flush_state = FLUSH_DELAYED_ITEMS_NR; 853 + } 854 + } 855 + spin_unlock(&space_info->lock); 856 + } while (flush_state <= COMMIT_TRANS); 857 + } 858 + 859 + void btrfs_init_async_reclaim_work(struct work_struct *work) 860 + { 861 + INIT_WORK(work, btrfs_async_reclaim_metadata_space); 862 + } 863 + 864 + static const enum btrfs_flush_state priority_flush_states[] = { 865 + FLUSH_DELAYED_ITEMS_NR, 866 + FLUSH_DELAYED_ITEMS, 867 + ALLOC_CHUNK, 868 + }; 869 + 870 + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 871 + struct btrfs_space_info *space_info, 872 + struct reserve_ticket *ticket) 873 + { 874 + u64 to_reclaim; 875 + int flush_state; 876 + 877 + spin_lock(&space_info->lock); 878 + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 879 + false); 880 + if (!to_reclaim) { 881 + spin_unlock(&space_info->lock); 882 + return; 883 + } 884 + spin_unlock(&space_info->lock); 885 + 886 + flush_state = 0; 887 + do { 888 + flush_space(fs_info, space_info, to_reclaim, 889 + priority_flush_states[flush_state]); 890 + flush_state++; 891 + spin_lock(&space_info->lock); 892 + if (ticket->bytes == 0) { 893 + spin_unlock(&space_info->lock); 894 + return; 895 + } 896 + spin_unlock(&space_info->lock); 897 + } while (flush_state < ARRAY_SIZE(priority_flush_states)); 898 + } 899 + 900 + static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 901 + struct btrfs_space_info *space_info, 902 + struct reserve_ticket *ticket) 903 + 904 + { 905 + DEFINE_WAIT(wait); 906 + u64 reclaim_bytes = 0; 907 + int ret = 0; 908 + 909 + spin_lock(&space_info->lock); 910 + while 
(ticket->bytes > 0 && ticket->error == 0) { 911 + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 912 + if (ret) { 913 + ret = -EINTR; 914 + break; 915 + } 916 + spin_unlock(&space_info->lock); 917 + 918 + schedule(); 919 + 920 + finish_wait(&ticket->wait, &wait); 921 + spin_lock(&space_info->lock); 922 + } 923 + if (!ret) 924 + ret = ticket->error; 925 + if (!list_empty(&ticket->list)) 926 + list_del_init(&ticket->list); 927 + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) 928 + reclaim_bytes = ticket->orig_bytes - ticket->bytes; 929 + spin_unlock(&space_info->lock); 930 + 931 + if (reclaim_bytes) 932 + btrfs_space_info_add_old_bytes(fs_info, space_info, 933 + reclaim_bytes); 934 + return ret; 935 + } 936 + 937 + /** 938 + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 939 + * @root - the root we're allocating for 940 + * @space_info - the space info we want to allocate from 941 + * @orig_bytes - the number of bytes we want 942 + * @flush - whether or not we can flush to make our reservation 943 + * 944 + * This will reserve orig_bytes number of bytes from the space info associated 945 + * with the block_rsv. If there is not enough space it will make an attempt to 946 + * flush out space to make room. It will do this by flushing delalloc if 947 + * possible or committing the transaction. If flush is 0 then no attempts to 948 + * regain reservations will be made and this will fail if there is not enough 949 + * space already. 
950 + */ 951 + static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 952 + struct btrfs_space_info *space_info, 953 + u64 orig_bytes, 954 + enum btrfs_reserve_flush_enum flush, 955 + bool system_chunk) 956 + { 957 + struct reserve_ticket ticket; 958 + u64 used; 959 + u64 reclaim_bytes = 0; 960 + int ret = 0; 961 + 962 + ASSERT(orig_bytes); 963 + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 964 + 965 + spin_lock(&space_info->lock); 966 + ret = -ENOSPC; 967 + used = btrfs_space_info_used(space_info, true); 968 + 969 + /* 970 + * Carry on if we have enough space (short-circuit) OR call 971 + * can_overcommit() to ensure we can overcommit to continue. 972 + */ 973 + if ((used + orig_bytes <= space_info->total_bytes) || 974 + can_overcommit(fs_info, space_info, orig_bytes, flush, 975 + system_chunk)) { 976 + btrfs_space_info_update_bytes_may_use(fs_info, space_info, 977 + orig_bytes); 978 + trace_btrfs_space_reservation(fs_info, "space_info", 979 + space_info->flags, orig_bytes, 1); 980 + ret = 0; 981 + } 982 + 983 + /* 984 + * If we couldn't make a reservation then setup our reservation ticket 985 + * and kick the async worker if it's not already running. 986 + * 987 + * If we are a priority flusher then we just need to add our ticket to 988 + * the list and we will do our own flushing further down. 
989 + */ 990 + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 991 + ticket.orig_bytes = orig_bytes; 992 + ticket.bytes = orig_bytes; 993 + ticket.error = 0; 994 + init_waitqueue_head(&ticket.wait); 995 + if (flush == BTRFS_RESERVE_FLUSH_ALL) { 996 + list_add_tail(&ticket.list, &space_info->tickets); 997 + if (!space_info->flush) { 998 + space_info->flush = 1; 999 + trace_btrfs_trigger_flush(fs_info, 1000 + space_info->flags, 1001 + orig_bytes, flush, 1002 + "enospc"); 1003 + queue_work(system_unbound_wq, 1004 + &fs_info->async_reclaim_work); 1005 + } 1006 + } else { 1007 + list_add_tail(&ticket.list, 1008 + &space_info->priority_tickets); 1009 + } 1010 + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 1011 + used += orig_bytes; 1012 + /* 1013 + * We will do the space reservation dance during log replay, 1014 + * which means we won't have fs_info->fs_root set, so don't do 1015 + * the async reclaim as we will panic. 1016 + */ 1017 + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 1018 + need_do_async_reclaim(fs_info, space_info, 1019 + used, system_chunk) && 1020 + !work_busy(&fs_info->async_reclaim_work)) { 1021 + trace_btrfs_trigger_flush(fs_info, space_info->flags, 1022 + orig_bytes, flush, "preempt"); 1023 + queue_work(system_unbound_wq, 1024 + &fs_info->async_reclaim_work); 1025 + } 1026 + } 1027 + spin_unlock(&space_info->lock); 1028 + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 1029 + return ret; 1030 + 1031 + if (flush == BTRFS_RESERVE_FLUSH_ALL) 1032 + return wait_reserve_ticket(fs_info, space_info, &ticket); 1033 + 1034 + ret = 0; 1035 + priority_reclaim_metadata_space(fs_info, space_info, &ticket); 1036 + spin_lock(&space_info->lock); 1037 + if (ticket.bytes) { 1038 + if (ticket.bytes < orig_bytes) 1039 + reclaim_bytes = orig_bytes - ticket.bytes; 1040 + list_del_init(&ticket.list); 1041 + ret = -ENOSPC; 1042 + } 1043 + spin_unlock(&space_info->lock); 1044 + 1045 + if (reclaim_bytes) 1046 + 
btrfs_space_info_add_old_bytes(fs_info, space_info, 1047 + reclaim_bytes); 1048 + ASSERT(list_empty(&ticket.list)); 1049 + return ret; 1050 + } 1051 + 1052 + /** 1053 + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 1054 + * @root - the root we're allocating for 1055 + * @block_rsv - the block_rsv we're allocating for 1056 + * @orig_bytes - the number of bytes we want 1057 + * @flush - whether or not we can flush to make our reservation 1058 + * 1059 + * This will reserve orig_bytes number of bytes from the space info associated 1060 + * with the block_rsv. If there is not enough space it will make an attempt to 1061 + * flush out space to make room. It will do this by flushing delalloc if 1062 + * possible or committing the transaction. If flush is 0 then no attempts to 1063 + * regain reservations will be made and this will fail if there is not enough 1064 + * space already. 1065 + */ 1066 + int btrfs_reserve_metadata_bytes(struct btrfs_root *root, 1067 + struct btrfs_block_rsv *block_rsv, 1068 + u64 orig_bytes, 1069 + enum btrfs_reserve_flush_enum flush) 1070 + { 1071 + struct btrfs_fs_info *fs_info = root->fs_info; 1072 + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 1073 + int ret; 1074 + bool system_chunk = (root == fs_info->chunk_root); 1075 + 1076 + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 1077 + orig_bytes, flush, system_chunk); 1078 + if (ret == -ENOSPC && 1079 + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 1080 + if (block_rsv != global_rsv && 1081 + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) 1082 + ret = 0; 1083 + } 1084 + if (ret == -ENOSPC) { 1085 + trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1086 + block_rsv->space_info->flags, 1087 + orig_bytes, 1); 1088 + 1089 + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1090 + btrfs_dump_space_info(fs_info, block_rsv->space_info, 1091 + orig_bytes, 0); 1092 + } 1093 + return ret; 1094 + }
+133
fs/btrfs/space-info.h
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H

/*
 * In-memory accounting of space usage for the block groups sharing the same
 * @flags (type/profile) of this filesystem.
 */
struct btrfs_space_info {
	spinlock_t lock;

	u64 total_bytes;	/* total bytes in the space,
				   this doesn't take mirrors into account */
	u64 bytes_used;		/* total bytes used,
				   this doesn't take mirrors into account */
	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
				   transaction finishes */
	u64 bytes_reserved;	/* total bytes the allocator has reserved for
				   current allocations */
	u64 bytes_may_use;	/* number of bytes that may be used for
				   delalloc/allocations */
	u64 bytes_readonly;	/* total bytes that are read only */

	u64 max_extent_size;	/* This will hold the maximum extent size of
				   the space info if we had an ENOSPC in the
				   allocator. */

	unsigned int full:1;	/* indicates that we cannot allocate any more
				   chunks for this space */
	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */

	unsigned int flush:1;		/* set if we are trying to make space */

	unsigned int force_alloc;	/* set if we need to force a chunk
					   alloc for this space */

	u64 disk_used;		/* total bytes used on disk */
	u64 disk_total;		/* total bytes on disk, takes mirrors into
				   account */

	u64 flags;

	/*
	 * bytes_pinned is kept in line with what is actually pinned, as in
	 * we've called update_block_group and dropped the bytes_used counter
	 * and increased the bytes_pinned counter.  However this means that
	 * bytes_pinned does not reflect the bytes that will be pinned once the
	 * delayed refs are flushed, so this counter is inc'ed every time we
	 * call btrfs_free_extent so it is a realtime count of what will be
	 * freed once the transaction is committed.  It will be zeroed every
	 * time the transaction commits.
	 */
	struct percpu_counter total_bytes_pinned;

	struct list_head list;
	/* Protected by the spinlock 'lock'. */
	struct list_head ro_bgs;
	struct list_head priority_tickets;
	struct list_head tickets;
	/*
	 * tickets_id just indicates the next ticket will be handled, so note
	 * it's not stored per ticket.
	 */
	u64 tickets_id;

	struct rw_semaphore groups_sem;
	/* for block groups in our same type */
	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
	wait_queue_head_t wait;

	struct kobject kobj;
	struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};

/*
 * A waiter for metadata space.  @bytes counts down as flushing frees space;
 * the waiter is woken via @wait when @bytes reaches 0 or @error is set.
 */
struct reserve_ticket {
	u64 orig_bytes;		/* the size originally requested */
	u64 bytes;		/* bytes still outstanding */
	int error;		/* set (e.g. -ENOSPC) if the wait failed */
	struct list_head list;	/* entry in (priority_)tickets */
	wait_queue_head_t wait;
};

/* True if this space_info holds both data and metadata (mkfs --mixed). */
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
	return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}

/*
 * Declare a helper function to detect underflow of various space info members.
 * On underflow the counter is clamped to 0 and a warning is emitted.
 */
#define DECLARE_SPACE_INFO_UPDATE(name)					\
static inline void							\
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info,		\
			       struct btrfs_space_info *sinfo,		\
			       s64 bytes)				\
{									\
	lockdep_assert_held(&sinfo->lock);				\
	trace_update_##name(fs_info, sinfo, sinfo->name, bytes);	\
	if (bytes < 0 && sinfo->name < -bytes) {			\
		WARN_ON(1);						\
		sinfo->name = 0;					\
		return;							\
	}								\
	sinfo->name += bytes;						\
}

DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
DECLARE_SPACE_INFO_UPDATE(bytes_pinned);

void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes);
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags);
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush);

#endif /* BTRFS_SPACE_INFO_H */
+17 -19
fs/btrfs/super.c
··· 42 42 #include "dev-replace.h" 43 43 #include "free-space-cache.h" 44 44 #include "backref.h" 45 + #include "space-info.h" 45 46 #include "tests/btrfs-tests.h" 46 47 47 48 #include "qgroup.h" ··· 1554 1553 } else { 1555 1554 snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); 1556 1555 btrfs_sb(s)->bdev_holder = fs_type; 1556 + if (!strstr(crc32c_impl(), "generic")) 1557 + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); 1557 1558 error = btrfs_fill_super(s, fs_devices, data); 1558 1559 } 1559 1560 if (!error) ··· 1604 1601 { 1605 1602 struct vfsmount *mnt_root; 1606 1603 struct dentry *root; 1607 - fmode_t mode = FMODE_READ; 1608 1604 char *subvol_name = NULL; 1609 1605 u64 subvol_objectid = 0; 1610 1606 int error = 0; 1611 - 1612 - if (!(flags & SB_RDONLY)) 1613 - mode |= FMODE_WRITE; 1614 1607 1615 1608 error = btrfs_parse_subvol_options(data, &subvol_name, 1616 1609 &subvol_objectid); ··· 1903 1904 u64 type; 1904 1905 u64 avail_space; 1905 1906 u64 min_stripe_size; 1906 - int min_stripes = 1, num_stripes = 1; 1907 + int min_stripes, num_stripes = 1; 1907 1908 int i = 0, nr_devices; 1909 + const struct btrfs_raid_attr *rattr; 1908 1910 1909 1911 /* 1910 1912 * We aren't under the device list lock, so this is racy-ish, but good ··· 1929 1929 1930 1930 /* calc min stripe number for data space allocation */ 1931 1931 type = btrfs_data_alloc_profile(fs_info); 1932 - if (type & BTRFS_BLOCK_GROUP_RAID0) { 1933 - min_stripes = 2; 1934 - num_stripes = nr_devices; 1935 - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { 1936 - min_stripes = 2; 1937 - num_stripes = 2; 1938 - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { 1939 - min_stripes = 4; 1940 - num_stripes = 4; 1941 - } 1932 + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; 1933 + min_stripes = rattr->devs_min; 1942 1934 1943 - if (type & BTRFS_BLOCK_GROUP_DUP) 1944 - min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1945 - else 1946 - min_stripe_size = BTRFS_STRIPE_LEN; 1935 + if (type & BTRFS_BLOCK_GROUP_RAID0) 
1936 + num_stripes = nr_devices; 1937 + else if (type & BTRFS_BLOCK_GROUP_RAID1) 1938 + num_stripes = 2; 1939 + else if (type & BTRFS_BLOCK_GROUP_RAID10) 1940 + num_stripes = 4; 1941 + 1942 + /* Adjust for more than 1 stripe per device */ 1943 + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; 1947 1944 1948 1945 rcu_read_lock(); 1949 1946 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { ··· 2463 2466 module_exit(exit_btrfs_fs) 2464 2467 2465 2468 MODULE_LICENSE("GPL"); 2469 + MODULE_SOFTDEP("pre: crc32c");
+1
fs/btrfs/sysfs.c
··· 16 16 #include "transaction.h" 17 17 #include "sysfs.h" 18 18 #include "volumes.h" 19 + #include "space-info.h" 19 20 20 21 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); 21 22 static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+103 -14
fs/btrfs/tests/extent-io-tests.c
··· 10 10 #include "btrfs-tests.h" 11 11 #include "../ctree.h" 12 12 #include "../extent_io.h" 13 + #include "../btrfs_inode.h" 13 14 14 15 #define PROCESS_UNLOCK (1 << 0) 15 16 #define PROCESS_RELEASE (1 << 1) ··· 59 58 static int test_find_delalloc(u32 sectorsize) 60 59 { 61 60 struct inode *inode; 62 - struct extent_io_tree tmp; 61 + struct extent_io_tree *tmp; 63 62 struct page *page; 64 63 struct page *locked_page = NULL; 65 64 unsigned long index = 0; ··· 77 76 test_std_err(TEST_ALLOC_INODE); 78 77 return -ENOMEM; 79 78 } 79 + tmp = &BTRFS_I(inode)->io_tree; 80 80 81 81 /* 82 82 * Passing NULL as we don't have fs_info but tracepoints are not used 83 83 * at this point 84 84 */ 85 - extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); 85 + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); 86 86 87 87 /* 88 88 * First go through and create and mark all of our pages dirty, we pin ··· 110 108 * |--- delalloc ---| 111 109 * |--- search ---| 112 110 */ 113 - set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); 111 + set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); 114 112 start = 0; 115 113 end = 0; 116 - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 114 + found = find_lock_delalloc_range(inode, locked_page, &start, 117 115 &end); 118 116 if (!found) { 119 117 test_err("should have found at least one delalloc"); ··· 124 122 sectorsize - 1, start, end); 125 123 goto out_bits; 126 124 } 127 - unlock_extent(&tmp, start, end); 125 + unlock_extent(tmp, start, end); 128 126 unlock_page(locked_page); 129 127 put_page(locked_page); 130 128 ··· 141 139 test_err("couldn't find the locked page"); 142 140 goto out_bits; 143 141 } 144 - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); 142 + set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); 145 143 start = test_start; 146 144 end = 0; 147 - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 145 + found = find_lock_delalloc_range(inode, locked_page, 
&start, 148 146 &end); 149 147 if (!found) { 150 148 test_err("couldn't find delalloc in our range"); ··· 160 158 test_err("there were unlocked pages in the range"); 161 159 goto out_bits; 162 160 } 163 - unlock_extent(&tmp, start, end); 161 + unlock_extent(tmp, start, end); 164 162 /* locked_page was unlocked above */ 165 163 put_page(locked_page); 166 164 ··· 178 176 } 179 177 start = test_start; 180 178 end = 0; 181 - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 179 + found = find_lock_delalloc_range(inode, locked_page, &start, 182 180 &end); 183 181 if (found) { 184 182 test_err("found range when we shouldn't have"); ··· 196 194 * 197 195 * We are re-using our test_start from above since it works out well. 198 196 */ 199 - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); 197 + set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); 200 198 start = test_start; 201 199 end = 0; 202 - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 200 + found = find_lock_delalloc_range(inode, locked_page, &start, 203 201 &end); 204 202 if (!found) { 205 203 test_err("didn't find our range"); ··· 215 213 test_err("pages in range were not all locked"); 216 214 goto out_bits; 217 215 } 218 - unlock_extent(&tmp, start, end); 216 + unlock_extent(tmp, start, end); 219 217 220 218 /* 221 219 * Now to test where we run into a page that is no longer dirty in the ··· 240 238 * this changes at any point in the future we will need to fix this 241 239 * tests expected behavior. 
242 240 */ 243 - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 241 + found = find_lock_delalloc_range(inode, locked_page, &start, 244 242 &end); 245 243 if (!found) { 246 244 test_err("didn't find our range"); ··· 258 256 } 259 257 ret = 0; 260 258 out_bits: 261 - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); 259 + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); 262 260 out: 263 261 if (locked_page) 264 262 put_page(locked_page); ··· 434 432 return ret; 435 433 } 436 434 435 + static int test_find_first_clear_extent_bit(void) 436 + { 437 + struct extent_io_tree tree; 438 + u64 start, end; 439 + 440 + test_msg("running find_first_clear_extent_bit test"); 441 + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); 442 + 443 + /* 444 + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between 445 + * 4M-32M 446 + */ 447 + set_extent_bits(&tree, SZ_1M, SZ_4M - 1, 448 + CHUNK_TRIMMED | CHUNK_ALLOCATED); 449 + 450 + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, 451 + CHUNK_TRIMMED | CHUNK_ALLOCATED); 452 + 453 + if (start != 0 || end != SZ_1M -1) 454 + test_err("error finding beginning range: start %llu end %llu", 455 + start, end); 456 + 457 + /* Now add 32M-64M so that we have a hole between 4M-32M */ 458 + set_extent_bits(&tree, SZ_32M, SZ_64M - 1, 459 + CHUNK_TRIMMED | CHUNK_ALLOCATED); 460 + 461 + /* 462 + * Request first hole starting at 12M, we should get 4M-32M 463 + */ 464 + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, 465 + CHUNK_TRIMMED | CHUNK_ALLOCATED); 466 + 467 + if (start != SZ_4M || end != SZ_32M - 1) 468 + test_err("error finding trimmed range: start %llu end %llu", 469 + start, end); 470 + 471 + /* 472 + * Search in the middle of allocated range, should get the next one 473 + * available, which happens to be unallocated -> 4M-32M 474 + */ 475 + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, 476 + CHUNK_TRIMMED | CHUNK_ALLOCATED); 477 + 478 + if (start != SZ_4M 
|| end != SZ_32M -1) 479 + test_err("error finding next unalloc range: start %llu end %llu", 480 + start, end); 481 + 482 + /* 483 + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag 484 + * being unset in this range, we should get the entry in range 64M-72M 485 + */ 486 + set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); 487 + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, 488 + CHUNK_TRIMMED); 489 + 490 + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) 491 + test_err("error finding exact range: start %llu end %llu", 492 + start, end); 493 + 494 + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, 495 + CHUNK_TRIMMED); 496 + 497 + /* 498 + * Search in the middle of set range whose immediate neighbour doesn't 499 + * have the bits set so it must be returned 500 + */ 501 + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) 502 + test_err("error finding next alloc range: start %llu end %llu", 503 + start, end); 504 + 505 + /* 506 + * Search beyond any known range, shall return after last known range 507 + * and end should be -1 508 + */ 509 + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); 510 + if (start != SZ_64M + SZ_8M || end != -1) 511 + test_err( 512 + "error handling beyond end of range search: start %llu end %llu", 513 + start, end); 514 + 515 + return 0; 516 + } 517 + 437 518 int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) 438 519 { 439 520 int ret; ··· 524 439 test_msg("running extent I/O tests"); 525 440 526 441 ret = test_find_delalloc(sectorsize); 442 + if (ret) 443 + goto out; 444 + 445 + ret = test_find_first_clear_extent_bit(); 527 446 if (ret) 528 447 goto out; 529 448
+22
fs/btrfs/tests/extent-map-tests.c
··· 66 66 em->len = SZ_16K; 67 67 em->block_start = 0; 68 68 em->block_len = SZ_16K; 69 + write_lock(&em_tree->lock); 69 70 ret = add_extent_mapping(em_tree, em, 0); 71 + write_unlock(&em_tree->lock); 70 72 if (ret < 0) { 71 73 test_err("cannot add extent range [0, 16K)"); 72 74 goto out; ··· 87 85 em->len = SZ_4K; 88 86 em->block_start = SZ_32K; /* avoid merging */ 89 87 em->block_len = SZ_4K; 88 + write_lock(&em_tree->lock); 90 89 ret = add_extent_mapping(em_tree, em, 0); 90 + write_unlock(&em_tree->lock); 91 91 if (ret < 0) { 92 92 test_err("cannot add extent range [16K, 20K)"); 93 93 goto out; ··· 108 104 em->len = len; 109 105 em->block_start = start; 110 106 em->block_len = len; 107 + write_lock(&em_tree->lock); 111 108 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); 109 + write_unlock(&em_tree->lock); 112 110 if (ret) { 113 111 test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); 114 112 goto out; ··· 154 148 em->len = SZ_1K; 155 149 em->block_start = EXTENT_MAP_INLINE; 156 150 em->block_len = (u64)-1; 151 + write_lock(&em_tree->lock); 157 152 ret = add_extent_mapping(em_tree, em, 0); 153 + write_unlock(&em_tree->lock); 158 154 if (ret < 0) { 159 155 test_err("cannot add extent range [0, 1K)"); 160 156 goto out; ··· 175 167 em->len = SZ_4K; 176 168 em->block_start = SZ_4K; 177 169 em->block_len = SZ_4K; 170 + write_lock(&em_tree->lock); 178 171 ret = add_extent_mapping(em_tree, em, 0); 172 + write_unlock(&em_tree->lock); 179 173 if (ret < 0) { 180 174 test_err("cannot add extent range [4K, 8K)"); 181 175 goto out; ··· 196 186 em->len = SZ_1K; 197 187 em->block_start = EXTENT_MAP_INLINE; 198 188 em->block_len = (u64)-1; 189 + write_lock(&em_tree->lock); 199 190 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); 191 + write_unlock(&em_tree->lock); 200 192 if (ret) { 201 193 test_err("case2 [0 1K]: ret %d", ret); 202 194 goto out; ··· 237 225 em->len = SZ_4K; 238 226 em->block_start = SZ_4K; 239 227 
em->block_len = SZ_4K; 228 + write_lock(&em_tree->lock); 240 229 ret = add_extent_mapping(em_tree, em, 0); 230 + write_unlock(&em_tree->lock); 241 231 if (ret < 0) { 242 232 test_err("cannot add extent range [4K, 8K)"); 243 233 goto out; ··· 258 244 em->len = SZ_16K; 259 245 em->block_start = 0; 260 246 em->block_len = SZ_16K; 247 + write_lock(&em_tree->lock); 261 248 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 249 + write_unlock(&em_tree->lock); 262 250 if (ret) { 263 251 test_err("case3 [0x%llx 0x%llx): ret %d", 264 252 start, start + len, ret); ··· 336 320 em->len = SZ_8K; 337 321 em->block_start = 0; 338 322 em->block_len = SZ_8K; 323 + write_lock(&em_tree->lock); 339 324 ret = add_extent_mapping(em_tree, em, 0); 325 + write_unlock(&em_tree->lock); 340 326 if (ret < 0) { 341 327 test_err("cannot add extent range [0, 8K)"); 342 328 goto out; ··· 357 339 em->len = 24 * SZ_1K; 358 340 em->block_start = SZ_16K; /* avoid merging */ 359 341 em->block_len = 24 * SZ_1K; 342 + write_lock(&em_tree->lock); 360 343 ret = add_extent_mapping(em_tree, em, 0); 344 + write_unlock(&em_tree->lock); 361 345 if (ret < 0) { 362 346 test_err("cannot add extent range [8K, 32K)"); 363 347 goto out; ··· 377 357 em->len = SZ_32K; 378 358 em->block_start = 0; 379 359 em->block_len = SZ_32K; 360 + write_lock(&em_tree->lock); 380 361 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 362 + write_unlock(&em_tree->lock); 381 363 if (ret) { 382 364 test_err("case4 [0x%llx 0x%llx): ret %d", 383 365 start, len, ret);
+18
fs/btrfs/transaction.c
··· 129 129 } 130 130 131 131 /* 132 + * To be called after all the new block groups attached to the transaction 133 + * handle have been created (btrfs_create_pending_block_groups()). 134 + */ 135 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 136 + { 137 + struct btrfs_fs_info *fs_info = trans->fs_info; 138 + 139 + if (!trans->chunk_bytes_reserved) 140 + return; 141 + 142 + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 143 + 144 + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, 145 + trans->chunk_bytes_reserved); 146 + trans->chunk_bytes_reserved = 0; 147 + } 148 + 149 + /* 132 150 * either allocate a new transaction or hop into the existing one 133 151 */ 134 152 static noinline int join_transaction(struct btrfs_fs_info *fs_info,
+1
fs/btrfs/transaction.h
··· 224 224 void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); 225 225 void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, 226 226 struct btrfs_root *root); 227 + void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); 227 228 228 229 #endif
+11
fs/btrfs/tree-checker.c
··· 132 132 struct btrfs_file_extent_item *fi; 133 133 u32 sectorsize = fs_info->sectorsize; 134 134 u32 item_size = btrfs_item_size_nr(leaf, slot); 135 + u64 extent_end; 135 136 136 137 if (!IS_ALIGNED(key->offset, sectorsize)) { 137 138 file_extent_err(leaf, slot, ··· 207 206 CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || 208 207 CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) 209 208 return -EUCLEAN; 209 + 210 + /* Catch extent end overflow */ 211 + if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), 212 + key->offset, &extent_end)) { 213 + file_extent_err(leaf, slot, 214 + "extent end overflow, have file offset %llu extent num bytes %llu", 215 + key->offset, 216 + btrfs_file_extent_num_bytes(leaf, fi)); 217 + return -EUCLEAN; 218 + } 210 219 211 220 /* 212 221 * Check that no two consecutive file extent items, in the same leaf,
+37 -3
fs/btrfs/tree-log.c
··· 3323 3323 } 3324 3324 3325 3325 /* 3326 + * Check if an inode was logged in the current transaction. We can't always rely 3327 + * on an inode's logged_trans value, because it's an in-memory only field and 3328 + * therefore not persisted. This means that its value is lost if the inode gets 3329 + * evicted and loaded again from disk (in which case it has a value of 0, and 3330 + * certainly it is smaller then any possible transaction ID), when that happens 3331 + * the full_sync flag is set in the inode's runtime flags, so on that case we 3332 + * assume eviction happened and ignore the logged_trans value, assuming the 3333 + * worst case, that the inode was logged before in the current transaction. 3334 + */ 3335 + static bool inode_logged(struct btrfs_trans_handle *trans, 3336 + struct btrfs_inode *inode) 3337 + { 3338 + if (inode->logged_trans == trans->transid) 3339 + return true; 3340 + 3341 + if (inode->last_trans == trans->transid && 3342 + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && 3343 + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) 3344 + return true; 3345 + 3346 + return false; 3347 + } 3348 + 3349 + /* 3326 3350 * If both a file and directory are logged, and unlinks or renames are 3327 3351 * mixed in, we have a few interesting corners: 3328 3352 * ··· 3380 3356 int bytes_del = 0; 3381 3357 u64 dir_ino = btrfs_ino(dir); 3382 3358 3383 - if (dir->logged_trans < trans->transid) 3359 + if (!inode_logged(trans, dir)) 3384 3360 return 0; 3385 3361 3386 3362 ret = join_running_log_trans(root); ··· 3484 3460 u64 index; 3485 3461 int ret; 3486 3462 3487 - if (inode->logged_trans < trans->transid) 3463 + if (!inode_logged(trans, inode)) 3488 3464 return 0; 3489 3465 3490 3466 ret = join_running_log_trans(root); ··· 5444 5420 } 5445 5421 } 5446 5422 5423 + /* 5424 + * Don't update last_log_commit if we logged that an inode exists after 5425 + * it was loaded to memory (full_sync bit set). 
5426 + * This is to prevent data loss when we do a write to the inode, then 5427 + * the inode gets evicted after all delalloc was flushed, then we log 5428 + * it exists (due to a rename for example) and then fsync it. This last 5429 + * fsync would do nothing (not logging the extents previously written). 5430 + */ 5447 5431 spin_lock(&inode->lock); 5448 5432 inode->logged_trans = trans->transid; 5449 - inode->last_log_commit = inode->last_sub_trans; 5433 + if (inode_only != LOG_INODE_EXISTS || 5434 + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) 5435 + inode->last_log_commit = inode->last_sub_trans; 5450 5436 spin_unlock(&inode->lock); 5451 5437 out_unlock: 5452 5438 mutex_unlock(&inode->log_mutex);
+207 -169
fs/btrfs/volumes.c
··· 28 28 #include "dev-replace.h" 29 29 #include "sysfs.h" 30 30 #include "tree-checker.h" 31 + #include "space-info.h" 31 32 32 33 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 33 34 [BTRFS_RAID_RAID10] = { ··· 124 123 }, 125 124 }; 126 125 127 - const char *get_raid_name(enum btrfs_raid_types type) 126 + const char *btrfs_bg_type_to_raid_name(u64 flags) 128 127 { 129 - if (type >= BTRFS_NR_RAID_TYPES) 128 + const int index = btrfs_bg_flags_to_raid_index(flags); 129 + 130 + if (index >= BTRFS_NR_RAID_TYPES) 130 131 return NULL; 131 132 132 - return btrfs_raid_array[type].raid_name; 133 + return btrfs_raid_array[index].raid_name; 133 134 } 134 135 135 136 /* ··· 240 237 * chunk_mutex 241 238 * ----------- 242 239 * protects chunks, adding or removing during allocation, trim or when a new 243 - * device is added/removed 240 + * device is added/removed. Additionally it also protects post_commit_list of 241 + * individual devices, since they can be added to the transaction's 242 + * post_commit_list only with chunk_mutex held. 
244 243 * 245 244 * cleaner_mutex 246 245 * ------------- ··· 1823 1818 struct rb_node *n; 1824 1819 u64 ret = 0; 1825 1820 1826 - em_tree = &fs_info->mapping_tree.map_tree; 1821 + em_tree = &fs_info->mapping_tree; 1827 1822 read_lock(&em_tree->lock); 1828 1823 n = rb_last(&em_tree->map.rb_root); 1829 1824 if (n) { ··· 2946 2941 struct extent_map_tree *em_tree; 2947 2942 struct extent_map *em; 2948 2943 2949 - em_tree = &fs_info->mapping_tree.map_tree; 2944 + em_tree = &fs_info->mapping_tree; 2950 2945 read_lock(&em_tree->lock); 2951 2946 em = lookup_extent_mapping(em_tree, logical, length); 2952 2947 read_unlock(&em_tree->lock); ··· 3479 3474 return 1; 3480 3475 } 3481 3476 3477 + static u64 calc_data_stripes(u64 type, int num_stripes) 3478 + { 3479 + const int index = btrfs_bg_flags_to_raid_index(type); 3480 + const int ncopies = btrfs_raid_array[index].ncopies; 3481 + const int nparity = btrfs_raid_array[index].nparity; 3482 + 3483 + if (nparity) 3484 + return num_stripes - nparity; 3485 + else 3486 + return num_stripes / ncopies; 3487 + } 3488 + 3482 3489 /* [pstart, pend) */ 3483 3490 static int chunk_drange_filter(struct extent_buffer *leaf, 3484 3491 struct btrfs_chunk *chunk, ··· 3500 3483 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3501 3484 u64 stripe_offset; 3502 3485 u64 stripe_length; 3486 + u64 type; 3503 3487 int factor; 3504 3488 int i; 3505 3489 3506 3490 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3507 3491 return 0; 3508 3492 3509 - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3510 - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3511 - factor = num_stripes / 2; 3512 - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3513 - factor = num_stripes - 1; 3514 - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3515 - factor = num_stripes - 2; 3516 - } else { 3517 - factor = num_stripes; 3518 - } 3493 + type = btrfs_chunk_type(leaf, chunk); 3494 + factor = 
calc_data_stripes(type, num_stripes); 3519 3495 3520 3496 for (i = 0; i < num_stripes; i++) { 3521 3497 stripe = btrfs_stripe_nr(chunk, i); ··· 3931 3921 bp += ret; \ 3932 3922 } while (0) 3933 3923 3934 - if (flags & BTRFS_BALANCE_ARGS_CONVERT) { 3935 - int index = btrfs_bg_flags_to_raid_index(bargs->target); 3936 - 3937 - CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); 3938 - } 3924 + if (flags & BTRFS_BALANCE_ARGS_CONVERT) 3925 + CHECK_APPEND_1ARG("convert=%s,", 3926 + btrfs_bg_type_to_raid_name(bargs->target)); 3939 3927 3940 3928 if (flags & BTRFS_BALANCE_ARGS_SOFT) 3941 3929 CHECK_APPEND_NOARG("soft,"); ··· 4055 4047 u64 num_devices; 4056 4048 unsigned seq; 4057 4049 bool reducing_integrity; 4050 + int i; 4058 4051 4059 4052 if (btrfs_fs_closing(fs_info) || 4060 4053 atomic_read(&fs_info->balance_pause_req) || ··· 4085 4076 } 4086 4077 4087 4078 num_devices = btrfs_num_devices(fs_info); 4079 + allowed = 0; 4080 + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4081 + if (num_devices >= btrfs_raid_array[i].devs_min) 4082 + allowed |= btrfs_raid_array[i].bg_flag; 4088 4083 4089 - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; 4090 - if (num_devices > 1) 4091 - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 4092 - if (num_devices > 2) 4093 - allowed |= BTRFS_BLOCK_GROUP_RAID5; 4094 - if (num_devices > 3) 4095 - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 4096 - BTRFS_BLOCK_GROUP_RAID6); 4097 4084 if (validate_convert_profile(&bctl->data, allowed)) { 4098 - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); 4099 - 4100 4085 btrfs_err(fs_info, 4101 4086 "balance: invalid convert data profile %s", 4102 - get_raid_name(index)); 4087 + btrfs_bg_type_to_raid_name(bctl->data.target)); 4103 4088 ret = -EINVAL; 4104 4089 goto out; 4105 4090 } 4106 4091 if (validate_convert_profile(&bctl->meta, allowed)) { 4107 - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); 4108 - 4109 4092 btrfs_err(fs_info, 4110 4093 
"balance: invalid convert metadata profile %s", 4111 - get_raid_name(index)); 4094 + btrfs_bg_type_to_raid_name(bctl->meta.target)); 4112 4095 ret = -EINVAL; 4113 4096 goto out; 4114 4097 } 4115 4098 if (validate_convert_profile(&bctl->sys, allowed)) { 4116 - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); 4117 - 4118 4099 btrfs_err(fs_info, 4119 4100 "balance: invalid convert system profile %s", 4120 - get_raid_name(index)); 4101 + btrfs_bg_type_to_raid_name(bctl->sys.target)); 4121 4102 ret = -EINVAL; 4122 4103 goto out; 4123 4104 } 4124 4105 4125 - /* allow to reduce meta or sys integrity only if force set */ 4126 - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 4127 - BTRFS_BLOCK_GROUP_RAID10 | 4128 - BTRFS_BLOCK_GROUP_RAID5 | 4129 - BTRFS_BLOCK_GROUP_RAID6; 4106 + /* 4107 + * Allow to reduce metadata or system integrity only if force set for 4108 + * profiles with redundancy (copies, parity) 4109 + */ 4110 + allowed = 0; 4111 + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4112 + if (btrfs_raid_array[i].ncopies >= 2 || 4113 + btrfs_raid_array[i].tolerated_failures >= 1) 4114 + allowed |= btrfs_raid_array[i].bg_flag; 4115 + } 4130 4116 do { 4131 4117 seq = read_seqbegin(&fs_info->profiles_lock); 4132 4118 ··· 4156 4152 4157 4153 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4158 4154 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4159 - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); 4160 - int data_index = btrfs_bg_flags_to_raid_index(data_target); 4161 - 4162 4155 btrfs_warn(fs_info, 4163 4156 "balance: metadata profile %s has lower redundancy than data profile %s", 4164 - get_raid_name(meta_index), get_raid_name(data_index)); 4157 + btrfs_bg_type_to_raid_name(meta_target), 4158 + btrfs_bg_type_to_raid_name(data_target)); 4159 + } 4160 + 4161 + if (fs_info->send_in_progress) { 4162 + btrfs_warn_rl(fs_info, 4163 + "cannot run balance while send operations are in progress (%d in 
progress)", 4164 + fs_info->send_in_progress); 4165 + ret = -EAGAIN; 4166 + goto out; 4165 4167 } 4166 4168 4167 4169 ret = insert_balance_item(fs_info, bctl); ··· 4959 4949 sub_stripes = btrfs_raid_array[index].sub_stripes; 4960 4950 dev_stripes = btrfs_raid_array[index].dev_stripes; 4961 4951 devs_max = btrfs_raid_array[index].devs_max; 4952 + if (!devs_max) 4953 + devs_max = BTRFS_MAX_DEVS(info); 4962 4954 devs_min = btrfs_raid_array[index].devs_min; 4963 4955 devs_increment = btrfs_raid_array[index].devs_increment; 4964 4956 ncopies = btrfs_raid_array[index].ncopies; ··· 4969 4957 if (type & BTRFS_BLOCK_GROUP_DATA) { 4970 4958 max_stripe_size = SZ_1G; 4971 4959 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4972 - if (!devs_max) 4973 - devs_max = BTRFS_MAX_DEVS(info); 4974 4960 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4975 4961 /* for larger filesystems, use larger metadata chunks */ 4976 4962 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) ··· 4976 4966 else 4977 4967 max_stripe_size = SZ_256M; 4978 4968 max_chunk_size = max_stripe_size; 4979 - if (!devs_max) 4980 - devs_max = BTRFS_MAX_DEVS(info); 4981 4969 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4982 4970 max_stripe_size = SZ_32M; 4983 4971 max_chunk_size = 2 * max_stripe_size; 4984 - if (!devs_max) 4985 - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4986 4972 } else { 4987 4973 btrfs_err(info, "invalid chunk type 0x%llx requested", 4988 4974 type); ··· 5149 5143 em->block_len = em->len; 5150 5144 em->orig_block_len = stripe_size; 5151 5145 5152 - em_tree = &info->mapping_tree.map_tree; 5146 + em_tree = &info->mapping_tree; 5153 5147 write_lock(&em_tree->lock); 5154 5148 ret = add_extent_mapping(em_tree, em, 0); 5155 5149 if (ret) { ··· 5330 5324 5331 5325 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5332 5326 { 5333 - int max_errors; 5327 + const int index = btrfs_bg_flags_to_raid_index(map->type); 5334 5328 5335 - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5336 - 
BTRFS_BLOCK_GROUP_RAID10 | 5337 - BTRFS_BLOCK_GROUP_RAID5 | 5338 - BTRFS_BLOCK_GROUP_DUP)) { 5339 - max_errors = 1; 5340 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5341 - max_errors = 2; 5342 - } else { 5343 - max_errors = 0; 5344 - } 5345 - 5346 - return max_errors; 5329 + return btrfs_raid_array[index].tolerated_failures; 5347 5330 } 5348 5331 5349 5332 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) ··· 5373 5378 return readonly; 5374 5379 } 5375 5380 5376 - void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5377 - { 5378 - extent_map_tree_init(&tree->map_tree); 5379 - } 5380 - 5381 - void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 5381 + void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5382 5382 { 5383 5383 struct extent_map *em; 5384 5384 5385 5385 while (1) { 5386 - write_lock(&tree->map_tree.lock); 5387 - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5386 + write_lock(&tree->lock); 5387 + em = lookup_extent_mapping(tree, 0, (u64)-1); 5388 5388 if (em) 5389 - remove_extent_mapping(&tree->map_tree, em); 5390 - write_unlock(&tree->map_tree.lock); 5389 + remove_extent_mapping(tree, em); 5390 + write_unlock(&tree->lock); 5391 5391 if (!em) 5392 5392 break; 5393 5393 /* once for us */ ··· 5409 5419 return 1; 5410 5420 5411 5421 map = em->map_lookup; 5412 - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5422 + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5413 5423 ret = map->num_stripes; 5414 5424 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5415 5425 ret = map->sub_stripes; ··· 5483 5493 struct btrfs_device *srcdev; 5484 5494 5485 5495 ASSERT((map->type & 5486 - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5496 + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5487 5497 5488 5498 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5489 5499 num_stripes = map->sub_stripes; ··· 5672 5682 &remaining_stripes); 5673 5683 
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5674 5684 last_stripe *= sub_stripes; 5675 - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5685 + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5676 5686 BTRFS_BLOCK_GROUP_DUP)) { 5677 5687 num_stripes = map->num_stripes; 5678 5688 } else { ··· 5916 5926 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5917 5927 } 5918 5928 5929 + /* 5930 + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) 5931 + * tuple. This information is used to calculate how big a 5932 + * particular bio can get before it straddles a stripe. 5933 + * 5934 + * @fs_info - the filesystem 5935 + * @logical - address that we want to figure out the geometry of 5936 + * @len - the length of IO we are going to perform, starting at @logical 5937 + * @op - type of operation - write or read 5938 + * @io_geom - pointer used to return values 5939 + * 5940 + * Returns < 0 in case a chunk for the given logical address cannot be found, 5941 + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
5942 + */ 5943 + int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5944 + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) 5945 + { 5946 + struct extent_map *em; 5947 + struct map_lookup *map; 5948 + u64 offset; 5949 + u64 stripe_offset; 5950 + u64 stripe_nr; 5951 + u64 stripe_len; 5952 + u64 raid56_full_stripe_start = (u64)-1; 5953 + int data_stripes; 5954 + 5955 + ASSERT(op != BTRFS_MAP_DISCARD); 5956 + 5957 + em = btrfs_get_chunk_map(fs_info, logical, len); 5958 + if (IS_ERR(em)) 5959 + return PTR_ERR(em); 5960 + 5961 + map = em->map_lookup; 5962 + /* Offset of this logical address in the chunk */ 5963 + offset = logical - em->start; 5964 + /* Len of a stripe in a chunk */ 5965 + stripe_len = map->stripe_len; 5966 + /* Stripe wher this block falls in */ 5967 + stripe_nr = div64_u64(offset, stripe_len); 5968 + /* Offset of stripe in the chunk */ 5969 + stripe_offset = stripe_nr * stripe_len; 5970 + if (offset < stripe_offset) { 5971 + btrfs_crit(fs_info, 5972 + "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 5973 + stripe_offset, offset, em->start, logical, stripe_len); 5974 + free_extent_map(em); 5975 + return -EINVAL; 5976 + } 5977 + 5978 + /* stripe_offset is the offset of this block in its stripe */ 5979 + stripe_offset = offset - stripe_offset; 5980 + data_stripes = nr_data_stripes(map); 5981 + 5982 + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5983 + u64 max_len = stripe_len - stripe_offset; 5984 + 5985 + /* 5986 + * In case of raid56, we need to know the stripe aligned start 5987 + */ 5988 + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5989 + unsigned long full_stripe_len = stripe_len * data_stripes; 5990 + raid56_full_stripe_start = offset; 5991 + 5992 + /* 5993 + * Allow a write of a full stripe, but make sure we 5994 + * don't allow straddling of stripes 5995 + */ 5996 + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5997 + 
full_stripe_len); 5998 + raid56_full_stripe_start *= full_stripe_len; 5999 + 6000 + /* 6001 + * For writes to RAID[56], allow a full stripeset across 6002 + * all disks. For other RAID types and for RAID[56] 6003 + * reads, just allow a single stripe (on a single disk). 6004 + */ 6005 + if (op == BTRFS_MAP_WRITE) { 6006 + max_len = stripe_len * data_stripes - 6007 + (offset - raid56_full_stripe_start); 6008 + } 6009 + } 6010 + len = min_t(u64, em->len - offset, max_len); 6011 + } else { 6012 + len = em->len - offset; 6013 + } 6014 + 6015 + io_geom->len = len; 6016 + io_geom->offset = offset; 6017 + io_geom->stripe_len = stripe_len; 6018 + io_geom->stripe_nr = stripe_nr; 6019 + io_geom->stripe_offset = stripe_offset; 6020 + io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6021 + 6022 + return 0; 6023 + } 6024 + 5919 6025 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 5920 6026 enum btrfs_map_op op, 5921 6027 u64 logical, u64 *length, ··· 6025 5939 u64 stripe_nr; 6026 5940 u64 stripe_len; 6027 5941 u32 stripe_index; 5942 + int data_stripes; 6028 5943 int i; 6029 5944 int ret = 0; 6030 5945 int num_stripes; ··· 6038 5951 int patch_the_first_stripe_for_dev_replace = 0; 6039 5952 u64 physical_to_patch_in_first_stripe = 0; 6040 5953 u64 raid56_full_stripe_start = (u64)-1; 5954 + struct btrfs_io_geometry geom; 5955 + 5956 + ASSERT(bbio_ret); 6041 5957 6042 5958 if (op == BTRFS_MAP_DISCARD) 6043 5959 return __btrfs_map_block_for_discard(fs_info, logical, 6044 5960 *length, bbio_ret); 6045 5961 5962 + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); 5963 + if (ret < 0) 5964 + return ret; 5965 + 6046 5966 em = btrfs_get_chunk_map(fs_info, logical, *length); 6047 - if (IS_ERR(em)) 6048 - return PTR_ERR(em); 6049 - 5967 + ASSERT(em); 6050 5968 map = em->map_lookup; 6051 - offset = logical - em->start; 6052 5969 6053 - stripe_len = map->stripe_len; 6054 - stripe_nr = offset; 6055 - /* 6056 - * stripe_nr counts the total number of stripes 
we have to stride 6057 - * to get to this block 6058 - */ 6059 - stripe_nr = div64_u64(stripe_nr, stripe_len); 6060 - 6061 - stripe_offset = stripe_nr * stripe_len; 6062 - if (offset < stripe_offset) { 6063 - btrfs_crit(fs_info, 6064 - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", 6065 - stripe_offset, offset, em->start, logical, 6066 - stripe_len); 6067 - free_extent_map(em); 6068 - return -EINVAL; 6069 - } 6070 - 6071 - /* stripe_offset is the offset of this block in its stripe*/ 6072 - stripe_offset = offset - stripe_offset; 6073 - 6074 - /* if we're here for raid56, we need to know the stripe aligned start */ 6075 - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6076 - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 6077 - raid56_full_stripe_start = offset; 6078 - 6079 - /* allow a write of a full stripe, but make sure we don't 6080 - * allow straddling of stripes 6081 - */ 6082 - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6083 - full_stripe_len); 6084 - raid56_full_stripe_start *= full_stripe_len; 6085 - } 6086 - 6087 - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6088 - u64 max_len; 6089 - /* For writes to RAID[56], allow a full stripeset across all disks. 6090 - For other RAID types and for RAID[56] reads, just allow a single 6091 - stripe (on a single disk). 
*/ 6092 - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6093 - (op == BTRFS_MAP_WRITE)) { 6094 - max_len = stripe_len * nr_data_stripes(map) - 6095 - (offset - raid56_full_stripe_start); 6096 - } else { 6097 - /* we limit the length of each bio to what fits in a stripe */ 6098 - max_len = stripe_len - stripe_offset; 6099 - } 6100 - *length = min_t(u64, em->len - offset, max_len); 6101 - } else { 6102 - *length = em->len - offset; 6103 - } 6104 - 6105 - /* 6106 - * This is for when we're called from btrfs_bio_fits_in_stripe and all 6107 - * it cares about is the length 6108 - */ 6109 - if (!bbio_ret) 6110 - goto out; 5970 + *length = geom.len; 5971 + offset = geom.offset; 5972 + stripe_len = geom.stripe_len; 5973 + stripe_nr = geom.stripe_nr; 5974 + stripe_offset = geom.stripe_offset; 5975 + raid56_full_stripe_start = geom.raid56_stripe_offset; 5976 + data_stripes = nr_data_stripes(map); 6111 5977 6112 5978 down_read(&dev_replace->rwsem); 6113 5979 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); ··· 6092 6052 &stripe_index); 6093 6053 if (!need_full_stripe(op)) 6094 6054 mirror_num = 1; 6095 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 6055 + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6096 6056 if (need_full_stripe(op)) 6097 6057 num_stripes = map->num_stripes; 6098 6058 else if (mirror_num) ··· 6134 6094 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6135 6095 /* push stripe_nr back to the start of the full stripe */ 6136 6096 stripe_nr = div64_u64(raid56_full_stripe_start, 6137 - stripe_len * nr_data_stripes(map)); 6097 + stripe_len * data_stripes); 6138 6098 6139 6099 /* RAID[56] write or recovery. Return all stripes */ 6140 6100 num_stripes = map->num_stripes; ··· 6150 6110 * Mirror #3 is RAID6 Q block. 
6151 6111 */ 6152 6112 stripe_nr = div_u64_rem(stripe_nr, 6153 - nr_data_stripes(map), &stripe_index); 6113 + data_stripes, &stripe_index); 6154 6114 if (mirror_num > 1) 6155 - stripe_index = nr_data_stripes(map) + 6156 - mirror_num - 2; 6115 + stripe_index = data_stripes + mirror_num - 2; 6157 6116 6158 6117 /* We distribute the parity blocks across stripes */ 6159 6118 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, ··· 6210 6171 div_u64_rem(stripe_nr, num_stripes, &rot); 6211 6172 6212 6173 /* Fill in the logical address of each stripe */ 6213 - tmp = stripe_nr * nr_data_stripes(map); 6214 - for (i = 0; i < nr_data_stripes(map); i++) 6174 + tmp = stripe_nr * data_stripes; 6175 + for (i = 0; i < data_stripes; i++) 6215 6176 bbio->raid_map[(i+rot) % num_stripes] = 6216 6177 em->start + (tmp + i) * map->stripe_len; 6217 6178 ··· 6726 6687 struct btrfs_chunk *chunk) 6727 6688 { 6728 6689 struct btrfs_fs_info *fs_info = leaf->fs_info; 6729 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6690 + struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6730 6691 struct map_lookup *map; 6731 6692 struct extent_map *em; 6732 6693 u64 logical; ··· 6751 6712 return ret; 6752 6713 } 6753 6714 6754 - read_lock(&map_tree->map_tree.lock); 6755 - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6756 - read_unlock(&map_tree->map_tree.lock); 6715 + read_lock(&map_tree->lock); 6716 + em = lookup_extent_mapping(map_tree, logical, 1); 6717 + read_unlock(&map_tree->lock); 6757 6718 6758 6719 /* already mapped? 
*/ 6759 6720 if (em && em->start <= logical && em->start + em->len > logical) { ··· 6822 6783 6823 6784 } 6824 6785 6825 - write_lock(&map_tree->map_tree.lock); 6826 - ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6827 - write_unlock(&map_tree->map_tree.lock); 6786 + write_lock(&map_tree->lock); 6787 + ret = add_extent_mapping(map_tree, em, 0); 6788 + write_unlock(&map_tree->lock); 6828 6789 if (ret < 0) { 6829 6790 btrfs_err(fs_info, 6830 6791 "failed to add chunk map, start=%llu len=%llu: %d", ··· 7142 7103 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7143 7104 struct btrfs_device *failing_dev) 7144 7105 { 7145 - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 7106 + struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7146 7107 struct extent_map *em; 7147 7108 u64 next_start = 0; 7148 7109 bool ret = true; 7149 7110 7150 - read_lock(&map_tree->map_tree.lock); 7151 - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 7152 - read_unlock(&map_tree->map_tree.lock); 7111 + read_lock(&map_tree->lock); 7112 + em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7113 + read_unlock(&map_tree->lock); 7153 7114 /* No chunk at all? 
Return false anyway */ 7154 7115 if (!em) { 7155 7116 ret = false; ··· 7187 7148 next_start = extent_map_end(em); 7188 7149 free_extent_map(em); 7189 7150 7190 - read_lock(&map_tree->map_tree.lock); 7191 - em = lookup_extent_mapping(&map_tree->map_tree, next_start, 7151 + read_lock(&map_tree->lock); 7152 + em = lookup_extent_mapping(map_tree, next_start, 7192 7153 (u64)(-1) - next_start); 7193 - read_unlock(&map_tree->map_tree.lock); 7154 + read_unlock(&map_tree->lock); 7194 7155 } 7195 7156 out: 7196 7157 return ret; ··· 7639 7600 */ 7640 7601 int btrfs_bg_type_to_factor(u64 flags) 7641 7602 { 7642 - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 7643 - BTRFS_BLOCK_GROUP_RAID10)) 7644 - return 2; 7645 - return 1; 7603 + const int index = btrfs_bg_flags_to_raid_index(flags); 7604 + 7605 + return btrfs_raid_array[index].ncopies; 7646 7606 } 7647 7607 7648 7608 ··· 7650 7612 u64 chunk_offset, u64 devid, 7651 7613 u64 physical_offset, u64 physical_len) 7652 7614 { 7653 - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7615 + struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7654 7616 struct extent_map *em; 7655 7617 struct map_lookup *map; 7656 7618 struct btrfs_device *dev; ··· 7739 7701 7740 7702 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7741 7703 { 7742 - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7704 + struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7743 7705 struct extent_map *em; 7744 7706 struct rb_node *node; 7745 7707 int ret = 0;
+36 -16
fs/btrfs/volumes.h
··· 23 23 struct bio *tail; 24 24 }; 25 25 26 + struct btrfs_io_geometry { 27 + /* remaining bytes before crossing a stripe */ 28 + u64 len; 29 + /* offset of logical address in chunk */ 30 + u64 offset; 31 + /* length of single IO stripe */ 32 + u64 stripe_len; 33 + /* number of stripe where address falls */ 34 + u64 stripe_nr; 35 + /* offset of address in stripe */ 36 + u64 stripe_offset; 37 + /* offset of raid56 stripe into the chunk */ 38 + u64 raid56_stripe_offset; 39 + }; 40 + 26 41 /* 27 42 * Use sequence counter to get consistent device stat data on 28 43 * 32-bit processors. ··· 58 43 #define BTRFS_DEV_STATE_FLUSH_SENT (4) 59 44 60 45 struct btrfs_device { 61 - struct list_head dev_list; 62 - struct list_head dev_alloc_list; 46 + struct list_head dev_list; /* device_list_mutex */ 47 + struct list_head dev_alloc_list; /* chunk mutex */ 63 48 struct list_head post_commit_list; /* chunk mutex */ 64 49 struct btrfs_fs_devices *fs_devices; 65 50 struct btrfs_fs_info *fs_info; ··· 244 229 * this mutex lock. 245 230 */ 246 231 struct mutex device_list_mutex; 232 + 233 + /* List of all devices, protected by device_list_mutex */ 247 234 struct list_head devices; 248 235 249 - /* devices not currently being allocated */ 236 + /* 237 + * Devices which can satisfy space allocation. 
Protected by 238 + * chunk_mutex 239 + */ 250 240 struct list_head alloc_list; 251 241 252 242 struct btrfs_fs_devices *seed; ··· 356 336 }; 357 337 358 338 struct btrfs_raid_attr { 359 - int sub_stripes; /* sub_stripes info for map */ 360 - int dev_stripes; /* stripes per dev */ 361 - int devs_max; /* max devs to use */ 362 - int devs_min; /* min devs needed */ 363 - int tolerated_failures; /* max tolerated fail devs */ 364 - int devs_increment; /* ndevs has to be a multiple of this */ 365 - int ncopies; /* how many copies to data has */ 366 - int nparity; /* number of stripes worth of bytes to store 339 + u8 sub_stripes; /* sub_stripes info for map */ 340 + u8 dev_stripes; /* stripes per dev */ 341 + u8 devs_max; /* max devs to use */ 342 + u8 devs_min; /* min devs needed */ 343 + u8 tolerated_failures; /* max tolerated fail devs */ 344 + u8 devs_increment; /* ndevs has to be a multiple of this */ 345 + u8 ncopies; /* how many copies to data has */ 346 + u8 nparity; /* number of stripes worth of bytes to store 367 347 * parity information */ 368 - int mindev_error; /* error code if min devs requisite is unmet */ 348 + u8 mindev_error; /* error code if min devs requisite is unmet */ 369 349 const char raid_name[8]; /* name of the raid */ 370 350 u64 bg_flag; /* block group flag of the raid */ 371 351 }; ··· 428 408 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 429 409 u64 logical, u64 *length, 430 410 struct btrfs_bio **bbio_ret); 411 + int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 412 + u64 logical, u64 len, struct btrfs_io_geometry *io_geom); 431 413 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 432 414 u64 physical, u64 **logical, int *naddrs, int *stripe_len); 433 415 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); 434 416 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); 435 417 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); 436 - void 
btrfs_mapping_init(struct btrfs_mapping_tree *tree); 437 - void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); 418 + void btrfs_mapping_tree_free(struct extent_map_tree *tree); 438 419 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 439 420 int mirror_num, int async_submit); 440 421 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, ··· 578 557 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 579 558 } 580 559 581 - const char *get_raid_name(enum btrfs_raid_types type); 582 - 583 560 void btrfs_commit_device_sizes(struct btrfs_transaction *trans); 584 561 585 562 struct list_head *btrfs_get_fs_uuids(void); ··· 587 568 struct btrfs_device *failing_dev); 588 569 589 570 int btrfs_bg_type_to_factor(u64 flags); 571 + const char *btrfs_bg_type_to_raid_name(u64 flags); 590 572 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); 591 573 592 574 #endif
+40
include/trace/events/btrfs.h
··· 29 29 struct btrfs_qgroup; 30 30 struct extent_io_tree; 31 31 struct prelim_ref; 32 + struct btrfs_space_info; 32 33 33 34 TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); 34 35 TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); ··· 2091 2090 DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); 2092 2091 DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); 2093 2092 DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); 2093 + 2094 + DECLARE_EVENT_CLASS(btrfs__space_info_update, 2095 + 2096 + TP_PROTO(struct btrfs_fs_info *fs_info, 2097 + struct btrfs_space_info *sinfo, u64 old, s64 diff), 2098 + 2099 + TP_ARGS(fs_info, sinfo, old, diff), 2100 + 2101 + TP_STRUCT__entry_btrfs( 2102 + __field( u64, type ) 2103 + __field( u64, old ) 2104 + __field( s64, diff ) 2105 + ), 2106 + 2107 + TP_fast_assign_btrfs(fs_info, 2108 + __entry->type = sinfo->flags; 2109 + __entry->old = old; 2110 + __entry->diff = diff; 2111 + ), 2112 + TP_printk_btrfs("type=%s old=%llu diff=%lld", 2113 + __print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS), 2114 + __entry->old, __entry->diff) 2115 + ); 2116 + 2117 + DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use, 2118 + 2119 + TP_PROTO(struct btrfs_fs_info *fs_info, 2120 + struct btrfs_space_info *sinfo, u64 old, s64 diff), 2121 + 2122 + TP_ARGS(fs_info, sinfo, old, diff) 2123 + ); 2124 + 2125 + DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, 2126 + 2127 + TP_PROTO(struct btrfs_fs_info *fs_info, 2128 + struct btrfs_space_info *sinfo, u64 old, s64 diff), 2129 + 2130 + TP_ARGS(fs_info, sinfo, old, diff) 2131 + ); 2094 2132 2095 2133 #endif /* _TRACE_BTRFS_H */ 2096 2134
+2
include/uapi/linux/btrfs_tree.h
··· 866 866 #define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ 867 867 BTRFS_BLOCK_GROUP_RAID6) 868 868 869 + #define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1) 870 + 869 871 /* 870 872 * We need a bit for restriper to be able to tell when chunks of type 871 873 * SINGLE are available. This "extended" profile format is used in