Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
"These are bug fixes, including a really old fsync bug, and a few trace
points to help us track down problems in the quota code"

* 'for-linus-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
Btrfs: fix file/data loss caused by fsync after rename and new inode
btrfs: Reset IO error counters before start of device replacing
btrfs: Add qgroup tracing
Btrfs: don't use src fd for printk
btrfs: fallback to vmalloc in btrfs_compare_tree
btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
btrfs: Output more info for enospc_debug mount option
Btrfs: fix invalid reference in replace_path
Btrfs: Improve FL_KEEP_SIZE handling in fallocate

+303 -33
+8 -4
fs/btrfs/ctree.c
··· 19 19 #include <linux/sched.h> 20 20 #include <linux/slab.h> 21 21 #include <linux/rbtree.h> 22 + #include <linux/vmalloc.h> 22 23 #include "ctree.h" 23 24 #include "disk-io.h" 24 25 #include "transaction.h" ··· 5362 5361 goto out; 5363 5362 } 5364 5363 5365 - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); 5364 + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); 5366 5365 if (!tmp_buf) { 5367 - ret = -ENOMEM; 5368 - goto out; 5366 + tmp_buf = vmalloc(left_root->nodesize); 5367 + if (!tmp_buf) { 5368 + ret = -ENOMEM; 5369 + goto out; 5370 + } 5369 5371 } 5370 5372 5371 5373 left_path->search_commit_root = 1; ··· 5569 5565 out: 5570 5566 btrfs_free_path(left_path); 5571 5567 btrfs_free_path(right_path); 5572 - kfree(tmp_buf); 5568 + kvfree(tmp_buf); 5573 5569 return ret; 5574 5570 } 5575 5571
+2
fs/btrfs/dev-replace.c
··· 394 394 dev_replace->cursor_right = 0; 395 395 dev_replace->is_valid = 1; 396 396 dev_replace->item_needs_writeback = 1; 397 + atomic64_set(&dev_replace->num_write_errors, 0); 398 + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 397 399 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 398 400 btrfs_dev_replace_unlock(dev_replace, 1); 399 401
+19 -2
fs/btrfs/extent-tree.c
··· 9386 9386 u64 dev_min = 1; 9387 9387 u64 dev_nr = 0; 9388 9388 u64 target; 9389 + int debug; 9389 9390 int index; 9390 9391 int full = 0; 9391 9392 int ret = 0; 9392 9393 9394 + debug = btrfs_test_opt(root, ENOSPC_DEBUG); 9395 + 9393 9396 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 9394 9397 9395 9398 /* odd, couldn't find the block group, leave it alone */ 9396 - if (!block_group) 9399 + if (!block_group) { 9400 + if (debug) 9401 + btrfs_warn(root->fs_info, 9402 + "can't find block group for bytenr %llu", 9403 + bytenr); 9397 9404 return -1; 9405 + } 9398 9406 9399 9407 min_free = btrfs_block_group_used(&block_group->item); 9400 9408 ··· 9456 9448 * this is just a balance, so if we were marked as full 9457 9449 * we know there is no space for a new chunk 9458 9450 */ 9459 - if (full) 9451 + if (full) { 9452 + if (debug) 9453 + btrfs_warn(root->fs_info, 9454 + "no space to alloc new chunk for block group %llu", 9455 + block_group->key.objectid); 9460 9456 goto out; 9457 + } 9461 9458 9462 9459 index = get_block_group_index(block_group); 9463 9460 } ··· 9509 9496 ret = -1; 9510 9497 } 9511 9498 } 9499 + if (debug && ret == -1) 9500 + btrfs_warn(root->fs_info, 9501 + "no space to allocate a new chunk for block group %llu", 9502 + block_group->key.objectid); 9512 9503 mutex_unlock(&root->fs_info->chunk_mutex); 9513 9504 btrfs_end_transaction(trans, root); 9514 9505 out:
+6 -3
fs/btrfs/file.c
··· 2682 2682 return ret; 2683 2683 2684 2684 inode_lock(inode); 2685 - ret = inode_newsize_ok(inode, alloc_end); 2686 - if (ret) 2687 - goto out; 2685 + 2686 + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { 2687 + ret = inode_newsize_ok(inode, offset + len); 2688 + if (ret) 2689 + goto out; 2690 + } 2688 2691 2689 2692 /* 2690 2693 * TODO: Move these two operations after we have checked
+1 -1
fs/btrfs/ioctl.c
··· 1654 1654 1655 1655 src_inode = file_inode(src.file); 1656 1656 if (src_inode->i_sb != file_inode(file)->i_sb) { 1657 - btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1657 + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, 1658 1658 "Snapshot src from another FS"); 1659 1659 ret = -EXDEV; 1660 1660 } else if (!inode_owner_or_capable(src_inode)) {
+41 -22
fs/btrfs/qgroup.c
··· 1463 1463 u64 bytenr = record->bytenr; 1464 1464 1465 1465 assert_spin_locked(&delayed_refs->lock); 1466 + trace_btrfs_qgroup_insert_dirty_extent(record); 1466 1467 1467 1468 while (*p) { 1468 1469 parent_node = *p; ··· 1595 1594 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 1596 1595 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 1597 1596 1597 + trace_qgroup_update_counters(qg->qgroupid, cur_old_count, 1598 + cur_new_count); 1599 + 1598 1600 /* Rfer update part */ 1599 1601 if (cur_old_count == 0 && cur_new_count > 0) { 1600 1602 qg->rfer += num_bytes; ··· 1687 1683 goto out_free; 1688 1684 BUG_ON(!fs_info->quota_root); 1689 1685 1686 + trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, 1687 + nr_new_roots); 1688 + 1690 1689 qgroups = ulist_alloc(GFP_NOFS); 1691 1690 if (!qgroups) { 1692 1691 ret = -ENOMEM; ··· 1758 1751 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 1759 1752 record = rb_entry(node, struct btrfs_qgroup_extent_record, 1760 1753 node); 1754 + 1755 + trace_btrfs_qgroup_account_extents(record); 1761 1756 1762 1757 if (!ret) { 1763 1758 /* ··· 1851 1842 } 1852 1843 1853 1844 /* 1854 - * copy the acounting information between qgroups. This is necessary when a 1855 - * snapshot or a subvolume is created 1845 + * Copy the acounting information between qgroups. This is necessary 1846 + * when a snapshot or a subvolume is created. Throwing an error will 1847 + * cause a transaction abort so we take extra care here to only error 1848 + * when a readonly fs is a reasonable outcome. 1856 1849 */ 1857 1850 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 1858 1851 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, ··· 1884 1873 2 * inherit->num_excl_copies; 1885 1874 for (i = 0; i < nums; ++i) { 1886 1875 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 1887 - if (!srcgroup) { 1888 - ret = -EINVAL; 1889 - goto out; 1890 - } 1891 1876 1892 - if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { 1893 - ret = -EINVAL; 1894 - goto out; 1895 - } 1877 + /* 1878 + * Zero out invalid groups so we can ignore 1879 + * them later. 1880 + */ 1881 + if (!srcgroup || 1882 + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 1883 + *i_qgroups = 0ULL; 1884 + 1896 1885 ++i_qgroups; 1897 1886 } 1898 1887 } ··· 1927 1916 */ 1928 1917 if (inherit) { 1929 1918 i_qgroups = (u64 *)(inherit + 1); 1930 - for (i = 0; i < inherit->num_qgroups; ++i) { 1919 + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 1920 + if (*i_qgroups == 0) 1921 + continue; 1931 1922 ret = add_qgroup_relation_item(trans, quota_root, 1932 1923 objectid, *i_qgroups); 1933 - if (ret) 1924 + if (ret && ret != -EEXIST) 1934 1925 goto out; 1935 1926 ret = add_qgroup_relation_item(trans, quota_root, 1936 1927 *i_qgroups, objectid); 1937 - if (ret) 1928 + if (ret && ret != -EEXIST) 1938 1929 goto out; 1939 - ++i_qgroups; 1940 1930 } 1931 + ret = 0; 1941 1932 } 1942 1933 1943 1934 ··· 2000 1987 2001 1988 i_qgroups = (u64 *)(inherit + 1); 2002 1989 for (i = 0; i < inherit->num_qgroups; ++i) { 2003 - ret = add_relation_rb(quota_root->fs_info, objectid, 2004 - *i_qgroups); 2005 - if (ret) 2006 - goto unlock; 1990 + if (*i_qgroups) { 1991 + ret = add_relation_rb(quota_root->fs_info, objectid, 1992 + *i_qgroups); 1993 + if (ret) 1994 + goto unlock; 1995 + } 2007 1996 ++i_qgroups; 2008 1997 } 2009 1998 2010 - for (i = 0; i < inherit->num_ref_copies; ++i) { 1999 + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 2011 2000 struct btrfs_qgroup *src; 2012 2001 struct btrfs_qgroup *dst; 2002 + 2003 + if (!i_qgroups[0] || !i_qgroups[1]) 2004 + continue; 2013 2005 2014 2006 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2015 2007 dst = find_qgroup_rb(fs_info, i_qgroups[1]); ··· 2026 2008 2027 2009 dst->rfer = src->rfer - level_size; 2028 2010 dst->rfer_cmpr = src->rfer_cmpr - level_size; 2029 - i_qgroups += 2; 2030 2011 } 2031 - for (i = 0; i < inherit->num_excl_copies; ++i) { 2012 + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 2032 2013 struct btrfs_qgroup *src; 2033 2014 struct btrfs_qgroup *dst; 2015 + 2016 + if (!i_qgroups[0] || !i_qgroups[1]) 2017 + continue; 2034 2018 2035 2019 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2036 2020 dst = find_qgroup_rb(fs_info, i_qgroups[1]); ··· 2044 2024 2045 2025 dst->excl = src->excl + level_size; 2046 2026 dst->excl_cmpr = src->excl_cmpr + level_size; 2047 - i_qgroups += 2; 2048 2027 } 2049 2028 2050 2029 unlock:
+1
fs/btrfs/relocation.c
··· 1850 1850 eb = read_tree_block(dest, old_bytenr, old_ptr_gen); 1851 1851 if (IS_ERR(eb)) { 1852 1852 ret = PTR_ERR(eb); 1853 + break; 1853 1854 } else if (!extent_buffer_uptodate(eb)) { 1854 1855 ret = -EIO; 1855 1856 free_extent_buffer(eb);
+137
fs/btrfs/tree-log.c
··· 4415 4415 return ret; 4416 4416 } 4417 4417 4418 + /* 4419 + * When we are logging a new inode X, check if it doesn't have a reference that 4420 + * matches the reference from some other inode Y created in a past transaction 4421 + * and that was renamed in the current transaction. If we don't do this, then at 4422 + * log replay time we can lose inode Y (and all its files if it's a directory): 4423 + * 4424 + * mkdir /mnt/x 4425 + * echo "hello world" > /mnt/x/foobar 4426 + * sync 4427 + * mv /mnt/x /mnt/y 4428 + * mkdir /mnt/x # or touch /mnt/x 4429 + * xfs_io -c fsync /mnt/x 4430 + * <power fail> 4431 + * mount fs, trigger log replay 4432 + * 4433 + * After the log replay procedure, we would lose the first directory and all its 4434 + * files (file foobar). 4435 + * For the case where inode Y is not a directory we simply end up losing it: 4436 + * 4437 + * echo "123" > /mnt/foo 4438 + * sync 4439 + * mv /mnt/foo /mnt/bar 4440 + * echo "abc" > /mnt/foo 4441 + * xfs_io -c fsync /mnt/foo 4442 + * <power fail> 4443 + * 4444 + * We also need this for cases where a snapshot entry is replaced by some other 4445 + * entry (file or directory) otherwise we end up with an unreplayable log due to 4446 + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 4447 + * if it were a regular entry: 4448 + * 4449 + * mkdir /mnt/x 4450 + * btrfs subvolume snapshot /mnt /mnt/x/snap 4451 + * btrfs subvolume delete /mnt/x/snap 4452 + * rmdir /mnt/x 4453 + * mkdir /mnt/x 4454 + * fsync /mnt/x or fsync some new file inside it 4455 + * <power fail> 4456 + * 4457 + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 4458 + * the same transaction. 4459 + */ 4460 + static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4461 + const int slot, 4462 + const struct btrfs_key *key, 4463 + struct inode *inode) 4464 + { 4465 + int ret; 4466 + struct btrfs_path *search_path; 4467 + char *name = NULL; 4468 + u32 name_len = 0; 4469 + u32 item_size = btrfs_item_size_nr(eb, slot); 4470 + u32 cur_offset = 0; 4471 + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4472 + 4473 + search_path = btrfs_alloc_path(); 4474 + if (!search_path) 4475 + return -ENOMEM; 4476 + search_path->search_commit_root = 1; 4477 + search_path->skip_locking = 1; 4478 + 4479 + while (cur_offset < item_size) { 4480 + u64 parent; 4481 + u32 this_name_len; 4482 + u32 this_len; 4483 + unsigned long name_ptr; 4484 + struct btrfs_dir_item *di; 4485 + 4486 + if (key->type == BTRFS_INODE_REF_KEY) { 4487 + struct btrfs_inode_ref *iref; 4488 + 4489 + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4490 + parent = key->offset; 4491 + this_name_len = btrfs_inode_ref_name_len(eb, iref); 4492 + name_ptr = (unsigned long)(iref + 1); 4493 + this_len = sizeof(*iref) + this_name_len; 4494 + } else { 4495 + struct btrfs_inode_extref *extref; 4496 + 4497 + extref = (struct btrfs_inode_extref *)(ptr + 4498 + cur_offset); 4499 + parent = btrfs_inode_extref_parent(eb, extref); 4500 + this_name_len = btrfs_inode_extref_name_len(eb, extref); 4501 + name_ptr = (unsigned long)&extref->name; 4502 + this_len = sizeof(*extref) + this_name_len; 4503 + } 4504 + 4505 + if (this_name_len > name_len) { 4506 + char *new_name; 4507 + 4508 + new_name = krealloc(name, this_name_len, GFP_NOFS); 4509 + if (!new_name) { 4510 + ret = -ENOMEM; 4511 + goto out; 4512 + } 4513 + name_len = this_name_len; 4514 + name = new_name; 4515 + } 4516 + 4517 + read_extent_buffer(eb, name, name_ptr, this_name_len); 4518 + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, 4519 + search_path, parent, 4520 + name, this_name_len, 0); 4521 + if (di && !IS_ERR(di)) { 4522 + ret = 1; 4523 + goto out; 4524 + } else if (IS_ERR(di)) { 4525 + ret = PTR_ERR(di); 4526 + goto out; 4527 + } 4528 + btrfs_release_path(search_path); 4529 + 4530 + cur_offset += this_len; 4531 + } 4532 + ret = 0; 4533 + out: 4534 + btrfs_free_path(search_path); 4535 + kfree(name); 4536 + return ret; 4537 + } 4538 + 4418 4539 /* log a single inode in the tree log. 4419 4540 * At least one parent directory for this inode must exist in the tree 4420 4541 * or be logged already. ··· 4722 4601 4723 4602 if (min_key.type == BTRFS_INODE_ITEM_KEY) 4724 4603 need_log_inode_item = false; 4604 + 4605 + if ((min_key.type == BTRFS_INODE_REF_KEY || 4606 + min_key.type == BTRFS_INODE_EXTREF_KEY) && 4607 + BTRFS_I(inode)->generation == trans->transid) { 4608 + ret = btrfs_check_ref_name_override(path->nodes[0], 4609 + path->slots[0], 4610 + &min_key, inode); 4611 + if (ret < 0) { 4612 + err = ret; 4613 + goto out_unlock; 4614 + } else if (ret > 0) { 4615 + err = 1; 4616 + btrfs_set_log_full_commit(root->fs_info, trans); 4617 + goto out_unlock; 4618 + } 4619 + } 4725 4620 4726 4621 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 4727 4622 if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+88 -1
include/trace/events/btrfs.h
··· 23 23 struct extent_buffer; 24 24 struct btrfs_work; 25 25 struct __btrfs_workqueue; 26 - struct btrfs_qgroup_operation; 26 + struct btrfs_qgroup_extent_record; 27 27 28 28 #define show_ref_type(type) \ 29 29 __print_symbolic(type, \ ··· 1231 1231 1232 1232 TP_ARGS(ref_root, reserved) 1233 1233 ); 1234 + 1235 + DECLARE_EVENT_CLASS(btrfs_qgroup_extent, 1236 + TP_PROTO(struct btrfs_qgroup_extent_record *rec), 1237 + 1238 + TP_ARGS(rec), 1239 + 1240 + TP_STRUCT__entry( 1241 + __field( u64, bytenr ) 1242 + __field( u64, num_bytes ) 1243 + ), 1244 + 1245 + TP_fast_assign( 1246 + __entry->bytenr = rec->bytenr, 1247 + __entry->num_bytes = rec->num_bytes; 1248 + ), 1249 + 1250 + TP_printk("bytenr = %llu, num_bytes = %llu", 1251 + (unsigned long long)__entry->bytenr, 1252 + (unsigned long long)__entry->num_bytes) 1253 + ); 1254 + 1255 + DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, 1256 + 1257 + TP_PROTO(struct btrfs_qgroup_extent_record *rec), 1258 + 1259 + TP_ARGS(rec) 1260 + ); 1261 + 1262 + DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent, 1263 + 1264 + TP_PROTO(struct btrfs_qgroup_extent_record *rec), 1265 + 1266 + TP_ARGS(rec) 1267 + ); 1268 + 1269 + TRACE_EVENT(btrfs_qgroup_account_extent, 1270 + 1271 + TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), 1272 + 1273 + TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots), 1274 + 1275 + TP_STRUCT__entry( 1276 + __field( u64, bytenr ) 1277 + __field( u64, num_bytes ) 1278 + __field( u64, nr_old_roots ) 1279 + __field( u64, nr_new_roots ) 1280 + ), 1281 + 1282 + TP_fast_assign( 1283 + __entry->bytenr = bytenr; 1284 + __entry->num_bytes = num_bytes; 1285 + __entry->nr_old_roots = nr_old_roots; 1286 + __entry->nr_new_roots = nr_new_roots; 1287 + ), 1288 + 1289 + TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " 1290 + "nr_new_roots = %llu", 1291 + __entry->bytenr, 1292 + __entry->num_bytes, 1293 + __entry->nr_old_roots, 1294 + __entry->nr_new_roots) 1295 + ); 1296 + 1297 + TRACE_EVENT(qgroup_update_counters, 1298 + 1299 + TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count), 1300 + 1301 + TP_ARGS(qgid, cur_old_count, cur_new_count), 1302 + 1303 + TP_STRUCT__entry( 1304 + __field( u64, qgid ) 1305 + __field( u64, cur_old_count ) 1306 + __field( u64, cur_new_count ) 1307 + ), 1308 + 1309 + TP_fast_assign( 1310 + __entry->qgid = qgid; 1311 + __entry->cur_old_count = cur_old_count; 1312 + __entry->cur_new_count = cur_new_count; 1313 + ), 1314 + 1315 + TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", 1316 + __entry->qgid, 1317 + __entry->cur_old_count, 1318 + __entry->cur_new_count) 1319 + ); 1320 + 1234 1321 #endif /* _TRACE_BTRFS_H */ 1235 1322 1236 1323 /* This part must be outside protection */