Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

+5 -4

fs/btrfs/acl.c

··· 178 178 179 179 if (value) { 180 180 acl = posix_acl_from_xattr(value, size); 181 - if (acl == NULL) { 182 - value = NULL; 183 - size = 0; 181 + if (acl) { 182 + ret = posix_acl_valid(acl); 183 + if (ret) 184 + goto out; 184 185 } else if (IS_ERR(acl)) { 185 186 return PTR_ERR(acl); 186 187 } 187 188 } 188 189 189 190 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); 190 - 191 + out: 191 192 posix_acl_release(acl); 192 193 193 194 return ret;

+8 -1

fs/btrfs/ctree.h

··· 740 740 */ 741 741 unsigned long reservation_progress; 742 742 743 - int full; /* indicates that we cannot allocate any more 743 + int full:1; /* indicates that we cannot allocate any more 744 744 chunks for this space */ 745 + int chunk_alloc:1; /* set if we are allocating a chunk */ 746 + 745 747 int force_alloc; /* set if we need to force a chunk alloc for 746 748 this space */ 747 749 ··· 2578 2576 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2579 2577 struct inode *inode, u64 start, u64 end); 2580 2578 int btrfs_release_file(struct inode *inode, struct file *file); 2579 + void btrfs_drop_pages(struct page **pages, size_t num_pages); 2580 + int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 2581 + struct page **pages, size_t num_pages, 2582 + loff_t pos, size_t write_bytes, 2583 + struct extent_state **cached); 2581 2584 2582 2585 /* tree-defrag.c */ 2583 2586 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,

+1 -1

fs/btrfs/disk-io.c

··· 3057 3057 btrfs_destroy_pinned_extent(root, 3058 3058 root->fs_info->pinned_extents); 3059 3059 3060 - t->use_count = 0; 3060 + atomic_set(&t->use_count, 0); 3061 3061 list_del_init(&t->list); 3062 3062 memset(t, 0, sizeof(*t)); 3063 3063 kmem_cache_free(btrfs_transaction_cachep, t);

+98 -27

fs/btrfs/extent-tree.c

··· 33 33 #include "locking.h" 34 34 #include "free-space-cache.h" 35 35 36 + /* control flags for do_chunk_alloc's force field 37 + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 38 + * if we really need one. 39 + * 40 + * CHUNK_ALLOC_FORCE means it must try to allocate one 41 + * 42 + * CHUNK_ALLOC_LIMITED means to only try and allocate one 43 + * if we have very few chunks already allocated. This is 44 + * used as part of the clustering code to help make sure 45 + * we have a good pool of storage to cluster in, without 46 + * filling the FS with empty chunks 47 + * 48 + */ 49 + enum { 50 + CHUNK_ALLOC_NO_FORCE = 0, 51 + CHUNK_ALLOC_FORCE = 1, 52 + CHUNK_ALLOC_LIMITED = 2, 53 + }; 54 + 36 55 static int update_block_group(struct btrfs_trans_handle *trans, 37 56 struct btrfs_root *root, 38 57 u64 bytenr, u64 num_bytes, int alloc); ··· 3038 3019 found->bytes_readonly = 0; 3039 3020 found->bytes_may_use = 0; 3040 3021 found->full = 0; 3041 - found->force_alloc = 0; 3022 + found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3023 + found->chunk_alloc = 0; 3042 3024 *space_info = found; 3043 3025 list_add_rcu(&found->list, &info->space_info); 3044 3026 atomic_set(&found->caching_threads, 0); ··· 3170 3150 if (!data_sinfo->full && alloc_chunk) { 3171 3151 u64 alloc_target; 3172 3152 3173 - data_sinfo->force_alloc = 1; 3153 + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3174 3154 spin_unlock(&data_sinfo->lock); 3175 3155 alloc: 3176 3156 alloc_target = btrfs_get_alloc_profile(root, 1); ··· 3180 3160 3181 3161 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3182 3162 bytes + 2 * 1024 * 1024, 3183 - alloc_target, 0); 3163 + alloc_target, 3164 + CHUNK_ALLOC_NO_FORCE); 3184 3165 btrfs_end_transaction(trans, root); 3185 3166 if (ret < 0) { 3186 3167 if (ret != -ENOSPC) ··· 3260 3239 rcu_read_lock(); 3261 3240 list_for_each_entry_rcu(found, head, list) { 3262 3241 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3263 - found->force_alloc = 1; 3242 + found->force_alloc = CHUNK_ALLOC_FORCE; 3264 3243 } 3265 3244 rcu_read_unlock(); 3266 3245 } 3267 3246 3268 3247 static int should_alloc_chunk(struct btrfs_root *root, 3269 - struct btrfs_space_info *sinfo, u64 alloc_bytes) 3248 + struct btrfs_space_info *sinfo, u64 alloc_bytes, 3249 + int force) 3270 3250 { 3271 3251 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3252 + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3272 3253 u64 thresh; 3273 3254 3274 - if (sinfo->bytes_used + sinfo->bytes_reserved + 3275 - alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3255 + if (force == CHUNK_ALLOC_FORCE) 3256 + return 1; 3257 + 3258 + /* 3259 + * in limited mode, we want to have some free space up to 3260 + * about 1% of the FS size. 3261 + */ 3262 + if (force == CHUNK_ALLOC_LIMITED) { 3263 + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3264 + thresh = max_t(u64, 64 * 1024 * 1024, 3265 + div_factor_fine(thresh, 1)); 3266 + 3267 + if (num_bytes - num_allocated < thresh) 3268 + return 1; 3269 + } 3270 + 3271 + /* 3272 + * we have two similar checks here, one based on percentage 3273 + * and once based on a hard number of 256MB. The idea 3274 + * is that if we have a good amount of free 3275 + * room, don't allocate a chunk. A good mount is 3276 + * less than 80% utilized of the chunks we have allocated, 3277 + * or more than 256MB free 3278 + */ 3279 + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3276 3280 return 0; 3277 3281 3278 - if (sinfo->bytes_used + sinfo->bytes_reserved + 3279 - alloc_bytes < div_factor(num_bytes, 8)) 3282 + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3280 3283 return 0; 3281 3284 3282 3285 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3286 + 3287 + /* 256MB or 5% of the FS */ 3283 3288 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3284 3289 3285 3290 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3286 3291 return 0; 3287 - 3288 3292 return 1; 3289 3293 } 3290 3294 ··· 3319 3273 { 3320 3274 struct btrfs_space_info *space_info; 3321 3275 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3276 + int wait_for_alloc = 0; 3322 3277 int ret = 0; 3323 - 3324 - mutex_lock(&fs_info->chunk_mutex); 3325 3278 3326 3279 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3327 3280 ··· 3332 3287 } 3333 3288 BUG_ON(!space_info); 3334 3289 3290 + again: 3335 3291 spin_lock(&space_info->lock); 3336 3292 if (space_info->force_alloc) 3337 - force = 1; 3293 + force = space_info->force_alloc; 3338 3294 if (space_info->full) { 3339 3295 spin_unlock(&space_info->lock); 3340 - goto out; 3296 + return 0; 3341 3297 } 3342 3298 3343 - if (!force && !should_alloc_chunk(extent_root, space_info, 3344 - alloc_bytes)) { 3299 + if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { 3345 3300 spin_unlock(&space_info->lock); 3346 - goto out; 3301 + return 0; 3302 + } else if (space_info->chunk_alloc) { 3303 + wait_for_alloc = 1; 3304 + } else { 3305 + space_info->chunk_alloc = 1; 3347 3306 } 3307 + 3348 3308 spin_unlock(&space_info->lock); 3309 + 3310 + mutex_lock(&fs_info->chunk_mutex); 3311 + 3312 + /* 3313 + * The chunk_mutex is held throughout the entirety of a chunk 3314 + * allocation, so once we've acquired the chunk_mutex we know that the 3315 + * other guy is done and we need to recheck and see if we should 3316 + * allocate. 3317 + */ 3318 + if (wait_for_alloc) { 3319 + mutex_unlock(&fs_info->chunk_mutex); 3320 + wait_for_alloc = 0; 3321 + goto again; 3322 + } 3349 3323 3350 3324 /* 3351 3325 * If we have mixed data/metadata chunks we want to make sure we keep ··· 3391 3327 space_info->full = 1; 3392 3328 else 3393 3329 ret = 1; 3394 - space_info->force_alloc = 0; 3330 + 3331 + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3332 + space_info->chunk_alloc = 0; 3395 3333 spin_unlock(&space_info->lock); 3396 - out: 3397 3334 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3398 3335 return ret; 3399 3336 } ··· 5368 5303 5369 5304 if (allowed_chunk_alloc) { 5370 5305 ret = do_chunk_alloc(trans, root, num_bytes + 5371 - 2 * 1024 * 1024, data, 1); 5306 + 2 * 1024 * 1024, data, 5307 + CHUNK_ALLOC_LIMITED); 5372 5308 allowed_chunk_alloc = 0; 5373 5309 done_chunk_alloc = 1; 5374 - } else if (!done_chunk_alloc) { 5375 - space_info->force_alloc = 1; 5310 + } else if (!done_chunk_alloc && 5311 + space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { 5312 + space_info->force_alloc = CHUNK_ALLOC_LIMITED; 5376 5313 } 5377 5314 5378 5315 if (loop < LOOP_NO_EMPTY_SIZE) { ··· 5460 5393 */ 5461 5394 if (empty_size || root->ref_cows) 5462 5395 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5463 - num_bytes + 2 * 1024 * 1024, data, 0); 5396 + num_bytes + 2 * 1024 * 1024, data, 5397 + CHUNK_ALLOC_NO_FORCE); 5464 5398 5465 5399 WARN_ON(num_bytes < root->sectorsize); 5466 5400 ret = find_free_extent(trans, root, num_bytes, empty_size, ··· 5473 5405 num_bytes = num_bytes & ~(root->sectorsize - 1); 5474 5406 num_bytes = max(num_bytes, min_alloc_size); 5475 5407 do_chunk_alloc(trans, root->fs_info->extent_root, 5476 - num_bytes, data, 1); 5408 + num_bytes, data, CHUNK_ALLOC_FORCE); 5477 5409 goto again; 5478 5410 } 5479 5411 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { ··· 8177 8109 8178 8110 alloc_flags = update_block_group_flags(root, cache->flags); 8179 8111 if (alloc_flags != cache->flags) 8180 - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8112 + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8113 + CHUNK_ALLOC_FORCE); 8181 8114 8182 8115 ret = set_block_group_ro(cache); 8183 8116 if (!ret) 8184 8117 goto out; 8185 8118 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8186 - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8119 + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8120 + CHUNK_ALLOC_FORCE); 8187 8121 if (ret < 0) 8188 8122 goto out; 8189 8123 ret = set_block_group_ro(cache); ··· 8198 8128 struct btrfs_root *root, u64 type) 8199 8129 { 8200 8130 u64 alloc_flags = get_alloc_profile(root, type); 8201 - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8131 + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 8132 + CHUNK_ALLOC_FORCE); 8202 8133 } 8203 8134 8204 8135 /*

+62 -20

fs/btrfs/extent_io.c

··· 690 690 } 691 691 } 692 692 693 + static void uncache_state(struct extent_state **cached_ptr) 694 + { 695 + if (cached_ptr && (*cached_ptr)) { 696 + struct extent_state *state = *cached_ptr; 697 + *cached_ptr = NULL; 698 + free_extent_state(state); 699 + } 700 + } 701 + 693 702 /* 694 703 * set some bits on a range in the tree. This may require allocations or 695 704 * sleeping, so the gfp mask is used to indicate what is allowed. ··· 949 940 } 950 941 951 942 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 952 - gfp_t mask) 943 + struct extent_state **cached_state, gfp_t mask) 953 944 { 954 - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 955 - NULL, mask); 945 + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 946 + NULL, cached_state, mask); 956 947 } 957 948 958 949 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, ··· 1021 1012 mask); 1022 1013 } 1023 1014 1024 - int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1025 - gfp_t mask) 1015 + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 1026 1016 { 1027 1017 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1028 1018 mask); ··· 1743 1735 1744 1736 do { 1745 1737 struct page *page = bvec->bv_page; 1738 + struct extent_state *cached = NULL; 1739 + struct extent_state *state; 1740 + 1746 1741 tree = &BTRFS_I(page->mapping->host)->io_tree; 1747 1742 1748 1743 start = ((u64)page->index << PAGE_CACHE_SHIFT) + ··· 1760 1749 if (++bvec <= bvec_end) 1761 1750 prefetchw(&bvec->bv_page->flags); 1762 1751 1752 + spin_lock(&tree->lock); 1753 + state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 1754 + if (state && state->start == start) { 1755 + /* 1756 + * take a reference on the state, unlock will drop 1757 + * the ref 1758 + */ 1759 + cache_state(state, &cached); 1760 + } 1761 + spin_unlock(&tree->lock); 1762 + 1763 1763 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1764 1764 ret = tree->ops->readpage_end_io_hook(page, start, end, 1765 - NULL); 1765 + state); 1766 1766 if (ret) 1767 1767 uptodate = 0; 1768 1768 } ··· 1786 1764 test_bit(BIO_UPTODATE, &bio->bi_flags); 1787 1765 if (err) 1788 1766 uptodate = 0; 1767 + uncache_state(&cached); 1789 1768 continue; 1790 1769 } 1791 1770 } 1792 1771 1793 1772 if (uptodate) { 1794 - set_extent_uptodate(tree, start, end, 1773 + set_extent_uptodate(tree, start, end, &cached, 1795 1774 GFP_ATOMIC); 1796 1775 } 1797 - unlock_extent(tree, start, end, GFP_ATOMIC); 1776 + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 1798 1777 1799 1778 if (whole_page) { 1800 1779 if (uptodate) { ··· 1834 1811 1835 1812 do { 1836 1813 struct page *page = bvec->bv_page; 1814 + struct extent_state *cached = NULL; 1837 1815 tree = &BTRFS_I(page->mapping->host)->io_tree; 1838 1816 1839 1817 start = ((u64)page->index << PAGE_CACHE_SHIFT) + ··· 1845 1821 prefetchw(&bvec->bv_page->flags); 1846 1822 1847 1823 if (uptodate) { 1848 - set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1824 + set_extent_uptodate(tree, start, end, &cached, 1825 + GFP_ATOMIC); 1849 1826 } else { 1850 1827 ClearPageUptodate(page); 1851 1828 SetPageError(page); 1852 1829 } 1853 1830 1854 - unlock_extent(tree, start, end, GFP_ATOMIC); 1831 + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 1855 1832 1856 1833 } while (bvec >= bio->bi_io_vec); 1857 1834 ··· 2041 2016 while (cur <= end) { 2042 2017 if (cur >= last_byte) { 2043 2018 char *userpage; 2019 + struct extent_state *cached = NULL; 2020 + 2044 2021 iosize = PAGE_CACHE_SIZE - page_offset; 2045 2022 userpage = kmap_atomic(page, KM_USER0); 2046 2023 memset(userpage + page_offset, 0, iosize); 2047 2024 flush_dcache_page(page); 2048 2025 kunmap_atomic(userpage, KM_USER0); 2049 2026 set_extent_uptodate(tree, cur, cur + iosize - 1, 2050 - GFP_NOFS); 2051 - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2027 + &cached, GFP_NOFS); 2028 + unlock_extent_cached(tree, cur, cur + iosize - 1, 2029 + &cached, GFP_NOFS); 2052 2030 break; 2053 2031 } 2054 2032 em = get_extent(inode, page, page_offset, cur, ··· 2091 2063 /* we've found a hole, just zero and go on */ 2092 2064 if (block_start == EXTENT_MAP_HOLE) { 2093 2065 char *userpage; 2066 + struct extent_state *cached = NULL; 2067 + 2094 2068 userpage = kmap_atomic(page, KM_USER0); 2095 2069 memset(userpage + page_offset, 0, iosize); 2096 2070 flush_dcache_page(page); 2097 2071 kunmap_atomic(userpage, KM_USER0); 2098 2072 2099 2073 set_extent_uptodate(tree, cur, cur + iosize - 1, 2100 - GFP_NOFS); 2101 - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2074 + &cached, GFP_NOFS); 2075 + unlock_extent_cached(tree, cur, cur + iosize - 1, 2076 + &cached, GFP_NOFS); 2102 2077 cur = cur + iosize; 2103 2078 page_offset += iosize; 2104 2079 continue; ··· 2820 2789 iocount++; 2821 2790 block_start = block_start + iosize; 2822 2791 } else { 2823 - set_extent_uptodate(tree, block_start, cur_end, 2792 + struct extent_state *cached = NULL; 2793 + 2794 + set_extent_uptodate(tree, block_start, cur_end, &cached, 2824 2795 GFP_NOFS); 2825 - unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2796 + unlock_extent_cached(tree, block_start, cur_end, 2797 + &cached, GFP_NOFS); 2826 2798 block_start = cur_end + 1; 2827 2799 } 2828 2800 page_offset = block_start & (PAGE_CACHE_SIZE - 1); ··· 3491 3457 num_pages = num_extent_pages(eb->start, eb->len); 3492 3458 3493 3459 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3494 - GFP_NOFS); 3460 + NULL, GFP_NOFS); 3495 3461 for (i = 0; i < num_pages; i++) { 3496 3462 page = extent_buffer_page(eb, i); 3497 3463 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || ··· 3919 3885 kunmap_atomic(dst_kaddr, KM_USER0); 3920 3886 } 3921 3887 3888 + static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3889 + { 3890 + unsigned long distance = (src > dst) ? src - dst : dst - src; 3891 + return distance < len; 3892 + } 3893 + 3922 3894 static void copy_pages(struct page *dst_page, struct page *src_page, 3923 3895 unsigned long dst_off, unsigned long src_off, 3924 3896 unsigned long len) ··· 3932 3892 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3933 3893 char *src_kaddr; 3934 3894 3935 - if (dst_page != src_page) 3895 + if (dst_page != src_page) { 3936 3896 src_kaddr = kmap_atomic(src_page, KM_USER1); 3937 - else 3897 + } else { 3938 3898 src_kaddr = dst_kaddr; 3899 + BUG_ON(areas_overlap(src_off, dst_off, len)); 3900 + } 3939 3901 3940 3902 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3941 3903 kunmap_atomic(dst_kaddr, KM_USER0); ··· 4012 3970 "len %lu len %lu\n", dst_offset, len, dst->len); 4013 3971 BUG_ON(1); 4014 3972 } 4015 - if (dst_offset < src_offset) { 3973 + if (!areas_overlap(src_offset, dst_offset, len)) { 4016 3974 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4017 3975 return; 4018 3976 }

+1 -1

fs/btrfs/extent_io.h

··· 208 208 int bits, int exclusive_bits, u64 *failed_start, 209 209 struct extent_state **cached_state, gfp_t mask); 210 210 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 211 - gfp_t mask); 211 + struct extent_state **cached_state, gfp_t mask); 212 212 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 213 213 gfp_t mask); 214 214 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,

+9 -12

fs/btrfs/file.c

··· 104 104 /* 105 105 * unlocks pages after btrfs_file_write is done with them 106 106 */ 107 - static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 107 + void btrfs_drop_pages(struct page **pages, size_t num_pages) 108 108 { 109 109 size_t i; 110 110 for (i = 0; i < num_pages; i++) { ··· 127 127 * this also makes the decision about creating an inline extent vs 128 128 * doing real data extents, marking pages dirty and delalloc as required. 129 129 */ 130 - static noinline int dirty_and_release_pages(struct btrfs_root *root, 131 - struct file *file, 132 - struct page **pages, 133 - size_t num_pages, 134 - loff_t pos, 135 - size_t write_bytes) 130 + int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 131 + struct page **pages, size_t num_pages, 132 + loff_t pos, size_t write_bytes, 133 + struct extent_state **cached) 136 134 { 137 135 int err = 0; 138 136 int i; 139 - struct inode *inode = fdentry(file)->d_inode; 140 137 u64 num_bytes; 141 138 u64 start_pos; 142 139 u64 end_of_last_block; ··· 146 149 147 150 end_of_last_block = start_pos + num_bytes - 1; 148 151 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 149 - NULL); 152 + cached); 150 153 if (err) 151 154 return err; 152 155 ··· 989 992 } 990 993 991 994 if (copied > 0) { 992 - ret = dirty_and_release_pages(root, file, pages, 993 - dirty_pages, pos, 994 - copied); 995 + ret = btrfs_dirty_pages(root, inode, pages, 996 + dirty_pages, pos, copied, 997 + NULL); 995 998 if (ret) { 996 999 btrfs_delalloc_release_space(inode, 997 1000 dirty_pages << PAGE_CACHE_SHIFT);

+56 -63

fs/btrfs/free-space-cache.c

··· 508 508 struct inode *inode; 509 509 struct rb_node *node; 510 510 struct list_head *pos, *n; 511 + struct page **pages; 511 512 struct page *page; 512 513 struct extent_state *cached_state = NULL; 513 514 struct btrfs_free_cluster *cluster = NULL; ··· 518 517 u64 start, end, len; 519 518 u64 bytes = 0; 520 519 u32 *crc, *checksums; 521 - pgoff_t index = 0, last_index = 0; 522 520 unsigned long first_page_offset; 523 - int num_checksums; 521 + int index = 0, num_pages = 0; 524 522 int entries = 0; 525 523 int bitmaps = 0; 526 524 int ret = 0; 527 525 bool next_page = false; 526 + bool out_of_space = false; 528 527 529 528 root = root->fs_info->tree_root; 530 529 ··· 552 551 return 0; 553 552 } 554 553 555 - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 554 + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 555 + PAGE_CACHE_SHIFT; 556 556 filemap_write_and_wait(inode->i_mapping); 557 557 btrfs_wait_ordered_range(inode, inode->i_size & 558 558 ~(root->sectorsize - 1), (u64)-1); 559 559 560 560 /* We need a checksum per page. */ 561 - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 562 - crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); 561 + crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); 563 562 if (!crc) { 563 + iput(inode); 564 + return 0; 565 + } 566 + 567 + pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 568 + if (!pages) { 569 + kfree(crc); 564 570 iput(inode); 565 571 return 0; 566 572 } ··· 576 568 * need to calculate the offset into the page that we can start writing 577 569 * our entries. 578 570 */ 579 - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 571 + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); 580 572 581 573 /* Get the cluster for this block_group if it exists */ 582 574 if (!list_empty(&block_group->cluster_list)) ··· 598 590 * after find_get_page at this point. Just putting this here so people 599 591 * know and don't freak out. 600 592 */ 601 - while (index <= last_index) { 593 + while (index < num_pages) { 602 594 page = grab_cache_page(inode->i_mapping, index); 603 595 if (!page) { 604 - pgoff_t i = 0; 596 + int i; 605 597 606 - while (i < index) { 607 - page = find_get_page(inode->i_mapping, i); 608 - unlock_page(page); 609 - page_cache_release(page); 610 - page_cache_release(page); 611 - i++; 598 + for (i = 0; i < num_pages; i++) { 599 + unlock_page(pages[i]); 600 + page_cache_release(pages[i]); 612 601 } 613 602 goto out_free; 614 603 } 604 + pages[index] = page; 615 605 index++; 616 606 } 617 607 ··· 637 631 offset = start_offset; 638 632 } 639 633 640 - page = find_get_page(inode->i_mapping, index); 634 + if (index >= num_pages) { 635 + out_of_space = true; 636 + break; 637 + } 638 + 639 + page = pages[index]; 641 640 642 641 addr = kmap(page); 643 642 entry = addr + start_offset; ··· 719 708 720 709 bytes += PAGE_CACHE_SIZE; 721 710 722 - ClearPageChecked(page); 723 - set_page_extent_mapped(page); 724 - SetPageUptodate(page); 725 - set_page_dirty(page); 726 - 727 - /* 728 - * We need to release our reference we got for grab_cache_page, 729 - * except for the first page which will hold our checksums, we 730 - * do that below. 731 - */ 732 - if (index != 0) { 733 - unlock_page(page); 734 - page_cache_release(page); 735 - } 736 - 737 - page_cache_release(page); 738 - 739 711 index++; 740 712 } while (node || next_page); 741 713 ··· 728 734 struct btrfs_free_space *entry = 729 735 list_entry(pos, struct btrfs_free_space, list); 730 736 731 - page = find_get_page(inode->i_mapping, index); 737 + if (index >= num_pages) { 738 + out_of_space = true; 739 + break; 740 + } 741 + page = pages[index]; 732 742 733 743 addr = kmap(page); 734 744 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); ··· 743 745 crc++; 744 746 bytes += PAGE_CACHE_SIZE; 745 747 746 - ClearPageChecked(page); 747 - set_page_extent_mapped(page); 748 - SetPageUptodate(page); 749 - set_page_dirty(page); 750 - unlock_page(page); 751 - page_cache_release(page); 752 - page_cache_release(page); 753 748 list_del_init(&entry->list); 754 749 index++; 755 750 } 756 751 752 + if (out_of_space) { 753 + btrfs_drop_pages(pages, num_pages); 754 + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 755 + i_size_read(inode) - 1, &cached_state, 756 + GFP_NOFS); 757 + ret = 0; 758 + goto out_free; 759 + } 760 + 757 761 /* Zero out the rest of the pages just to make sure */ 758 - while (index <= last_index) { 762 + while (index < num_pages) { 759 763 void *addr; 760 764 761 - page = find_get_page(inode->i_mapping, index); 762 - 765 + page = pages[index]; 763 766 addr = kmap(page); 764 767 memset(addr, 0, PAGE_CACHE_SIZE); 765 768 kunmap(page); 766 - ClearPageChecked(page); 767 - set_page_extent_mapped(page); 768 - SetPageUptodate(page); 769 - set_page_dirty(page); 770 - unlock_page(page); 771 - page_cache_release(page); 772 - page_cache_release(page); 773 769 bytes += PAGE_CACHE_SIZE; 774 770 index++; 775 771 } 776 - 777 - btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); 778 772 779 773 /* Write the checksums and trans id to the first page */ 780 774 { 781 775 void *addr; 782 776 u64 *gen; 783 777 784 - page = find_get_page(inode->i_mapping, 0); 778 + page = pages[0]; 785 779 786 780 addr = kmap(page); 787 - memcpy(addr, checksums, sizeof(u32) * num_checksums); 788 - gen = addr + (sizeof(u32) * num_checksums); 781 + memcpy(addr, checksums, sizeof(u32) * num_pages); 782 + gen = addr + (sizeof(u32) * num_pages); 789 783 *gen = trans->transid; 790 784 kunmap(page); 791 - ClearPageChecked(page); 792 - set_page_extent_mapped(page); 793 - SetPageUptodate(page); 794 - set_page_dirty(page); 795 - unlock_page(page); 796 - page_cache_release(page); 797 - page_cache_release(page); 798 785 } 799 - BTRFS_I(inode)->generation = trans->transid; 800 786 787 + ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 788 + bytes, &cached_state); 789 + btrfs_drop_pages(pages, num_pages); 801 790 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 802 791 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 792 + 793 + if (ret) { 794 + ret = 0; 795 + goto out_free; 796 + } 797 + 798 + BTRFS_I(inode)->generation = trans->transid; 803 799 804 800 filemap_write_and_wait(inode->i_mapping); 805 801 ··· 845 853 BTRFS_I(inode)->generation = 0; 846 854 } 847 855 kfree(checksums); 856 + kfree(pages); 848 857 btrfs_update_inode(trans, root, inode); 849 858 iput(inode); 850 859 return ret;

+116 -49

fs/btrfs/inode.c

··· 1770 1770 add_pending_csums(trans, inode, ordered_extent->file_offset, 1771 1771 &ordered_extent->list); 1772 1772 1773 - btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1774 - ret = btrfs_update_inode(trans, root, inode); 1775 - BUG_ON(ret); 1773 + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1774 + if (!ret) { 1775 + ret = btrfs_update_inode(trans, root, inode); 1776 + BUG_ON(ret); 1777 + } 1778 + ret = 0; 1776 1779 out: 1777 1780 if (nolock) { 1778 1781 if (trans) ··· 2593 2590 struct btrfs_inode_item *item, 2594 2591 struct inode *inode) 2595 2592 { 2593 + if (!leaf->map_token) 2594 + map_private_extent_buffer(leaf, (unsigned long)item, 2595 + sizeof(struct btrfs_inode_item), 2596 + &leaf->map_token, &leaf->kaddr, 2597 + &leaf->map_start, &leaf->map_len, 2598 + KM_USER1); 2599 + 2596 2600 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2597 2601 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2598 2602 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); ··· 2628 2618 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2629 2619 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2630 2620 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2621 + 2622 + if (leaf->map_token) { 2623 + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); 2624 + leaf->map_token = NULL; 2625 + } 2631 2626 } 2632 2627 2633 2628 /* ··· 4222 4207 struct btrfs_key found_key; 4223 4208 struct btrfs_path *path; 4224 4209 int ret; 4225 - u32 nritems; 4226 4210 struct extent_buffer *leaf; 4227 4211 int slot; 4228 - int advance; 4229 4212 unsigned char d_type; 4230 4213 int over = 0; 4231 4214 u32 di_cur; ··· 4266 4253 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4267 4254 if (ret < 0) 4268 4255 goto err; 4269 - advance = 0; 4270 4256 4271 4257 while (1) { 4272 4258 leaf = path->nodes[0]; 4273 - nritems = btrfs_header_nritems(leaf); 4274 4259 slot = path->slots[0]; 4275 - if (advance || slot >= nritems) { 4276 - if (slot >= nritems - 1) { 4277 - ret = btrfs_next_leaf(root, path); 4278 - if (ret) 4279 - break; 4280 - leaf = path->nodes[0]; 4281 - nritems = btrfs_header_nritems(leaf); 4282 - slot = path->slots[0]; 4283 - } else { 4284 - slot++; 4285 - path->slots[0]++; 4286 - } 4260 + if (slot >= btrfs_header_nritems(leaf)) { 4261 + ret = btrfs_next_leaf(root, path); 4262 + if (ret < 0) 4263 + goto err; 4264 + else if (ret > 0) 4265 + break; 4266 + continue; 4287 4267 } 4288 4268 4289 - advance = 1; 4290 4269 item = btrfs_item_nr(leaf, slot); 4291 4270 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4292 4271 ··· 4287 4282 if (btrfs_key_type(&found_key) != key_type) 4288 4283 break; 4289 4284 if (found_key.offset < filp->f_pos) 4290 - continue; 4285 + goto next; 4291 4286 4292 4287 filp->f_pos = found_key.offset; 4293 4288 ··· 4340 4335 di_cur += di_len; 4341 4336 di = (struct btrfs_dir_item *)((char *)di + di_len); 4342 4337 } 4338 + next: 4339 + path->slots[0]++; 4343 4340 } 4344 4341 4345 4342 /* Reached end of directory/root. Bump pos past the last item. */ ··· 4534 4527 BUG_ON(!path); 4535 4528 4536 4529 inode = new_inode(root->fs_info->sb); 4537 - if (!inode) 4530 + if (!inode) { 4531 + btrfs_free_path(path); 4538 4532 return ERR_PTR(-ENOMEM); 4533 + } 4539 4534 4540 4535 if (dir) { 4541 4536 trace_btrfs_inode_request(dir); 4542 4537 4543 4538 ret = btrfs_set_inode_index(dir, index); 4544 4539 if (ret) { 4540 + btrfs_free_path(path); 4545 4541 iput(inode); 4546 4542 return ERR_PTR(ret); 4547 4543 } ··· 4844 4834 if (inode->i_nlink == ~0U) 4845 4835 return -EMLINK; 4846 4836 4847 - btrfs_inc_nlink(inode); 4848 - inode->i_ctime = CURRENT_TIME; 4849 - 4850 4837 err = btrfs_set_inode_index(dir, &index); 4851 4838 if (err) 4852 4839 goto fail; ··· 4858 4851 err = PTR_ERR(trans); 4859 4852 goto fail; 4860 4853 } 4854 + 4855 + btrfs_inc_nlink(inode); 4856 + inode->i_ctime = CURRENT_TIME; 4861 4857 4862 4858 btrfs_set_trans_block_group(trans, dir); 4863 4859 ihold(inode); ··· 5231 5221 btrfs_mark_buffer_dirty(leaf); 5232 5222 } 5233 5223 set_extent_uptodate(io_tree, em->start, 5234 - extent_map_end(em) - 1, GFP_NOFS); 5224 + extent_map_end(em) - 1, NULL, GFP_NOFS); 5235 5225 goto insert; 5236 5226 } else { 5237 5227 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); ··· 5438 5428 } 5439 5429 5440 5430 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5431 + struct extent_map *em, 5441 5432 u64 start, u64 len) 5442 5433 { 5443 5434 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 5435 struct btrfs_trans_handle *trans; 5445 - struct extent_map *em; 5446 5436 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5447 5437 struct btrfs_key ins; 5448 5438 u64 alloc_hint; 5449 5439 int ret; 5440 + bool insert = false; 5450 5441 5451 - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5442 + /* 5443 + * Ok if the extent map we looked up is a hole and is for the exact 5444 + * range we want, there is no reason to allocate a new one, however if 5445 + * it is not right then we need to free this one and drop the cache for 5446 + * our range. 5447 + */ 5448 + if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5449 + em->len != len) { 5450 + free_extent_map(em); 5451 + em = NULL; 5452 + insert = true; 5453 + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5454 + } 5452 5455 5453 5456 trans = btrfs_join_transaction(root, 0); 5454 5457 if (IS_ERR(trans)) ··· 5477 5454 goto out; 5478 5455 } 5479 5456 5480 - em = alloc_extent_map(GFP_NOFS); 5481 5457 if (!em) { 5482 - em = ERR_PTR(-ENOMEM); 5483 - goto out; 5458 + em = alloc_extent_map(GFP_NOFS); 5459 + if (!em) { 5460 + em = ERR_PTR(-ENOMEM); 5461 + goto out; 5462 + } 5484 5463 } 5485 5464 5486 5465 em->start = start; ··· 5492 5467 em->block_start = ins.objectid; 5493 5468 em->block_len = ins.offset; 5494 5469 em->bdev = root->fs_info->fs_devices->latest_bdev; 5470 + 5471 + /* 5472 + * We need to do this because if we're using the original em we searched 5473 + * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5474 + */ 5475 + em->flags = 0; 5495 5476 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5496 5477 5497 - while (1) { 5478 + while (insert) { 5498 5479 write_lock(&em_tree->lock); 5499 5480 ret = add_extent_mapping(em_tree, em); 5500 5481 write_unlock(&em_tree->lock); ··· 5718 5687 * it above 5719 5688 */ 5720 5689 len = bh_result->b_size; 5721 - free_extent_map(em); 5722 - em = btrfs_new_extent_direct(inode, start, len); 5690 + em = btrfs_new_extent_direct(inode, em, start, len); 5723 5691 if (IS_ERR(em)) 5724 5692 return PTR_ERR(em); 5725 5693 len = min(len, em->len - (start - em->start)); ··· 5881 5851 } 5882 5852 5883 5853 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5884 - btrfs_ordered_update_i_size(inode, 0, ordered); 5885 - btrfs_update_inode(trans, root, inode); 5854 + ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5855 + if (!ret) 5856 + btrfs_update_inode(trans, root, inode); 5857 + ret = 0; 5886 5858 out_unlock: 5887 5859 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5888 5860 ordered->file_offset + ordered->len - 1, ··· 5970 5938 5971 5939 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5972 5940 int rw, u64 file_offset, int skip_sum, 5973 - u32 *csums) 5941 + u32 *csums, int async_submit) 5974 5942 { 5975 5943 int write = rw & REQ_WRITE; 5976 5944 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 5981 5949 if (ret) 5982 5950 goto err; 5983 5951 5984 - if (write && !skip_sum) { 5952 + if (skip_sum) 5953 + goto map; 5954 + 5955 + if (write && async_submit) { 5985 5956 ret = btrfs_wq_submit_bio(root->fs_info, 5986 5957 inode, rw, bio, 0, 0, 5987 5958 file_offset, 5988 5959 __btrfs_submit_bio_start_direct_io, 5989 5960 __btrfs_submit_bio_done); 5990 5961 goto err; 5962 + } else if (write) { 5963 + /* 5964 + * If we aren't doing async submit, calculate the csum of the 5965 + * bio now. 5966 + */ 5967 + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 5968 + if (ret) 5969 + goto err; 5991 5970 } else if (!skip_sum) { 5992 5971 ret = btrfs_lookup_bio_sums_dio(root, inode, bio, 5993 5972 file_offset, csums); ··· 6006 5963 goto err; 6007 5964 } 6008 5965 6009 - ret = btrfs_map_bio(root, rw, bio, 0, 1); 5966 + map: 5967 + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 6010 5968 err: 6011 5969 bio_put(bio); 6012 5970 return ret; ··· 6029 5985 int nr_pages = 0; 6030 5986 u32 *csums = dip->csums; 6031 5987 int ret = 0; 5988 + int async_submit = 0; 6032 5989 int write = rw & REQ_WRITE; 6033 - 6034 - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6035 - if (!bio) 6036 - return -ENOMEM; 6037 - bio->bi_private = dip; 6038 - bio->bi_end_io = btrfs_end_dio_bio; 6039 - atomic_inc(&dip->pending_bios); 6040 5990 6041 5991 map_length = orig_bio->bi_size; 6042 5992 ret = btrfs_map_block(map_tree, READ, start_sector << 9, ··· 6039 6001 bio_put(bio); 6040 6002 return -EIO; 6041 6003 } 6004 + 6005 + if (map_length >= orig_bio->bi_size) { 6006 + bio = orig_bio; 6007 + goto submit; 6008 + } 6009 + 6010 + async_submit = 1; 6011 + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6012 + if (!bio) 6013 + return -ENOMEM; 6014 + bio->bi_private = dip; 6015 + bio->bi_end_io = btrfs_end_dio_bio; 6016 + atomic_inc(&dip->pending_bios); 6042 6017 6043 6018 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6044 6019 if (unlikely(map_length < submit_len + bvec->bv_len || ··· 6066 6015 atomic_inc(&dip->pending_bios); 6067 6016 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6068 6017 file_offset, skip_sum, 6069 - csums); 6018 + csums, async_submit); 6070 6019 if (ret) { 6071 6020 bio_put(bio); 6072 6021 atomic_dec(&dip->pending_bios); ··· 6103 6052 } 6104 6053 } 6105 6054 6055 + submit: 6106 6056 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 - csums); 6057 + csums, async_submit); 6108 6058 if (!ret) 6109 6059 return 0; 6110 6060 ··· 6200 6148 unsigned long nr_segs) 6201 6149 { 6202 6150 int seg; 6151 + int i; 6203 6152 size_t size; 6204 6153 unsigned long addr; 6205 6154 unsigned blocksize_mask = root->sectorsize - 1; ··· 6215 6162 addr = (unsigned long)iov[seg].iov_base; 6216 6163 size = iov[seg].iov_len; 6217 6164 end += size; 6218 - if ((addr & blocksize_mask) || (size & blocksize_mask)) 6165 + if ((addr & blocksize_mask) || (size & blocksize_mask)) 6219 6166 goto out; 6167 + 6168 + /* If this is a write we don't need to check anymore */ 6169 + if (rw & WRITE) 6170 + continue; 6171 + 6172 + /* 6173 + * Check to make sure we don't have duplicate iov_base's in this 6174 + * iovec, if so return EINVAL, otherwise we'll get csum errors 6175 + * when reading back. 6176 + */ 6177 + for (i = seg + 1; i < nr_segs; i++) { 6178 + if (iov[seg].iov_base == iov[i].iov_base) 6179 + goto out; 6180 + } 6220 6181 } 6221 6182 retval = 0; 6222 6183 out:

+1 -1

fs/btrfs/ioctl.c

··· 2287 2287 struct btrfs_ioctl_space_info space; 2288 2288 struct btrfs_ioctl_space_info *dest; 2289 2289 struct btrfs_ioctl_space_info *dest_orig; 2290 - struct btrfs_ioctl_space_info *user_dest; 2290 + struct btrfs_ioctl_space_info __user *user_dest; 2291 2291 struct btrfs_space_info *info; 2292 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2293 2293 BTRFS_BLOCK_GROUP_SYSTEM,

+33 -9

fs/btrfs/super.c

··· 159 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 160 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 161 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 162 - Opt_enospc_debug, Opt_err, 162 + Opt_enospc_debug, Opt_subvolrootid, Opt_err, 163 163 }; 164 164 165 165 static match_table_t tokens = { ··· 189 189 {Opt_clear_cache, "clear_cache"}, 190 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 191 191 {Opt_enospc_debug, "enospc_debug"}, 192 + {Opt_subvolrootid, "subvolrootid=%d"}, 192 193 {Opt_err, NULL}, 193 194 }; 194 195 ··· 233 232 break; 234 233 case Opt_subvol: 235 234 case Opt_subvolid: 235 + case Opt_subvolrootid: 236 236 case Opt_device: 237 237 /* 238 238 * These are parsed by btrfs_parse_early_options ··· 390 388 */ 391 389 static int btrfs_parse_early_options(const char *options, fmode_t flags, 392 390 void *holder, char **subvol_name, u64 *subvol_objectid, 393 - struct btrfs_fs_devices **fs_devices) 391 + u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 394 392 { 395 393 substring_t args[MAX_OPT_ARGS]; 396 394 char *opts, *orig, *p; ··· 429 427 BTRFS_FS_TREE_OBJECTID; 430 428 else 431 429 *subvol_objectid = intarg; 430 + } 431 + break; 432 + case Opt_subvolrootid: 433 + intarg = 0; 434 + error = match_int(&args[0], &intarg); 435 + if (!error) { 436 + /* we want the original fs_tree */ 437 + if (!intarg) 438 + *subvol_rootid = 439 + BTRFS_FS_TREE_OBJECTID; 440 + else 441 + *subvol_rootid = intarg; 432 442 } 433 443 break; 434 444 case Opt_device: ··· 750 736 fmode_t mode = FMODE_READ; 751 737 char *subvol_name = NULL; 752 738 u64 subvol_objectid = 0; 739 + u64 subvol_rootid = 0; 753 740 int error = 0; 754 741 755 742 if (!(flags & MS_RDONLY)) ··· 758 743 759 744 error = btrfs_parse_early_options(data, mode, fs_type, 760 745 &subvol_name, &subvol_objectid, 761 - &fs_devices); 746 + &subvol_rootid, &fs_devices); 762 747 if (error) 763 748 return ERR_PTR(error); 764 749 ··· 822 807 s->s_flags |= MS_ACTIVE; 823 808 } 824 809 825 - root = get_default_root(s, subvol_objectid); 826 - if (IS_ERR(root)) { 827 - error = PTR_ERR(root); 828 - deactivate_locked_super(s); 829 - goto error_free_subvol_name; 830 - } 831 810 /* if they gave us a subvolume name bind mount into that */ 832 811 if (strcmp(subvol_name, ".")) { 833 812 struct dentry *new_root; 813 + 814 + root = get_default_root(s, subvol_rootid); 815 + if (IS_ERR(root)) { 816 + error = PTR_ERR(root); 817 + deactivate_locked_super(s); 818 + goto error_free_subvol_name; 819 + } 820 + 834 821 mutex_lock(&root->d_inode->i_mutex); 835 822 new_root = lookup_one_len(subvol_name, root, 836 823 strlen(subvol_name)); ··· 853 836 } 854 837 dput(root); 855 838 root = new_root; 839 + } else { 840 + root = get_default_root(s, subvol_objectid); 841 + if (IS_ERR(root)) { 842 + error = PTR_ERR(root); 843 + deactivate_locked_super(s); 844 + goto error_free_subvol_name; 845 + } 856 846 } 857 847 858 848 kfree(subvol_name);

+26 -22

fs/btrfs/transaction.c

··· 32 32 33 33 static noinline void put_transaction(struct btrfs_transaction *transaction) 34 34 { 35 - WARN_ON(transaction->use_count == 0); 36 - transaction->use_count--; 37 - if (transaction->use_count == 0) { 38 - list_del_init(&transaction->list); 35 + WARN_ON(atomic_read(&transaction->use_count) == 0); 36 + if (atomic_dec_and_test(&transaction->use_count)) { 39 37 memset(transaction, 0, sizeof(*transaction)); 40 38 kmem_cache_free(btrfs_transaction_cachep, transaction); 41 39 } ··· 58 60 if (!cur_trans) 59 61 return -ENOMEM; 60 62 root->fs_info->generation++; 61 - cur_trans->num_writers = 1; 63 + atomic_set(&cur_trans->num_writers, 1); 62 64 cur_trans->num_joined = 0; 63 65 cur_trans->transid = root->fs_info->generation; 64 66 init_waitqueue_head(&cur_trans->writer_wait); 65 67 init_waitqueue_head(&cur_trans->commit_wait); 66 68 cur_trans->in_commit = 0; 67 69 cur_trans->blocked = 0; 68 - cur_trans->use_count = 1; 70 + atomic_set(&cur_trans->use_count, 1); 69 71 cur_trans->commit_done = 0; 70 72 cur_trans->start_time = get_seconds(); 71 73 ··· 86 88 root->fs_info->running_transaction = cur_trans; 87 89 spin_unlock(&root->fs_info->new_trans_lock); 88 90 } else { 89 - cur_trans->num_writers++; 91 + atomic_inc(&cur_trans->num_writers); 90 92 cur_trans->num_joined++; 91 93 } 92 94 ··· 143 145 cur_trans = root->fs_info->running_transaction; 144 146 if (cur_trans && cur_trans->blocked) { 145 147 DEFINE_WAIT(wait); 146 - cur_trans->use_count++; 148 + atomic_inc(&cur_trans->use_count); 147 149 while (1) { 148 150 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 151 TASK_UNINTERRUPTIBLE); ··· 179 181 { 180 182 struct btrfs_trans_handle *h; 181 183 struct btrfs_transaction *cur_trans; 184 + int retries = 0; 182 185 int ret; 183 186 184 187 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) ··· 203 204 } 204 205 205 206 cur_trans = root->fs_info->running_transaction; 206 - cur_trans->use_count++; 207 + atomic_inc(&cur_trans->use_count); 207 208 if (type != TRANS_JOIN_NOLOCK) 208 209 mutex_unlock(&root->fs_info->trans_mutex); 209 210 ··· 223 224 224 225 if (num_items > 0) { 225 226 ret = btrfs_trans_reserve_metadata(h, root, num_items); 226 - if (ret == -EAGAIN) { 227 + if (ret == -EAGAIN && !retries) { 228 + retries++; 227 229 btrfs_commit_transaction(h, root); 228 230 goto again; 231 + } else if (ret == -EAGAIN) { 232 + /* 233 + * We have already retried and got EAGAIN, so really we 234 + * don't have space, so set ret to -ENOSPC. 235 + */ 236 + ret = -ENOSPC; 229 237 } 238 + 230 239 if (ret < 0) { 231 240 btrfs_end_transaction(h, root); 232 241 return ERR_PTR(ret); ··· 334 327 goto out_unlock; /* nothing committing|committed */ 335 328 } 336 329 337 - cur_trans->use_count++; 330 + atomic_inc(&cur_trans->use_count); 338 331 mutex_unlock(&root->fs_info->trans_mutex); 339 332 340 333 wait_for_commit(root, cur_trans); ··· 464 457 wake_up_process(info->transaction_kthread); 465 458 } 466 459 467 - if (lock) 468 - mutex_lock(&info->trans_mutex); 469 460 WARN_ON(cur_trans != info->running_transaction); 470 - WARN_ON(cur_trans->num_writers < 1); 471 - cur_trans->num_writers--; 461 + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 462 + atomic_dec(&cur_trans->num_writers); 472 463 473 464 smp_mb(); 474 465 if (waitqueue_active(&cur_trans->writer_wait)) 475 466 wake_up(&cur_trans->writer_wait); 476 467 put_transaction(cur_trans); 477 - if (lock) 478 - mutex_unlock(&info->trans_mutex); 479 468 480 469 if (current->journal_info == trans) 481 470 current->journal_info = NULL; ··· 1181 1178 /* take transaction reference */ 1182 1179 mutex_lock(&root->fs_info->trans_mutex); 1183 1180 cur_trans = trans->transaction; 1184 - cur_trans->use_count++; 1181 + atomic_inc(&cur_trans->use_count); 1185 1182 mutex_unlock(&root->fs_info->trans_mutex); 1186 1183 1187 1184 btrfs_end_transaction(trans, root); ··· 1240 1237 1241 1238 mutex_lock(&root->fs_info->trans_mutex); 1242 1239 if (cur_trans->in_commit) { 1243 - cur_trans->use_count++; 1240 + atomic_inc(&cur_trans->use_count); 1244 1241 mutex_unlock(&root->fs_info->trans_mutex); 1245 1242 btrfs_end_transaction(trans, root); 1246 1243 ··· 1262 1259 prev_trans = list_entry(cur_trans->list.prev, 1263 1260 struct btrfs_transaction, list); 1264 1261 if (!prev_trans->commit_done) { 1265 - prev_trans->use_count++; 1262 + atomic_inc(&prev_trans->use_count); 1266 1263 mutex_unlock(&root->fs_info->trans_mutex); 1267 1264 1268 1265 wait_for_commit(root, prev_trans); ··· 1303 1300 TASK_UNINTERRUPTIBLE); 1304 1301 1305 1302 smp_mb(); 1306 - if (cur_trans->num_writers > 1) 1303 + if (atomic_read(&cur_trans->num_writers) > 1) 1307 1304 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1308 1305 else if (should_grow) 1309 1306 schedule_timeout(1); 1310 1307 1311 1308 mutex_lock(&root->fs_info->trans_mutex); 1312 1309 finish_wait(&cur_trans->writer_wait, &wait); 1313 - } while (cur_trans->num_writers > 1 || 1310 + } while (atomic_read(&cur_trans->num_writers) > 1 || 1314 1311 (should_grow && cur_trans->num_joined != joined)); 1315 1312 1316 1313 ret = create_pending_snapshots(trans, root->fs_info); ··· 1397 1394 1398 1395 wake_up(&cur_trans->commit_wait); 1399 1396 1397 + list_del_init(&cur_trans->list); 1400 1398 put_transaction(cur_trans); 1401 1399 put_transaction(cur_trans); 1402 1400

+2 -2

fs/btrfs/transaction.h

··· 27 27 * total writers in this transaction, it must be zero before the 28 28 * transaction can end 29 29 */ 30 - unsigned long num_writers; 30 + atomic_t num_writers; 31 31 32 32 unsigned long num_joined; 33 33 int in_commit; 34 - int use_count; 34 + atomic_t use_count; 35 35 int commit_done; 36 36 int blocked; 37 37 struct list_head list;

+12 -21

fs/btrfs/xattr.c

··· 180 180 struct btrfs_path *path; 181 181 struct extent_buffer *leaf; 182 182 struct btrfs_dir_item *di; 183 - int ret = 0, slot, advance; 183 + int ret = 0, slot; 184 184 size_t total_size = 0, size_left = size; 185 185 unsigned long name_ptr; 186 186 size_t name_len; 187 - u32 nritems; 188 187 189 188 /* 190 189 * ok we want all objects associated with this id. ··· 203 204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 204 205 if (ret < 0) 205 206 goto err; 206 - advance = 0; 207 + 207 208 while (1) { 208 209 leaf = path->nodes[0]; 209 - nritems = btrfs_header_nritems(leaf); 210 210 slot = path->slots[0]; 211 211 212 212 /* this is where we start walking through the path */ 213 - if (advance || slot >= nritems) { 213 + if (slot >= btrfs_header_nritems(leaf)) { 214 214 /* 215 215 * if we've reached the last slot in this leaf we need 216 216 * to go to the next leaf and reset everything 217 217 */ 218 - if (slot >= nritems-1) { 219 - ret = btrfs_next_leaf(root, path); 220 - if (ret) 221 - break; 222 - leaf = path->nodes[0]; 223 - nritems = btrfs_header_nritems(leaf); 224 - slot = path->slots[0]; 225 - } else { 226 - /* 227 - * just walking through the slots on this leaf 228 - */ 229 - slot++; 230 - path->slots[0]++; 231 - } 218 + ret = btrfs_next_leaf(root, path); 219 + if (ret < 0) 220 + goto err; 221 + else if (ret > 0) 222 + break; 223 + continue; 232 224 } 233 - advance = 1; 234 225 235 226 btrfs_item_key_to_cpu(leaf, &found_key, slot); 236 227 ··· 239 250 240 251 /* we are just looking for how big our buffer needs to be */ 241 252 if (!size) 242 - continue; 253 + goto next; 243 254 244 255 if (!buffer || (name_len + 1) > size_left) { 245 256 ret = -ERANGE; ··· 252 263 253 264 size_left -= name_len + 1; 254 265 buffer += name_len + 1; 266 + next: 267 + path->slots[0]++; 255 268 } 256 269 ret = total_size; 257 270

Configure Feed

Configure Feed