Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: break out of shrink_delalloc earlier
btrfs: fix not enough reserved space
btrfs: fix dip leak
Btrfs: make sure not to return overlapping extents to fiemap
Btrfs: deal with short returns from copy_from_user
Btrfs: fix regressions in copy_from_user handling

+135 -62
+9
fs/btrfs/ctree.h
··· 729 729 u64 disk_total; /* total bytes on disk, takes mirrors into 730 730 account */ 731 731 732 + /* 733 + * we bump reservation progress every time we decrement 734 + * bytes_reserved. This way people waiting for reservations 735 + * know something good has happened and they can check 736 + * for progress. The number here isn't to be trusted, it 737 + * just shows reclaim activity 738 + */ 739 + unsigned long reservation_progress; 740 + 732 741 int full; /* indicates that we cannot allocate any more 733 742 chunks for this space */ 734 743 int force_alloc; /* set if we need to force a chunk alloc for
+23 -12
fs/btrfs/extent-tree.c
··· 3342 3342 u64 max_reclaim; 3343 3343 u64 reclaimed = 0; 3344 3344 long time_left; 3345 - int pause = 1; 3346 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 3346 int loops = 0; 3347 + unsigned long progress; 3348 3348 3349 3349 block_rsv = &root->fs_info->delalloc_block_rsv; 3350 3350 space_info = block_rsv->space_info; 3351 3351 3352 3352 smp_mb(); 3353 3353 reserved = space_info->bytes_reserved; 3354 + progress = space_info->reservation_progress; 3354 3355 3355 3356 if (reserved == 0) 3356 3357 return 0; ··· 3366 3365 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3367 3366 3368 3367 spin_lock(&space_info->lock); 3369 - if (reserved > space_info->bytes_reserved) { 3370 - loops = 0; 3368 - if (reserved > space_info->bytes_reserved) 3371 3369 reclaimed += reserved - space_info->bytes_reserved; 3372 - } else { 3373 - loops++; 3374 - } 3375 3370 reserved = space_info->bytes_reserved; 3376 3371 spin_unlock(&space_info->lock); 3372 + 3373 + loops++; 3377 3374 3378 3375 if (reserved == 0 || reclaimed >= max_reclaim) 3379 3376 break; ··· 3379 3380 if (trans && trans->transaction->blocked) 3380 3381 return -EAGAIN; 3381 3382 3382 - __set_current_state(TASK_INTERRUPTIBLE); 3383 - time_left = schedule_timeout(pause); 3383 + time_left = schedule_timeout_interruptible(1); 3384 3384 3385 3385 /* We were interrupted, exit */ 3386 3386 if (time_left) 3387 3387 break; 3388 3388 3389 - pause <<= 1; 3390 - if (pause > HZ / 10) 3391 - pause = HZ / 10; 3389 + /* we've kicked the IO a few times, if anything has been freed, 3390 * exit. There is no sense in looping here for a long time 3391 * when we really need to commit the transaction, or there are 3392 * just too many writers without enough free space 3393 */ 3394 + 3395 + if (loops > 3) { 3396 + smp_mb(); 3397 + if (progress != space_info->reservation_progress) 3398 + break; 3399 + } 3392 3400 3393 3401 } 3394 3402 return reclaimed >= to_reclaim; ··· 3618 3612 if (num_bytes) { 3619 3613 spin_lock(&space_info->lock); 3620 3614 space_info->bytes_reserved -= num_bytes; 3615 + space_info->reservation_progress++; 3621 3616 spin_unlock(&space_info->lock); 3622 3617 } 3623 3618 } ··· 3851 3844 if (block_rsv->reserved >= block_rsv->size) { 3852 3845 num_bytes = block_rsv->reserved - block_rsv->size; 3853 3846 sinfo->bytes_reserved -= num_bytes; 3847 + sinfo->reservation_progress++; 3854 3848 block_rsv->reserved = block_rsv->size; 3855 3849 block_rsv->full = 1; 3856 3850 } ··· 4013 4005 to_reserve = 0; 4014 4006 } 4015 4007 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4016 - 4017 4008 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4018 4009 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4019 4010 if (ret) ··· 4140 4133 btrfs_set_block_group_used(&cache->item, old_val); 4141 4134 cache->reserved -= num_bytes; 4142 4135 cache->space_info->bytes_reserved -= num_bytes; 4136 + cache->space_info->reservation_progress++; 4143 4137 cache->space_info->bytes_used += num_bytes; 4144 4138 cache->space_info->disk_used += num_bytes * factor; 4145 4139 spin_unlock(&cache->lock); ··· 4192 4184 if (reserved) { 4193 4185 cache->reserved -= num_bytes; 4194 4186 cache->space_info->bytes_reserved -= num_bytes; 4187 + cache->space_info->reservation_progress++; 4195 4188 } 4196 4189 spin_unlock(&cache->lock); 4197 4190 spin_unlock(&cache->space_info->lock); ··· 4243 4234 space_info->bytes_readonly += num_bytes; 4244 4235 cache->reserved -= num_bytes; 4245 4236 space_info->bytes_reserved -= num_bytes; 4237 + space_info->reservation_progress++; 4246 4238 } 4247 4239 spin_unlock(&cache->lock); 4248 4240 spin_unlock(&space_info->lock); ··· 4722 4712 if (ret) { 4723 4713 spin_lock(&cache->space_info->lock); 4724 4714 cache->space_info->bytes_reserved -= buf->len; 4715 + cache->space_info->reservation_progress++; 4725 4716 spin_unlock(&cache->space_info->lock); 4726 4717 } 4727 4718 goto out;
+27 -6
fs/btrfs/extent_io.c
··· 3046 3046 } 3047 3047 3048 3048 while (!end) { 3049 - off = extent_map_end(em); 3050 - if (off >= max) 3051 - end = 1; 3049 + u64 offset_in_extent; 3052 3050 3053 - em_start = em->start; 3054 - em_len = em->len; 3051 + /* break if the extent we found is outside the range */ 3052 + if (em->start >= max || extent_map_end(em) < off) 3053 + break; 3054 + 3055 + /* 3056 + * get_extent may return an extent that starts before our 3057 + * requested range. We have to make sure the ranges 3058 + * we return to fiemap always move forward and don't 3059 + * overlap, so adjust the offsets here 3060 + */ 3061 + em_start = max(em->start, off); 3062 + 3063 + /* 3064 + * record the offset from the start of the extent 3065 + * for adjusting the disk offset below 3066 + */ 3067 + offset_in_extent = em_start - em->start; 3055 3068 em_end = extent_map_end(em); 3069 + em_len = em_end - em_start; 3056 3070 emflags = em->flags; 3057 3071 disko = 0; 3058 3072 flags = 0; 3073 + 3074 + /* 3075 + * bump off for our next call to get_extent 3076 + */ 3077 + off = extent_map_end(em); 3078 + if (off >= max) 3079 + end = 1; 3059 3080 3060 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3061 3082 end = 1; ··· 3088 3067 flags |= (FIEMAP_EXTENT_DELALLOC | 3089 3068 FIEMAP_EXTENT_UNKNOWN); 3090 3069 } else { 3091 - disko = em->block_start; 3070 + disko = em->block_start + offset_in_extent; 3092 3071 } 3093 3072 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3094 3073 flags |= FIEMAP_EXTENT_ENCODED;
+72 -42
fs/btrfs/file.c
··· 70 70 71 71 /* Flush processor's dcache for this page */ 72 72 flush_dcache_page(page); 73 + 74 + /* 75 + * if we get a partial write, we can end up with 76 + * partially up to date pages. These add 77 + * a lot of complexity, so make sure they don't 78 + * happen by forcing this copy to be retried. 79 + * 80 + * The rest of the btrfs_file_write code will fall 81 + * back to page at a time copies after we return 0. 82 + */ 83 + if (!PageUptodate(page) && copied < count) 84 + copied = 0; 85 + 73 86 iov_iter_advance(i, copied); 74 87 write_bytes -= copied; 75 88 total_copied += copied; ··· 776 763 } 777 764 778 765 /* 766 + * on error we return an unlocked page and the error value 767 + * on success we return a locked page and 0 768 + */ 769 + static int prepare_uptodate_page(struct page *page, u64 pos) 770 + { 771 + int ret = 0; 772 + 773 + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { 774 + ret = btrfs_readpage(NULL, page); 775 + if (ret) 776 + return ret; 777 + lock_page(page); 778 + if (!PageUptodate(page)) { 779 + unlock_page(page); 780 + return -EIO; 781 + } 782 + } 783 + return 0; 784 + } 785 + 786 + /* 779 787 * this gets pages into the page cache and locks them down, it also properly 780 788 * waits for data=ordered extents to finish before allowing the pages to be 781 789 * modified. 
··· 811 777 unsigned long index = pos >> PAGE_CACHE_SHIFT; 812 778 struct inode *inode = fdentry(file)->d_inode; 813 779 int err = 0; 780 + int faili = 0; 814 781 u64 start_pos; 815 782 u64 last_pos; 816 783 ··· 829 794 for (i = 0; i < num_pages; i++) { 830 795 pages[i] = grab_cache_page(inode->i_mapping, index + i); 831 796 if (!pages[i]) { 832 - int c; 833 - for (c = i - 1; c >= 0; c--) { 834 - unlock_page(pages[c]); 835 - page_cache_release(pages[c]); 836 - } 837 - return -ENOMEM; 797 + faili = i - 1; 798 + err = -ENOMEM; 799 + goto fail; 800 + } 801 + 802 + if (i == 0) 803 + err = prepare_uptodate_page(pages[i], pos); 804 + if (i == num_pages - 1) 805 + err = prepare_uptodate_page(pages[i], 806 + pos + write_bytes); 807 + if (err) { 808 + page_cache_release(pages[i]); 809 + faili = i - 1; 810 + goto fail; 838 811 } 839 812 wait_on_page_writeback(pages[i]); 840 813 } 814 + err = 0; 841 815 if (start_pos < inode->i_size) { 842 816 struct btrfs_ordered_extent *ordered; 843 817 lock_extent_bits(&BTRFS_I(inode)->io_tree, ··· 886 842 WARN_ON(!PageLocked(pages[i])); 887 843 } 888 844 return 0; 845 + fail: 846 + while (faili >= 0) { 847 + unlock_page(pages[faili]); 848 + page_cache_release(pages[faili]); 849 + faili--; 850 + } 851 + return err; 852 + 889 853 } 890 854 891 855 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ··· 903 851 struct file *file = iocb->ki_filp; 904 852 struct inode *inode = fdentry(file)->d_inode; 905 853 struct btrfs_root *root = BTRFS_I(inode)->root; 906 - struct page *pinned[2]; 907 854 struct page **pages = NULL; 908 855 struct iov_iter i; 909 856 loff_t *ppos = &iocb->ki_pos; ··· 922 871 923 872 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 924 873 (file->f_flags & O_DIRECT)); 925 - 926 - pinned[0] = NULL; 927 - pinned[1] = NULL; 928 874 929 875 start_pos = pos; 930 876 ··· 1010 962 first_index = pos >> PAGE_CACHE_SHIFT; 1011 963 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1012 964 1013 - /* 1014 - * there are lots of better ways to do this, but this code 1015 - * makes sure the first and last page in the file range are 1016 - * up to date and ready for cow 1017 - */ 1018 - if ((pos & (PAGE_CACHE_SIZE - 1))) { 1019 - pinned[0] = grab_cache_page(inode->i_mapping, first_index); 1020 - if (!PageUptodate(pinned[0])) { 1021 - ret = btrfs_readpage(NULL, pinned[0]); 1022 - BUG_ON(ret); 1023 - wait_on_page_locked(pinned[0]); 1024 - } else { 1025 - unlock_page(pinned[0]); 1026 - } 1027 - } 1028 - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { 1029 - pinned[1] = grab_cache_page(inode->i_mapping, last_index); 1030 - if (!PageUptodate(pinned[1])) { 1031 - ret = btrfs_readpage(NULL, pinned[1]); 1032 - BUG_ON(ret); 1033 - wait_on_page_locked(pinned[1]); 1034 - } else { 1035 - unlock_page(pinned[1]); 1036 - } 1037 - } 1038 - 1039 965 while (iov_iter_count(&i) > 0) { 1040 966 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1041 967 size_t write_bytes = min(iov_iter_count(&i), ··· 1046 1024 1047 1025 copied = btrfs_copy_from_user(pos, num_pages, 1048 1026 write_bytes, pages, &i); 1049 - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> 1050 - PAGE_CACHE_SHIFT; 1027 + 1028 + /* 1029 + * if we have trouble faulting in the pages, fall 1030 + * back to one page at a time 1031 + */ 1032 + if (copied < write_bytes) 1033 + nrptrs = 1; 1034 + 1035 + if (copied == 0) 1036 + dirty_pages = 0; 1037 + else 1038 + dirty_pages = (copied + offset + 1039 + PAGE_CACHE_SIZE - 1) >> 1040 + PAGE_CACHE_SHIFT; 1051 1041 1052 1042 if (num_pages > dirty_pages) { 1053 1043 if (copied > 0) ··· 1103 1069 err = ret; 1104 1070 1105 1071 kfree(pages); 1106 - if (pinned[0]) 1107 - page_cache_release(pinned[0]); 1108 - if (pinned[1]) 1109 - page_cache_release(pinned[1]); 1110 1072 *ppos = pos; 1111 1073 1112 1074 /*
+4 -2
fs/btrfs/inode.c
··· 4821 4821 goto fail; 4822 4822 4823 4823 /* 4824 - * 1 item for inode ref 4824 + * 2 items for inode and inode ref 4825 4825 * 2 items for dir items 4826 + * 1 item for parent inode 4826 4827 */ 4827 - trans = btrfs_start_transaction(root, 3); 4828 + trans = btrfs_start_transaction(root, 5); 4828 4829 if (IS_ERR(trans)) { 4829 4830 err = PTR_ERR(trans); 4830 4831 goto fail; ··· 6057 6056 if (!skip_sum) { 6058 6057 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6059 6058 if (!dip->csums) { 6059 + kfree(dip); 6060 6060 ret = -ENOMEM; 6061 6061 goto free_ordered; 6062 6062 }