Merge tag 'for-f2fs-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs

+12

Documentation/ABI/testing/sysfs-fs-f2fs

··· 55 55 Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> 56 56 Description: 57 57 Controls the number of trials to find a victim segment. 58 + 59 + What: /sys/fs/f2fs/<disk>/dir_level 60 + Date: March 2014 61 + Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> 62 + Description: 63 + Controls the directory level for large directory. 64 + 65 + What: /sys/fs/f2fs/<disk>/ram_thresh 66 + Date: March 2014 67 + Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> 68 + Description: 69 + Controls the memory footprint used by f2fs.

+24 -5

Documentation/filesystems/f2fs.txt

··· 122 122 inline_xattr Enable the inline xattrs feature. 123 123 inline_data Enable the inline data feature: New created small(<~3.4k) 124 124 files can be written into inode block. 125 + flush_merge Merge concurrent cache_flush commands as much as possible 126 + to eliminate redundant command issues. If the underlying 127 + device handles the cache_flush command relatively slowly, 128 + recommend to enable this option. 125 129 126 130 ================================================================================ 127 131 DEBUGFS ENTRIES ··· 173 169 174 170 reclaim_segments This parameter controls the number of prefree 175 171 segments to be reclaimed. If the number of prefree 176 - segments is larger than this number, f2fs tries to 177 - conduct checkpoint to reclaim the prefree segments 178 - to free segments. By default, 100 segments, 200MB. 172 + segments is larger than the number of segments 173 + in the proportion to the percentage over total 174 + volume size, f2fs tries to conduct checkpoint to 175 + reclaim the prefree segments to free segments. 176 + By default, 5% over total # of segments. 179 177 180 178 max_small_discards This parameter controls the number of discard 181 179 commands that consist small blocks less than 2MB. ··· 200 194 find a victim segment when conducting SSR and 201 195 cleaning operations. The default value is 4096 202 196 which covers 8GB block address range. 197 + 198 + dir_level This parameter controls the directory level to 199 + support large directory. If a directory has a 200 + number of files, it can reduce the file lookup 201 + latency by increasing this dir_level value. 202 + Otherwise, it needs to decrease this value to 203 + reduce the space overhead. The default value is 0. 204 + 205 + ram_thresh This parameter controls the memory footprint used 206 + by free nids and cached nat entries. By default, 207 + 10 is set, which indicates 10 MB / 1 GB RAM. 203 208 204 209 ================================================================================ 205 210 USAGE ··· 461 444 # of blocks in level #n = | 462 445 `- 4, Otherwise 463 446 464 - ,- 2^n, if n < MAX_DIR_HASH_DEPTH / 2, 447 + ,- 2^ (n + dir_level), 448 + | if n < MAX_DIR_HASH_DEPTH / 2, 465 449 # of buckets in level #n = | 466 - `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), Otherwise 450 + `- 2^((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1), 451 + Otherwise 467 452 468 453 When F2FS finds a file name in a directory, at first a hash value of the file 469 454 name is calculated. Then, F2FS scans the hash table in level #0 to find the

+7 -1

fs/f2fs/acl.c

··· 174 174 175 175 retval = f2fs_getxattr(inode, name_index, "", NULL, 0); 176 176 if (retval > 0) { 177 - value = kmalloc(retval, GFP_KERNEL); 177 + value = kmalloc(retval, GFP_F2FS_ZERO); 178 178 if (!value) 179 179 return ERR_PTR(-ENOMEM); 180 180 retval = f2fs_getxattr(inode, name_index, "", value, retval); ··· 202 202 void *value = NULL; 203 203 size_t size = 0; 204 204 int error; 205 + 206 + if (acl) { 207 + error = posix_acl_valid(acl); 208 + if (error < 0) 209 + return error; 210 + } 205 211 206 212 switch (type) { 207 213 case ACL_TYPE_ACCESS:

+150 -58

fs/f2fs/checkpoint.c

··· 33 33 struct address_space *mapping = META_MAPPING(sbi); 34 34 struct page *page = NULL; 35 35 repeat: 36 - page = grab_cache_page(mapping, index); 36 + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); 37 37 if (!page) { 38 38 cond_resched(); 39 39 goto repeat; 40 40 } 41 41 42 - /* We wait writeback only inside grab_meta_page() */ 43 - wait_on_page_writeback(page); 44 42 SetPageUptodate(page); 45 43 return page; 46 44 } ··· 73 75 return page; 74 76 } 75 77 78 + inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) 79 + { 80 + switch (type) { 81 + case META_NAT: 82 + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; 83 + case META_SIT: 84 + return SIT_BLK_CNT(sbi); 85 + case META_SSA: 86 + case META_CP: 87 + return 0; 88 + default: 89 + BUG(); 90 + } 91 + } 92 + 93 + /* 94 + * Readahead CP/NAT/SIT/SSA pages 95 + */ 96 + int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) 97 + { 98 + block_t prev_blk_addr = 0; 99 + struct page *page; 100 + int blkno = start; 101 + int max_blks = get_max_meta_blks(sbi, type); 102 + 103 + struct f2fs_io_info fio = { 104 + .type = META, 105 + .rw = READ_SYNC | REQ_META | REQ_PRIO 106 + }; 107 + 108 + for (; nrpages-- > 0; blkno++) { 109 + block_t blk_addr; 110 + 111 + switch (type) { 112 + case META_NAT: 113 + /* get nat block addr */ 114 + if (unlikely(blkno >= max_blks)) 115 + blkno = 0; 116 + blk_addr = current_nat_addr(sbi, 117 + blkno * NAT_ENTRY_PER_BLOCK); 118 + break; 119 + case META_SIT: 120 + /* get sit block addr */ 121 + if (unlikely(blkno >= max_blks)) 122 + goto out; 123 + blk_addr = current_sit_addr(sbi, 124 + blkno * SIT_ENTRY_PER_BLOCK); 125 + if (blkno != start && prev_blk_addr + 1 != blk_addr) 126 + goto out; 127 + prev_blk_addr = blk_addr; 128 + break; 129 + case META_SSA: 130 + case META_CP: 131 + /* get ssa/cp block addr */ 132 + blk_addr = blkno; 133 + break; 134 + default: 135 + BUG(); 136 + } 137 + 138 + page = grab_cache_page(META_MAPPING(sbi), blk_addr); 139 + if (!page) 140 + continue; 141 + if (PageUptodate(page)) { 142 + mark_page_accessed(page); 143 + f2fs_put_page(page, 1); 144 + continue; 145 + } 146 + 147 + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); 148 + mark_page_accessed(page); 149 + f2fs_put_page(page, 0); 150 + } 151 + out: 152 + f2fs_submit_merged_bio(sbi, META, READ); 153 + return blkno - start; 154 + } 155 + 76 156 static int f2fs_write_meta_page(struct page *page, 77 157 struct writeback_control *wbc) 78 158 { 79 159 struct inode *inode = page->mapping->host; 80 160 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 81 161 82 - /* Should not write any meta pages, if any IO error was occurred */ 83 - if (unlikely(sbi->por_doing || 84 - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 162 + if (unlikely(sbi->por_doing)) 85 163 goto redirty_out; 86 - 87 164 if (wbc->for_reclaim) 88 165 goto redirty_out; 89 166 90 - wait_on_page_writeback(page); 167 + /* Should not write any meta pages, if any IO error was occurred */ 168 + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 169 + goto no_write; 91 170 171 + f2fs_wait_on_page_writeback(page, META); 92 172 write_meta_page(sbi, page); 173 + no_write: 93 174 dec_page_count(sbi, F2FS_DIRTY_META); 94 175 unlock_page(page); 95 176 return 0; ··· 176 99 redirty_out: 177 100 dec_page_count(sbi, F2FS_DIRTY_META); 178 101 wbc->pages_skipped++; 102 + account_page_redirty(page); 179 103 set_page_dirty(page); 180 104 return AOP_WRITEPAGE_ACTIVATE; 181 105 } ··· 185 107 struct writeback_control *wbc) 186 108 { 187 109 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 188 - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 189 - long written; 190 - 191 - if (wbc->for_kupdate) 192 - return 0; 110 + long diff, written; 193 111 194 112 /* collect a number of dirty meta pages and write together */ 195 - if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) 196 - return 0; 113 + if (wbc->for_kupdate || 114 + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) 115 + goto skip_write; 197 116 198 117 /* if mounting is failed, skip writing node pages */ 199 118 mutex_lock(&sbi->cp_mutex); 200 - written = sync_meta_pages(sbi, META, nrpages); 119 + diff = nr_pages_to_write(sbi, META, wbc); 120 + written = sync_meta_pages(sbi, META, wbc->nr_to_write); 201 121 mutex_unlock(&sbi->cp_mutex); 202 - wbc->nr_to_write -= written; 122 + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); 123 + return 0; 124 + 125 + skip_write: 126 + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); 203 127 return 0; 204 128 } 205 129 ··· 228 148 229 149 for (i = 0; i < nr_pages; i++) { 230 150 struct page *page = pvec.pages[i]; 151 + 231 152 lock_page(page); 232 - f2fs_bug_on(page->mapping != mapping); 233 - f2fs_bug_on(!PageDirty(page)); 234 - clear_page_dirty_for_io(page); 153 + 154 + if (unlikely(page->mapping != mapping)) { 155 + continue_unlock: 156 + unlock_page(page); 157 + continue; 158 + } 159 + if (!PageDirty(page)) { 160 + /* someone wrote it for us */ 161 + goto continue_unlock; 162 + } 163 + 164 + if (!clear_page_dirty_for_io(page)) 165 + goto continue_unlock; 166 + 235 167 if (f2fs_write_meta_page(page, &wbc)) { 236 168 unlock_page(page); 237 169 break; ··· 308 216 309 217 void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 310 218 { 311 - struct list_head *head, *this; 312 - struct orphan_inode_entry *new = NULL, *orphan = NULL; 219 + struct list_head *head; 220 + struct orphan_inode_entry *new, *orphan; 313 221 314 222 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); 315 223 new->ino = ino; 316 224 317 225 spin_lock(&sbi->orphan_inode_lock); 318 226 head = &sbi->orphan_inode_list; 319 - list_for_each(this, head) { 320 - orphan = list_entry(this, struct orphan_inode_entry, list); 227 + list_for_each_entry(orphan, head, list) { 321 228 if (orphan->ino == ino) { 322 229 spin_unlock(&sbi->orphan_inode_lock); 323 230 kmem_cache_free(orphan_entry_slab, new); ··· 325 234 326 235 if (orphan->ino > ino) 327 236 break; 328 - orphan = NULL; 329 237 } 330 238 331 - /* add new_oentry into list which is sorted by inode number */ 332 - if (orphan) 333 - list_add(&new->list, this->prev); 334 - else 335 - list_add_tail(&new->list, head); 239 + /* add new orphan entry into list which is sorted by inode number */ 240 + list_add_tail(&new->list, &orphan->list); 336 241 spin_unlock(&sbi->orphan_inode_lock); 337 242 } 338 243 ··· 342 255 list_for_each_entry(orphan, head, list) { 343 256 if (orphan->ino == ino) { 344 257 list_del(&orphan->list); 345 - kmem_cache_free(orphan_entry_slab, orphan); 346 258 f2fs_bug_on(sbi->n_orphans == 0); 347 259 sbi->n_orphans--; 348 - break; 260 + spin_unlock(&sbi->orphan_inode_lock); 261 + kmem_cache_free(orphan_entry_slab, orphan); 262 + return; 349 263 } 350 264 } 351 265 spin_unlock(&sbi->orphan_inode_lock); ··· 372 284 sbi->por_doing = true; 373 285 start_blk = __start_cp_addr(sbi) + 1; 374 286 orphan_blkaddr = __start_sum_addr(sbi) - 1; 287 + 288 + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); 375 289 376 290 for (i = 0; i < orphan_blkaddr; i++) { 377 291 struct page *page = get_meta_page(sbi, start_blk + i); ··· 556 466 { 557 467 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 558 468 struct list_head *head = &sbi->dir_inode_list; 559 - struct list_head *this; 469 + struct dir_inode_entry *entry; 560 470 561 - list_for_each(this, head) { 562 - struct dir_inode_entry *entry; 563 - entry = list_entry(this, struct dir_inode_entry, list); 471 + list_for_each_entry(entry, head, list) 564 472 if (unlikely(entry->inode == inode)) 565 473 return -EEXIST; 566 - } 474 + 567 475 list_add_tail(&new->list, head); 568 476 stat_inc_dirty_dir(sbi); 569 477 return 0; ··· 571 483 { 572 484 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 573 485 struct dir_inode_entry *new; 486 + int ret = 0; 574 487 575 488 if (!S_ISDIR(inode->i_mode)) 576 489 return; ··· 581 492 INIT_LIST_HEAD(&new->list); 582 493 583 494 spin_lock(&sbi->dir_inode_lock); 584 - if (__add_dirty_inode(inode, new)) 585 - kmem_cache_free(inode_entry_slab, new); 586 - 587 - inc_page_count(sbi, F2FS_DIRTY_DENTS); 495 + ret = __add_dirty_inode(inode, new); 588 496 inode_inc_dirty_dents(inode); 589 497 SetPagePrivate(page); 590 498 spin_unlock(&sbi->dir_inode_lock); 499 + 500 + if (ret) 501 + kmem_cache_free(inode_entry_slab, new); 591 502 } 592 503 593 504 void add_dirty_dir_inode(struct inode *inode) ··· 595 506 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 596 507 struct dir_inode_entry *new = 597 508 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 509 + int ret = 0; 598 510 599 511 new->inode = inode; 600 512 INIT_LIST_HEAD(&new->list); 601 513 602 514 spin_lock(&sbi->dir_inode_lock); 603 - if (__add_dirty_inode(inode, new)) 604 - kmem_cache_free(inode_entry_slab, new); 515 + ret = __add_dirty_inode(inode, new); 605 516 spin_unlock(&sbi->dir_inode_lock); 517 + 518 + if (ret) 519 + kmem_cache_free(inode_entry_slab, new); 606 520 } 607 521 608 522 void remove_dirty_dir_inode(struct inode *inode) 609 523 { 610 524 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 611 - 612 - struct list_head *this, *head; 525 + struct list_head *head; 526 + struct dir_inode_entry *entry; 613 527 614 528 if (!S_ISDIR(inode->i_mode)) 615 529 return; 616 530 617 531 spin_lock(&sbi->dir_inode_lock); 618 - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { 532 + if (get_dirty_dents(inode)) { 619 533 spin_unlock(&sbi->dir_inode_lock); 620 534 return; 621 535 } 622 536 623 537 head = &sbi->dir_inode_list; 624 - list_for_each(this, head) { 625 - struct dir_inode_entry *entry; 626 - entry = list_entry(this, struct dir_inode_entry, list); 538 + list_for_each_entry(entry, head, list) { 627 539 if (entry->inode == inode) { 628 540 list_del(&entry->list); 629 - kmem_cache_free(inode_entry_slab, entry); 630 541 stat_dec_dirty_dir(sbi); 631 - break; 542 + spin_unlock(&sbi->dir_inode_lock); 543 + kmem_cache_free(inode_entry_slab, entry); 544 + goto done; 632 545 } 633 546 } 634 547 spin_unlock(&sbi->dir_inode_lock); 635 548 549 + done: 636 550 /* Only from the recovery routine */ 637 551 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { 638 552 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); ··· 646 554 struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) 647 555 { 648 556 649 - struct list_head *this, *head; 557 + struct list_head *head; 650 558 struct inode *inode = NULL; 559 + struct dir_inode_entry *entry; 651 560 652 561 spin_lock(&sbi->dir_inode_lock); 653 562 654 563 head = &sbi->dir_inode_list; 655 - list_for_each(this, head) { 656 - struct dir_inode_entry *entry; 657 - entry = list_entry(this, struct dir_inode_entry, list); 564 + list_for_each_entry(entry, head, list) { 658 565 if (entry->inode->i_ino == ino) { 659 566 inode = entry->inode; 660 567 break; ··· 680 589 inode = igrab(entry->inode); 681 590 spin_unlock(&sbi->dir_inode_lock); 682 591 if (inode) { 683 - filemap_flush(inode->i_mapping); 592 + filemap_fdatawrite(inode->i_mapping); 684 593 iput(inode); 685 594 } else { 686 595 /* ··· 915 824 unblock_operations(sbi); 916 825 mutex_unlock(&sbi->cp_mutex); 917 826 827 + stat_inc_cp_count(sbi->stat_info); 918 828 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 919 829 } 920 830 ··· 937 845 int __init create_checkpoint_caches(void) 938 846 { 939 847 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 940 - sizeof(struct orphan_inode_entry), NULL); 848 + sizeof(struct orphan_inode_entry)); 941 849 if (!orphan_entry_slab) 942 850 return -ENOMEM; 943 851 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 944 - sizeof(struct dir_inode_entry), NULL); 852 + sizeof(struct dir_inode_entry)); 945 853 if (!inode_entry_slab) { 946 854 kmem_cache_destroy(orphan_entry_slab); 947 855 return -ENOMEM;

+52 -56

fs/f2fs/data.c

··· 45 45 46 46 static void f2fs_write_end_io(struct bio *bio, int err) 47 47 { 48 - struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); 48 + struct f2fs_sb_info *sbi = bio->bi_private; 49 49 struct bio_vec *bvec; 50 50 int i; 51 51 ··· 55 55 if (unlikely(err)) { 56 56 SetPageError(page); 57 57 set_bit(AS_EIO, &page->mapping->flags); 58 - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 59 - sbi->sb->s_flags |= MS_RDONLY; 58 + f2fs_stop_checkpoint(sbi); 60 59 } 61 60 end_page_writeback(page); 62 61 dec_page_count(sbi, F2FS_WRITEBACK); 63 62 } 64 63 65 - if (bio->bi_private) 66 - complete(bio->bi_private); 64 + if (sbi->wait_io) { 65 + complete(sbi->wait_io); 66 + sbi->wait_io = NULL; 67 + } 67 68 68 69 if (!get_pages(sbi, F2FS_WRITEBACK) && 69 70 !list_empty(&sbi->cp_wait.task_list)) ··· 87 86 bio->bi_bdev = sbi->sb->s_bdev; 88 87 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 89 88 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 + bio->bi_private = sbi; 90 90 91 91 return bio; 92 92 } ··· 115 113 */ 116 114 if (fio->type == META_FLUSH) { 117 115 DECLARE_COMPLETION_ONSTACK(wait); 118 - io->bio->bi_private = &wait; 116 + io->sbi->wait_io = &wait; 119 117 submit_bio(rw, io->bio); 120 118 wait_for_completion(&wait); 121 119 } else { ··· 134 132 135 133 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; 136 134 137 - mutex_lock(&io->io_mutex); 135 + down_write(&io->io_rwsem); 138 136 139 137 /* change META to META_FLUSH in the checkpoint procedure */ 140 138 if (type >= META_FLUSH) { ··· 142 140 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 143 141 } 144 142 __submit_merged_bio(io); 145 - mutex_unlock(&io->io_mutex); 143 + up_write(&io->io_rwsem); 146 144 } 147 145 148 146 /* ··· 180 178 181 179 verify_block_addr(sbi, blk_addr); 182 180 183 - mutex_lock(&io->io_mutex); 181 + down_write(&io->io_rwsem); 184 182 185 183 if (!is_read) 186 184 inc_page_count(sbi, F2FS_WRITEBACK); ··· 204 202 205 203 io->last_block_in_bio = blk_addr; 206 204 207 - mutex_unlock(&io->io_mutex); 205 + up_write(&io->io_rwsem); 208 206 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); 209 207 } 210 208 ··· 799 797 */ 800 798 offset = i_size & (PAGE_CACHE_SIZE - 1); 801 799 if ((page->index >= end_index + 1) || !offset) { 802 - if (S_ISDIR(inode->i_mode)) { 803 - dec_page_count(sbi, F2FS_DIRTY_DENTS); 804 - inode_dec_dirty_dents(inode); 805 - } 800 + inode_dec_dirty_dents(inode); 806 801 goto out; 807 802 } 808 803 809 804 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 810 805 write: 811 - if (unlikely(sbi->por_doing)) { 812 - err = AOP_WRITEPAGE_ACTIVATE; 806 + if (unlikely(sbi->por_doing)) 813 807 goto redirty_out; 814 - } 815 808 816 809 /* Dentry blocks are controlled by checkpoint */ 817 810 if (S_ISDIR(inode->i_mode)) { 818 - dec_page_count(sbi, F2FS_DIRTY_DENTS); 819 811 inode_dec_dirty_dents(inode); 820 812 err = do_write_data_page(page, &fio); 821 - } else { 822 - f2fs_lock_op(sbi); 823 - 824 - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { 825 - err = f2fs_write_inline_data(inode, page, offset); 826 - f2fs_unlock_op(sbi); 827 - goto out; 828 - } else { 829 - err = do_write_data_page(page, &fio); 830 - } 831 - 832 - f2fs_unlock_op(sbi); 833 - need_balance_fs = true; 813 + goto done; 834 814 } 835 - if (err == -ENOENT) 836 - goto out; 837 - else if (err) 815 + 816 + if (!wbc->for_reclaim) 817 + need_balance_fs = true; 818 + else if (has_not_enough_free_secs(sbi, 0)) 838 819 goto redirty_out; 839 820 840 - if (wbc->for_reclaim) { 841 - f2fs_submit_merged_bio(sbi, DATA, WRITE); 842 - need_balance_fs = false; 843 - } 821 + f2fs_lock_op(sbi); 822 + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) 823 + err = f2fs_write_inline_data(inode, page, offset); 824 + else 825 + err = do_write_data_page(page, &fio); 826 + f2fs_unlock_op(sbi); 827 + done: 828 + if (err && err != -ENOENT) 829 + goto redirty_out; 844 830 845 831 clear_cold_data(page); 846 832 out: ··· 839 849 840 850 redirty_out: 841 851 wbc->pages_skipped++; 852 + account_page_redirty(page); 842 853 set_page_dirty(page); 843 - return err; 854 + return AOP_WRITEPAGE_ACTIVATE; 844 855 } 845 - 846 - #define MAX_DESIRED_PAGES_WP 4096 847 856 848 857 static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, 849 858 void *data) ··· 860 871 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 861 872 bool locked = false; 862 873 int ret; 863 - long excess_nrtw = 0, desired_nrtw; 874 + long diff; 864 875 865 876 /* deal with chardevs and other special file */ 866 877 if (!mapping->a_ops->writepage) 867 878 return 0; 868 879 869 - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { 870 - desired_nrtw = MAX_DESIRED_PAGES_WP; 871 - excess_nrtw = desired_nrtw - wbc->nr_to_write; 872 - wbc->nr_to_write = desired_nrtw; 873 - } 880 + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && 881 + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) 882 + goto skip_write; 883 + 884 + diff = nr_pages_to_write(sbi, DATA, wbc); 874 885 875 886 if (!S_ISDIR(inode->i_mode)) { 876 887 mutex_lock(&sbi->writepages); ··· 884 895 885 896 remove_dirty_dir_inode(inode); 886 897 887 - wbc->nr_to_write -= excess_nrtw; 898 + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); 888 899 return ret; 900 + 901 + skip_write: 902 + wbc->pages_skipped += get_dirty_dents(inode); 903 + return 0; 889 904 } 890 905 891 906 static int f2fs_write_begin(struct file *file, struct address_space *mapping, ··· 942 949 if (dn.data_blkaddr == NEW_ADDR) { 943 950 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 944 951 } else { 945 - if (f2fs_has_inline_data(inode)) 952 + if (f2fs_has_inline_data(inode)) { 946 953 err = f2fs_read_inline_data(inode, page); 947 - else 954 + if (err) { 955 + page_cache_release(page); 956 + return err; 957 + } 958 + } else { 948 959 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 949 960 READ_SYNC); 950 - if (err) 951 - return err; 961 + if (err) 962 + return err; 963 + } 964 + 952 965 lock_page(page); 953 966 if (unlikely(!PageUptodate(page))) { 954 967 f2fs_put_page(page, 1); ··· 1030 1031 unsigned int length) 1031 1032 { 1032 1033 struct inode *inode = page->mapping->host; 1033 - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1034 - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { 1035 - dec_page_count(sbi, F2FS_DIRTY_DENTS); 1034 + if (PageDirty(page)) 1036 1035 inode_dec_dirty_dents(inode); 1037 - } 1038 1036 ClearPagePrivate(page); 1039 1037 } 1040 1038

+5 -7

fs/f2fs/debug.c

··· 86 86 { 87 87 struct f2fs_stat_info *si = F2FS_STAT(sbi); 88 88 unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; 89 - struct sit_info *sit_i = SIT_I(sbi); 90 89 unsigned int segno, vblocks; 91 90 int ndirty = 0; 92 91 ··· 93 94 total_vblocks = 0; 94 95 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); 95 96 hblks_per_sec = blks_per_sec / 2; 96 - mutex_lock(&sit_i->sentry_lock); 97 97 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 98 98 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 99 99 dist = abs(vblocks - hblks_per_sec); ··· 103 105 ndirty++; 104 106 } 105 107 } 106 - mutex_unlock(&sit_i->sentry_lock); 107 108 dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; 108 109 si->bimodal = bimodal / dist; 109 110 if (si->dirty_count) ··· 233 236 si->dirty_count); 234 237 seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", 235 238 si->prefree_count, si->free_segs, si->free_secs); 239 + seq_printf(s, "CP calls: %d\n", si->cp_count); 236 240 seq_printf(s, "GC calls: %d (BG: %d)\n", 237 241 si->call_count, si->bg_gc); 238 242 seq_printf(s, " - data segments : %d\n", si->data_segs); ··· 250 252 si->ndirty_dent, si->ndirty_dirs); 251 253 seq_printf(s, " - meta: %4d in %4d\n", 252 254 si->ndirty_meta, si->meta_pages); 253 - seq_printf(s, " - NATs: %5d > %lu\n", 254 - si->nats, NM_WOUT_THRESHOLD); 255 - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 256 - si->sits, si->fnids); 255 + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", 256 + si->nats, si->sits); 257 + seq_printf(s, " - free_nids: %9d\n", 258 + si->fnids); 257 259 seq_puts(s, "\nDistribution of User Blocks:"); 258 260 seq_puts(s, " [ valid | invalid | free ]\n"); 259 261 seq_puts(s, " [");

+49 -36

fs/f2fs/dir.c

··· 21 21 >> PAGE_CACHE_SHIFT; 22 22 } 23 23 24 - static unsigned int dir_buckets(unsigned int level) 24 + static unsigned int dir_buckets(unsigned int level, int dir_level) 25 25 { 26 26 if (level < MAX_DIR_HASH_DEPTH / 2) 27 - return 1 << level; 27 + return 1 << (level + dir_level); 28 28 else 29 - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); 29 + return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); 30 30 } 31 31 32 32 static unsigned int bucket_blocks(unsigned int level) ··· 65 65 de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; 66 66 } 67 67 68 - static unsigned long dir_block_index(unsigned int level, unsigned int idx) 68 + static unsigned long dir_block_index(unsigned int level, 69 + int dir_level, unsigned int idx) 69 70 { 70 71 unsigned long i; 71 72 unsigned long bidx = 0; 72 73 73 74 for (i = 0; i < level; i++) 74 - bidx += dir_buckets(i) * bucket_blocks(i); 75 + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); 75 76 bidx += idx * bucket_blocks(level); 76 77 return bidx; 77 78 } ··· 94 93 f2fs_hash_t namehash, struct page **res_page) 95 94 { 96 95 struct f2fs_dir_entry *de; 97 - unsigned long bit_pos, end_pos, next_pos; 96 + unsigned long bit_pos = 0; 98 97 struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); 99 - int slots; 98 + const void *dentry_bits = &dentry_blk->dentry_bitmap; 99 + int max_len = 0; 100 100 101 - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 102 - NR_DENTRY_IN_BLOCK, 0); 103 101 while (bit_pos < NR_DENTRY_IN_BLOCK) { 102 + if (!test_bit_le(bit_pos, dentry_bits)) { 103 + if (bit_pos == 0) 104 + max_len = 1; 105 + else if (!test_bit_le(bit_pos - 1, dentry_bits)) 106 + max_len++; 107 + bit_pos++; 108 + continue; 109 + } 104 110 de = &dentry_blk->dentry[bit_pos]; 105 - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 106 - 107 111 if (early_match_name(name, namelen, namehash, de)) { 108 112 if (!memcmp(dentry_blk->filename[bit_pos], 109 113 name, namelen)) { ··· 116 110 goto found; 117 111 } 118 112 } 119 - next_pos = bit_pos + slots; 120 - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 121 - NR_DENTRY_IN_BLOCK, next_pos); 122 - if (bit_pos >= NR_DENTRY_IN_BLOCK) 123 - end_pos = NR_DENTRY_IN_BLOCK; 124 - else 125 - end_pos = bit_pos; 126 - if (*max_slots < end_pos - next_pos) 127 - *max_slots = end_pos - next_pos; 113 + if (max_len > *max_slots) { 114 + *max_slots = max_len; 115 + max_len = 0; 116 + } 117 + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 128 118 } 129 119 130 120 de = NULL; 131 121 kunmap(dentry_page); 132 122 found: 123 + if (max_len > *max_slots) 124 + *max_slots = max_len; 133 125 return de; 134 126 } 135 127 ··· 145 141 146 142 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); 147 143 148 - nbucket = dir_buckets(level); 144 + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); 149 145 nblock = bucket_blocks(level); 150 146 151 - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); 147 + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, 148 + le32_to_cpu(namehash) % nbucket); 152 149 end_block = bidx + nblock; 153 150 154 151 for (; bidx < end_block; bidx++) { ··· 253 248 struct page *page, struct inode *inode) 254 249 { 255 250 lock_page(page); 256 - wait_on_page_writeback(page); 251 + f2fs_wait_on_page_writeback(page, DATA); 257 252 de->ino = cpu_to_le32(inode->i_ino); 258 253 set_de_type(de, inode); 259 254 kunmap(page); ··· 352 347 err = f2fs_init_security(inode, dir, name, page); 353 348 if (err) 354 349 goto put_error; 355 - 356 - wait_on_page_writeback(page); 357 350 } else { 358 351 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 359 352 if (IS_ERR(page)) 360 353 return page; 361 354 362 - wait_on_page_writeback(page); 363 355 set_cold_node(inode, page); 364 356 } 365 357 ··· 374 372 375 373 put_error: 376 374 f2fs_put_page(page, 1); 375 + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ 376 + truncate_inode_pages(&inode->i_data, 0); 377 + truncate_blocks(inode, 0); 378 + remove_dirty_dir_inode(inode); 377 379 error: 378 380 remove_inode_page(inode); 379 381 return ERR_PTR(err); ··· 400 394 F2FS_I(dir)->i_current_depth = current_depth; 401 395 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 402 396 } 403 - 404 - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) 405 - update_inode_page(dir); 406 397 407 398 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) 408 399 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); ··· 467 464 if (level == current_depth) 468 465 ++current_depth; 469 466 470 - nbucket = dir_buckets(level); 467 + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); 471 468 nblock = bucket_blocks(level); 472 469 473 - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); 470 + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, 471 + (le32_to_cpu(dentry_hash) % nbucket)); 474 472 475 473 for (block = bidx; block <= (bidx + nblock - 1); block++) { 476 474 dentry_page = get_new_data_page(dir, NULL, block, true); ··· 491 487 ++level; 492 488 goto start; 493 489 add_dentry: 494 - wait_on_page_writeback(dentry_page); 490 + f2fs_wait_on_page_writeback(dentry_page, DATA); 495 491 492 + down_write(&F2FS_I(inode)->i_sem); 496 493 page = init_inode_metadata(inode, dir, name); 497 494 if (IS_ERR(page)) { 498 495 err = PTR_ERR(page); ··· 516 511 517 512 update_parent_metadata(dir, inode, current_depth); 518 513 fail: 519 - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 514 + up_write(&F2FS_I(inode)->i_sem); 515 + 516 + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { 517 + update_inode_page(dir); 518 + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 519 + } 520 520 kunmap(dentry_page); 521 521 f2fs_put_page(dentry_page, 1); 522 522 return err; ··· 538 528 unsigned int bit_pos; 539 529 struct address_space *mapping = page->mapping; 540 530 struct inode *dir = mapping->host; 541 - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 542 531 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); 543 532 void *kaddr = page_address(page); 544 533 int i; 545 534 546 535 lock_page(page); 547 - wait_on_page_writeback(page); 536 + f2fs_wait_on_page_writeback(page, DATA); 548 537 549 538 dentry_blk = (struct f2fs_dentry_block *)kaddr; 550 539 bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; ··· 560 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 561 552 562 553 if (inode) { 554 + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 555 + 556 + down_write(&F2FS_I(inode)->i_sem); 557 + 563 558 if (S_ISDIR(inode->i_mode)) { 564 559 drop_nlink(dir); 565 560 update_inode_page(dir); ··· 574 561 drop_nlink(inode); 575 562 i_size_write(inode, 0); 576 563 } 564 + up_write(&F2FS_I(inode)->i_sem); 577 565 update_inode_page(inode); 578 566 579 567 if (inode->i_nlink == 0) ··· 587 573 truncate_hole(dir, page->index, page->index + 1); 588 574 clear_page_dirty_for_io(page); 589 575 ClearPageUptodate(page); 590 - dec_page_count(sbi, F2FS_DIRTY_DENTS); 591 576 inode_dec_dirty_dents(dir); 592 577 } 593 578 f2fs_put_page(page, 1);

+78 -27

fs/f2fs/f2fs.h

··· 40 40 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 41 41 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 42 42 #define F2FS_MOUNT_INLINE_DATA 0x00000100 43 + #define F2FS_MOUNT_FLUSH_MERGE 0x00000200 43 44 44 45 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 45 46 #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) ··· 87 86 enum { 88 87 NAT_BITMAP, 89 88 SIT_BITMAP 89 + }; 90 + 91 + /* 92 + * For CP/NAT/SIT/SSA readahead 93 + */ 94 + enum { 95 + META_CP, 96 + META_NAT, 97 + META_SIT, 98 + META_SSA 90 99 }; 91 100 92 101 /* for the list of orphan inodes */ ··· 198 187 #define FADVISE_COLD_BIT 0x01 199 188 #define FADVISE_LOST_PINO_BIT 0x02 200 189 190 + #define DEF_DIR_LEVEL 0 191 + 201 192 struct f2fs_inode_info { 202 193 struct inode vfs_inode; /* serve a vfs inode */ 203 194 unsigned long i_flags; /* keep an inode flags for ioctl */ 204 195 unsigned char i_advise; /* use to give file attribute hints */ 196 + unsigned char i_dir_level; /* use for dentry level for large dir */ 205 197 unsigned int i_current_depth; /* use only in directory structure */ 206 198 unsigned int i_pino; /* parent inode number */ 207 199 umode_t i_acl_mode; /* keep file acl mode temporarily */ 208 200 209 201 /* Use below internally in f2fs*/ 210 202 unsigned long flags; /* use to pass per-file flags */ 203 + struct rw_semaphore i_sem; /* protect fi info */ 211 204 atomic_t dirty_dents; /* # of dirty dentry pages */ 212 205 f2fs_hash_t chash; /* hash value of given file name */ 213 206 unsigned int clevel; /* maximum level of given file name */ ··· 244 229 block_t nat_blkaddr; /* base disk address of NAT */ 245 230 nid_t max_nid; /* maximum possible node ids */ 246 231 nid_t next_scan_nid; /* the next nid to be scanned */ 232 + unsigned int ram_thresh; /* control the memory footprint */ 247 233 248 234 /* NAT cache management */ 249 235 struct radix_tree_root nat_root;/* root of the nat entry cache */ ··· 254 238 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 255 239 256 240 /* free node ids management */ 241 + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ 257 242 struct list_head free_nid_list; /* a list for free nids */ 258 243 spinlock_t free_nid_list_lock; /* protect free nid list */ 259 244 unsigned int fcnt; /* the number of free node id */ ··· 317 300 NO_CHECK_TYPE 318 301 }; 319 302 303 + struct flush_cmd { 304 + struct flush_cmd *next; 305 + struct completion wait; 306 + int ret; 307 + }; 308 + 320 309 struct f2fs_sm_info { 321 310 struct sit_info *sit_info; /* whole segment information */ 322 311 struct free_segmap_info *free_info; /* free segment information */ ··· 351 328 352 329 unsigned int ipu_policy; /* in-place-update policy */ 353 330 unsigned int min_ipu_util; /* in-place-update threshold */ 331 + 332 + /* for flush command control */ 333 + struct task_struct *f2fs_issue_flush; /* flush thread */ 334 + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ 335 + struct flush_cmd *issue_list; /* list for command issue */ 336 + struct flush_cmd *dispatch_list; /* list for command dispatch */ 337 + spinlock_t issue_lock; /* for issue list lock */ 338 + struct flush_cmd *issue_tail; /* list tail of issue list */ 354 339 }; 355 340 356 341 /* ··· 409 378 struct bio *bio; /* bios to merge */ 410 379 sector_t last_block_in_bio; /* last block number */ 411 380 struct f2fs_io_info fio; /* store buffered io info. */ 412 - struct mutex io_mutex; /* mutex for bio */ 381 + struct rw_semaphore io_rwsem; /* blocking op for bio */ 413 382 }; 414 383 415 384 struct f2fs_sb_info { ··· 429 398 /* for bio operations */ 430 399 struct f2fs_bio_info read_io; /* for read bios */ 431 400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ 401 + struct completion *wait_io; /* for completion bios */ 432 402 433 403 /* for checkpoint */ 434 404 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ ··· 439 407 struct mutex node_write; /* locking node writes */ 440 408 struct mutex writepages; /* mutex for writepages() */ 441 409 bool por_doing; /* recovery is doing or not */ 442 - bool on_build_free_nids; /* build_free_nids is doing */ 443 410 wait_queue_head_t cp_wait; 444 411 445 412 /* for orphan inode management */ ··· 467 436 unsigned int total_valid_node_count; /* valid node block count */ 468 437 unsigned int total_valid_inode_count; /* valid inode count */ 469 438 int active_logs; /* # of active logs */ 439 + int dir_level; /* directory level */ 470 440 471 441 block_t user_block_count; /* # of user blocks */ 472 442 block_t total_valid_block_count; /* # of valid blocks */ ··· 654 622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; 655 623 } 656 624 625 + static inline bool f2fs_has_xattr_block(unsigned int ofs) 626 + { 627 + return ofs == XATTR_NODE_OFFSET; 628 + } 629 + 657 630 static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 658 631 struct inode *inode, blkcnt_t count) 659 632 { ··· 698 661 699 662 static inline void inode_inc_dirty_dents(struct inode *inode) 700 663 { 664 + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 701 665 atomic_inc(&F2FS_I(inode)->dirty_dents); 702 666 } 703 667 ··· 709 671 710 672 static inline void inode_dec_dirty_dents(struct inode *inode) 711 673 { 674 + if (!S_ISDIR(inode->i_mode)) 675 + return; 676 + 677 + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 712 678 atomic_dec(&F2FS_I(inode)->dirty_dents); 713 679 } 714 680 715 681 static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) 716 682 { 717 683 return atomic_read(&sbi->nr_pages[count_type]); 684 + } 685 + 686 + static inline int get_dirty_dents(struct inode *inode) 687 + { 688 + return atomic_read(&F2FS_I(inode)->dirty_dents); 718 689 } 719 690 720 691 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) ··· 736 689 737 690 static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) 738 691 { 739 - block_t ret; 740 - spin_lock(&sbi->stat_lock); 741 - ret = sbi->total_valid_block_count; 742 - spin_unlock(&sbi->stat_lock); 743 - return ret; 692 + return sbi->total_valid_block_count; 744 693 } 745 694 746 695 static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) ··· 832 789 833 790 static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) 834 791 { 835 - unsigned int ret; 836 - spin_lock(&sbi->stat_lock); 837 - ret = sbi->total_valid_node_count; 838 - spin_unlock(&sbi->stat_lock); 839 - return ret; 792 + return sbi->total_valid_node_count; 840 793 } 841 794 842 795 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) ··· 853 814 854 815 static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 855 816 { 856 - unsigned int ret; 857 - spin_lock(&sbi->stat_lock); 858 - ret = sbi->total_valid_inode_count; 859 - spin_unlock(&sbi->stat_lock); 860 - return ret; 817 + return sbi->total_valid_inode_count; 861 818 } 862 819 863 820 static inline void f2fs_put_page(struct page *page, int unlock) ··· 879 844 } 880 845 881 846 static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, 882 - size_t size, void (*ctor)(void *)) 847 + size_t size) 883 848 { 884 - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); 849 + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); 885 850 } 886 851 887 852 static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, ··· 1018 983 ri->i_inline |= F2FS_INLINE_DATA; 1019 984 } 1020 985 986 + static inline int f2fs_has_inline_xattr(struct inode *inode) 987 + { 988 + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); 989 + } 990 + 1021 991 static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 1022 992 { 1023 - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 993 + if (f2fs_has_inline_xattr(&fi->vfs_inode)) 1024 994 return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; 1025 995 return DEF_ADDRS_PER_INODE; 1026 996 } 1027 997 1028 998 static inline void *inline_xattr_addr(struct page *page) 1029 999 { 1030 - struct f2fs_inode *ri; 1031 - ri = (struct f2fs_inode *)page_address(page); 1000 + struct f2fs_inode *ri = F2FS_INODE(page); 1032 1001 return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - 1033 1002 F2FS_INLINE_XATTR_ADDRS]); 1034 1003 } 1035 1004 1036 1005 static inline int inline_xattr_size(struct inode *inode) 1037 1006 { 1038 - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) 1007 + if (f2fs_has_inline_xattr(inode)) 1039 1008 return F2FS_INLINE_XATTR_ADDRS << 2; 1040 1009 else 1041 1010 return 0; ··· 1052 1013 1053 1014 static inline void *inline_data_addr(struct page *page) 1054 1015 { 1055 - struct f2fs_inode *ri; 1056 - ri = (struct f2fs_inode *)page_address(page); 1016 + struct f2fs_inode *ri = F2FS_INODE(page); 1057 1017 return (void *)&(ri->i_addr[1]); 1058 1018 } 1059 1019 1060 1020 static inline int f2fs_readonly(struct super_block *sb) 1061 1021 { 1062 1022 return sb->s_flags & MS_RDONLY; 1023 + } 1024 + 1025 + static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) 1026 + { 1027 + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 1028 + sbi->sb->s_flags |= MS_RDONLY; 1063 1029 } 1064 1030 1065 1031 #define get_inode_mode(i) \ ··· 1092 1048 struct inode *f2fs_iget(struct super_block *, unsigned long); 1093 1049 int try_to_free_nats(struct f2fs_sb_info *, int); 1094 1050 void update_inode(struct inode *, struct page *); 1095 - int update_inode_page(struct inode *); 1051 + void update_inode_page(struct inode *); 1096 1052 int f2fs_write_inode(struct inode *, struct writeback_control *); 1097 1053 void f2fs_evict_inode(struct inode *); 1098 1054 ··· 1141 1097 struct node_info; 1142 1098 1143 1099 int is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1100 + bool fsync_mark_done(struct f2fs_sb_info *, nid_t); 1144 1101 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1145 1102 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1146 1103 int truncate_inode_blocks(struct inode *, pgoff_t); ··· 1160 1115 void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1161 1116 void recover_node_page(struct f2fs_sb_info *, struct page *, 1162 1117 struct f2fs_summary *, struct node_info *, block_t); 1118 + bool recover_xattr_data(struct inode *, struct page *, block_t); 1163 1119 int recover_inode_page(struct f2fs_sb_info *, struct page *); 1164 1120 int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1165 1121 struct f2fs_summary_block *); ··· 1175 1129 */ 1176 1130 void f2fs_balance_fs(struct f2fs_sb_info *); 1177 1131 void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1132 + int f2fs_issue_flush(struct f2fs_sb_info *); 1178 1133 void invalidate_blocks(struct f2fs_sb_info *, block_t); 1134 + void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); 1179 1135 void clear_prefree_segments(struct f2fs_sb_info *); 1180 1136 int npages_for_summary_flush(struct f2fs_sb_info *); 1181 1137 void allocate_new_segments(struct f2fs_sb_info *); ··· 1210 1162 */ 1211 1163 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1212 1164 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1165 + int ra_meta_pages(struct f2fs_sb_info *, int, int, int); 1213 1166 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1214 1167 int acquire_orphan_inode(struct f2fs_sb_info *); 1215 1168 void release_orphan_inode(struct f2fs_sb_info *); ··· 1280 1231 int util_free, util_valid, util_invalid; 1281 1232 int rsvd_segs, overp_segs; 1282 1233 int dirty_count, node_pages, meta_pages; 1283 - int prefree_count, call_count; 1234 + int prefree_count, call_count, cp_count; 1284 1235 int tot_segs, node_segs, data_segs, free_segs, free_secs; 1285 1236 int tot_blks, data_blks, node_blks; 1286 1237 int curseg[NR_CURSEG_TYPE]; ··· 1297 1248 return (struct f2fs_stat_info *)sbi->stat_info; 1298 1249 } 1299 1250 1251 + #define stat_inc_cp_count(si) ((si)->cp_count++) 1300 1252 #define stat_inc_call_count(si) ((si)->call_count++) 1301 1253 #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) 1302 1254 #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) ··· 1352 1302 void __init f2fs_create_root_stats(void); 1353 1303 void f2fs_destroy_root_stats(void); 1354 1304 #else 1305 + #define stat_inc_cp_count(si) 1355 1306 #define stat_inc_call_count(si) 1356 1307 #define stat_inc_bggc_count(si) 1357 1308 #define stat_inc_dirty_dir(sbi)

+22 -9

fs/f2fs/file.c

··· 76 76 trace_f2fs_vm_page_mkwrite(page, DATA); 77 77 mapped: 78 78 /* fill the page */ 79 - wait_on_page_writeback(page); 79 + f2fs_wait_on_page_writeback(page, DATA); 80 80 out: 81 81 sb_end_pagefault(inode->i_sb); 82 82 return block_page_mkwrite_return(err); ··· 111 111 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 112 112 { 113 113 struct inode *inode = file->f_mapping->host; 114 + struct f2fs_inode_info *fi = F2FS_I(inode); 114 115 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 115 116 int ret = 0; 116 117 bool need_cp = false; 117 118 struct writeback_control wbc = { 118 - .sync_mode = WB_SYNC_NONE, 119 + .sync_mode = WB_SYNC_ALL, 119 120 .nr_to_write = LONG_MAX, 120 121 .for_reclaim = 0, 121 122 }; ··· 134 133 /* guarantee free sections for fsync */ 135 134 f2fs_balance_fs(sbi); 136 135 137 - mutex_lock(&inode->i_mutex); 136 + down_read(&fi->i_sem); 138 137 139 138 /* 140 139 * Both of fdatasync() and fsync() are able to be recovered from ··· 151 150 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) 152 151 need_cp = true; 153 152 153 + up_read(&fi->i_sem); 154 + 154 155 if (need_cp) { 155 156 nid_t pino; 156 157 157 - F2FS_I(inode)->xattr_ver = 0; 158 - 159 158 /* all the dirty node pages should be flushed for POR */ 160 159 ret = f2fs_sync_fs(inode->i_sb, 1); 160 + 161 + down_write(&fi->i_sem); 162 + F2FS_I(inode)->xattr_ver = 0; 161 163 if (file_wrong_pino(inode) && inode->i_nlink == 1 && 162 164 get_parent_ino(inode, &pino)) { 163 165 F2FS_I(inode)->i_pino = pino; 164 166 file_got_pino(inode); 167 + up_write(&fi->i_sem); 165 168 mark_inode_dirty_sync(inode); 166 169 ret = f2fs_write_inode(inode, NULL); 167 170 if (ret) 168 171 goto out; 172 + } else { 173 + up_write(&fi->i_sem); 169 174 } 170 175 } else { 171 176 /* if there is no written node page, write its inode page */ 172 177 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 178 + if (fsync_mark_done(sbi, inode->i_ino)) 179 + goto out; 173 180 mark_inode_dirty_sync(inode); 174 181 ret = f2fs_write_inode(inode, NULL); 175 182 if (ret) ··· 186 177 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 187 178 if (ret) 188 179 goto out; 189 - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 180 + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 190 181 } 191 182 out: 192 - mutex_unlock(&inode->i_mutex); 193 183 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 194 184 return ret; 195 185 } ··· 253 245 f2fs_put_page(page, 1); 254 246 return; 255 247 } 256 - wait_on_page_writeback(page); 248 + f2fs_wait_on_page_writeback(page, DATA); 257 249 zero_user(page, offset, PAGE_CACHE_SIZE - offset); 258 250 set_page_dirty(page); 259 251 f2fs_put_page(page, 1); ··· 430 422 f2fs_unlock_op(sbi); 431 423 432 424 if (!IS_ERR(page)) { 433 - wait_on_page_writeback(page); 425 + f2fs_wait_on_page_writeback(page, DATA); 434 426 zero_user(page, start, len); 435 427 set_page_dirty(page); 436 428 f2fs_put_page(page, 1); ··· 568 560 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 569 561 return -EOPNOTSUPP; 570 562 563 + mutex_lock(&inode->i_mutex); 564 + 571 565 if (mode & FALLOC_FL_PUNCH_HOLE) 572 566 ret = punch_hole(inode, offset, len); 573 567 else ··· 579 569 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 580 570 mark_inode_dirty(inode); 581 571 } 572 + 573 + mutex_unlock(&inode->i_mutex); 574 + 582 575 trace_f2fs_fallocate(inode, mode, offset, len, ret); 583 576 return ret; 584 577 }

+9 -7

fs/f2fs/gc.c

··· 531 531 set_page_dirty(page); 532 532 set_cold_data(page); 533 533 } else { 534 - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 535 - 536 534 f2fs_wait_on_page_writeback(page, DATA); 537 535 538 - if (clear_page_dirty_for_io(page) && 539 - S_ISDIR(inode->i_mode)) { 540 - dec_page_count(sbi, F2FS_DIRTY_DENTS); 536 + if (clear_page_dirty_for_io(page)) 541 537 inode_dec_dirty_dents(inode); 542 - } 543 538 set_cold_data(page); 544 539 do_write_data_page(page, &fio); 545 540 clear_cold_data(page); ··· 696 701 gc_more: 697 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 698 703 goto stop; 704 + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 705 + goto stop; 699 706 700 707 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 701 708 gc_type = FG_GC; ··· 707 710 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 708 711 goto stop; 709 712 ret = 0; 713 + 714 + /* readahead multi ssa blocks those have contiguous address */ 715 + if (sbi->segs_per_sec > 1) 716 + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, 717 + META_SSA); 710 718 711 719 for (i = 0; i < sbi->segs_per_sec; i++) 712 720 do_garbage_collect(sbi, segno + i, &ilist, gc_type); ··· 742 740 int __init create_gc_caches(void) 743 741 { 744 742 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", 745 - sizeof(struct inode_entry), NULL); 743 + sizeof(struct inode_entry)); 746 744 if (!winode_slab) 747 745 return -ENOMEM; 748 746 return 0;

+3 -1

fs/f2fs/inline.c

··· 45 45 } 46 46 47 47 ipage = get_node_page(sbi, inode->i_ino); 48 - if (IS_ERR(ipage)) 48 + if (IS_ERR(ipage)) { 49 + unlock_page(page); 49 50 return PTR_ERR(ipage); 51 + } 50 52 51 53 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); 52 54

+17 -10

fs/f2fs/inode.c

··· 107 107 fi->flags = 0; 108 108 fi->i_advise = ri->i_advise; 109 109 fi->i_pino = le32_to_cpu(ri->i_pino); 110 + fi->i_dir_level = ri->i_dir_level; 110 111 111 112 get_extent_info(&fi->ext, ri->i_ext); 112 113 get_inline_info(fi, ri); ··· 205 204 ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); 206 205 ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); 207 206 ri->i_generation = cpu_to_le32(inode->i_generation); 207 + ri->i_dir_level = F2FS_I(inode)->i_dir_level; 208 208 209 209 __set_inode_rdev(inode, ri); 210 210 set_cold_node(inode, node_page); ··· 214 212 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); 215 213 } 216 214 217 - int update_inode_page(struct inode *inode) 215 + void update_inode_page(struct inode *inode) 218 216 { 219 217 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 220 218 struct page *node_page; 221 - 219 + retry: 222 220 node_page = get_node_page(sbi, inode->i_ino); 223 - if (IS_ERR(node_page)) 224 - return PTR_ERR(node_page); 225 - 221 + if (IS_ERR(node_page)) { 222 + int err = PTR_ERR(node_page); 223 + if (err == -ENOMEM) { 224 + cond_resched(); 225 + goto retry; 226 + } else if (err != -ENOENT) { 227 + f2fs_stop_checkpoint(sbi); 228 + } 229 + return; 230 + } 226 231 update_inode(inode, node_page); 227 232 f2fs_put_page(node_page, 1); 228 - return 0; 229 233 } 230 234 231 235 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) 232 236 { 233 237 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 234 - int ret; 235 238 236 239 if (inode->i_ino == F2FS_NODE_INO(sbi) || 237 240 inode->i_ino == F2FS_META_INO(sbi)) ··· 250 243 * during the urgent cleaning time when runing out of free sections. 251 244 */ 252 245 f2fs_lock_op(sbi); 253 - ret = update_inode_page(inode); 246 + update_inode_page(inode); 254 247 f2fs_unlock_op(sbi); 255 248 256 249 if (wbc) 257 250 f2fs_balance_fs(sbi); 258 251 259 - return ret; 252 + return 0; 260 253 } 261 254 262 255 /* ··· 273 266 inode->i_ino == F2FS_META_INO(sbi)) 274 267 goto no_delete; 275 268 276 - f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); 269 + f2fs_bug_on(get_dirty_dents(inode)); 277 270 remove_dirty_dir_inode(inode); 278 271 279 272 if (inode->i_nlink || is_bad_inode(inode))

+9

fs/f2fs/namei.c

··· 207 207 inode = f2fs_iget(dir->i_sb, ino); 208 208 if (IS_ERR(inode)) 209 209 return ERR_CAST(inode); 210 + 211 + stat_inc_inline_inode(inode); 210 212 } 211 213 212 214 return d_splice_alias(inode, dentry); ··· 426 424 } 427 425 428 426 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 427 + down_write(&F2FS_I(old_inode)->i_sem); 429 428 F2FS_I(old_inode)->i_pino = new_dir->i_ino; 429 + up_write(&F2FS_I(old_inode)->i_sem); 430 430 431 431 new_inode->i_ctime = CURRENT_TIME; 432 + down_write(&F2FS_I(new_inode)->i_sem); 432 433 if (old_dir_entry) 433 434 drop_nlink(new_inode); 434 435 drop_nlink(new_inode); 436 + up_write(&F2FS_I(new_inode)->i_sem); 437 + 435 438 mark_inode_dirty(new_inode); 436 439 437 440 if (!new_inode->i_nlink) ··· 466 459 if (old_dir != new_dir) { 467 460 f2fs_set_link(old_inode, old_dir_entry, 468 461 old_dir_page, new_dir); 462 + down_write(&F2FS_I(old_inode)->i_sem); 469 463 F2FS_I(old_inode)->i_pino = new_dir->i_ino; 464 + up_write(&F2FS_I(old_inode)->i_sem); 470 465 update_inode_page(old_inode); 471 466 } else { 472 467 kunmap(old_dir_page);

+201 -133

fs/f2fs/node.c

··· 21 21 #include "segment.h" 22 22 #include <trace/events/f2fs.h> 23 23 24 + #define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) 25 + 24 26 static struct kmem_cache *nat_entry_slab; 25 27 static struct kmem_cache *free_nid_slab; 28 + 29 + static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) 30 + { 31 + struct sysinfo val; 32 + unsigned long mem_size = 0; 33 + 34 + si_meminfo(&val); 35 + if (type == FREE_NIDS) 36 + mem_size = nm_i->fcnt * sizeof(struct free_nid); 37 + else if (type == NAT_ENTRIES) 38 + mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); 39 + mem_size >>= 12; 40 + 41 + /* give 50:50 memory for free nids and nat caches respectively */ 42 + return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); 43 + } 26 44 27 45 static void clear_node_page_dirty(struct page *page) 28 46 { ··· 100 82 return dst_page; 101 83 } 102 84 103 - /* 104 - * Readahead NAT pages 105 - */ 106 - static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) 107 - { 108 - struct address_space *mapping = META_MAPPING(sbi); 109 - struct f2fs_nm_info *nm_i = NM_I(sbi); 110 - struct page *page; 111 - pgoff_t index; 112 - int i; 113 - struct f2fs_io_info fio = { 114 - .type = META, 115 - .rw = READ_SYNC | REQ_META | REQ_PRIO 116 - }; 117 - 118 - 119 - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { 120 - if (unlikely(nid >= nm_i->max_nid)) 121 - nid = 0; 122 - index = current_nat_addr(sbi, nid); 123 - 124 - page = grab_cache_page(mapping, index); 125 - if (!page) 126 - continue; 127 - if (PageUptodate(page)) { 128 - mark_page_accessed(page); 129 - f2fs_put_page(page, 1); 130 - continue; 131 - } 132 - f2fs_submit_page_mbio(sbi, page, index, &fio); 133 - mark_page_accessed(page); 134 - f2fs_put_page(page, 0); 135 - } 136 - f2fs_submit_merged_bio(sbi, META, READ); 137 - } 138 - 139 85 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 140 86 { 141 87 return radix_tree_lookup(&nm_i->nat_root, n); ··· 133 151 return is_cp; 134 152 } 135 153 154 + bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) 155 + { 156 + struct f2fs_nm_info *nm_i = NM_I(sbi); 157 + struct nat_entry *e; 158 + bool fsync_done = false; 159 + 160 + read_lock(&nm_i->nat_tree_lock); 161 + e = __lookup_nat_cache(nm_i, nid); 162 + if (e) 163 + fsync_done = e->fsync_done; 164 + read_unlock(&nm_i->nat_tree_lock); 165 + return fsync_done; 166 + } 167 + 136 168 static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 137 169 { 138 170 struct nat_entry *new; ··· 160 164 } 161 165 memset(new, 0, sizeof(struct nat_entry)); 162 166 nat_set_nid(new, nid); 167 + new->checkpointed = true; 163 168 list_add_tail(&new->list, &nm_i->nat_entries); 164 169 nm_i->nat_cnt++; 165 170 return new; ··· 182 185 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); 183 186 nat_set_ino(e, le32_to_cpu(ne->ino)); 184 187 nat_set_version(e, ne->version); 185 - e->checkpointed = true; 186 188 } 187 189 write_unlock(&nm_i->nat_tree_lock); 188 190 } 189 191 190 192 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, 191 - block_t new_blkaddr) 193 + block_t new_blkaddr, bool fsync_done) 192 194 { 193 195 struct f2fs_nm_info *nm_i = NM_I(sbi); 194 196 struct nat_entry *e; ··· 201 205 goto retry; 202 206 } 203 207 e->ni = *ni; 204 - e->checkpointed = true; 205 208 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 206 209 } else if (new_blkaddr == NEW_ADDR) { 207 210 /* ··· 211 216 e->ni = *ni; 212 217 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 213 218 } 214 - 215 - if (new_blkaddr == NEW_ADDR) 216 - e->checkpointed = false; 217 219 218 220 /* sanity check */ 219 221 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); ··· 231 239 /* change address */ 232 240 nat_set_blkaddr(e, new_blkaddr); 233 241 __set_nat_cache_dirty(nm_i, e); 242 + 243 + /* update fsync_mark if its inode nat entry is still alive */ 244 + e = __lookup_nat_cache(nm_i, ni->ino); 245 + if (e) 246 + e->fsync_done = fsync_done; 234 247 write_unlock(&nm_i->nat_tree_lock); 235 248 } 236 249 ··· 243 246 { 244 247 struct f2fs_nm_info *nm_i = NM_I(sbi); 245 248 246 - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) 249 + if (available_free_memory(nm_i, NAT_ENTRIES)) 247 250 return 0; 248 251 249 252 write_lock(&nm_i->nat_tree_lock); ··· 502 505 /* Deallocate node address */ 503 506 invalidate_blocks(sbi, ni.blk_addr); 504 507 dec_valid_node_count(sbi, dn->inode); 505 - set_node_addr(sbi, &ni, NULL_ADDR); 508 + set_node_addr(sbi, &ni, NULL_ADDR, false); 506 509 507 510 if (dn->nid == dn->inode->i_ino) { 508 511 remove_orphan_inode(sbi, dn->nid); ··· 760 763 f2fs_put_page(page, 1); 761 764 goto restart; 762 765 } 763 - wait_on_page_writeback(page); 766 + f2fs_wait_on_page_writeback(page, NODE); 764 767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 765 768 set_page_dirty(page); 766 769 unlock_page(page); ··· 849 852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 850 853 return ERR_PTR(-EPERM); 851 854 852 - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); 855 + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), 856 + dn->nid, AOP_FLAG_NOFS); 853 857 if (!page) 854 858 return ERR_PTR(-ENOMEM); 855 859 ··· 865 867 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 866 868 new_ni = old_ni; 867 869 new_ni.ino = dn->inode->i_ino; 868 - set_node_addr(sbi, &new_ni, NEW_ADDR); 870 + set_node_addr(sbi, &new_ni, NEW_ADDR, false); 869 871 870 872 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); 871 873 set_cold_node(dn->inode, page); 872 874 SetPageUptodate(page); 873 875 set_page_dirty(page); 874 876 875 - if (ofs == XATTR_NODE_OFFSET) 877 + if (f2fs_has_xattr_block(ofs)) 876 878 F2FS_I(dn->inode)->i_xattr_nid = dn->nid; 877 879 878 880 dn->node_page = page; ··· 946 948 struct page *page; 947 949 int err; 948 950 repeat: 949 - page = grab_cache_page(NODE_MAPPING(sbi), nid); 951 + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), 952 + nid, AOP_FLAG_NOFS); 950 953 if (!page) 951 954 return ERR_PTR(-ENOMEM); 952 955 ··· 958 959 goto got_it; 959 960 960 961 lock_page(page); 961 - if (unlikely(!PageUptodate(page))) { 962 + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { 962 963 f2fs_put_page(page, 1); 963 964 return ERR_PTR(-EIO); 964 965 } ··· 967 968 goto repeat; 968 969 } 969 970 got_it: 970 - f2fs_bug_on(nid != nid_of_node(page)); 971 971 mark_page_accessed(page); 972 972 return page; 973 973 } ··· 1166 1168 continue; 1167 1169 1168 1170 if (ino && ino_of_node(page) == ino) { 1169 - wait_on_page_writeback(page); 1171 + f2fs_wait_on_page_writeback(page, NODE); 1170 1172 if (TestClearPageError(page)) 1171 1173 ret = -EIO; 1172 1174 } ··· 1199 1201 if (unlikely(sbi->por_doing)) 1200 1202 goto redirty_out; 1201 1203 1202 - wait_on_page_writeback(page); 1204 + f2fs_wait_on_page_writeback(page, NODE); 1203 1205 1204 1206 /* get old block addr of this node page */ 1205 1207 nid = nid_of_node(page); ··· 1220 1222 mutex_lock(&sbi->node_write); 1221 1223 set_page_writeback(page); 1222 1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1223 - set_node_addr(sbi, &ni, new_addr); 1225 + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); 1224 1226 dec_page_count(sbi, F2FS_DIRTY_NODES); 1225 1227 mutex_unlock(&sbi->node_write); 1226 1228 unlock_page(page); ··· 1229 1231 redirty_out: 1230 1232 dec_page_count(sbi, F2FS_DIRTY_NODES); 1231 1233 wbc->pages_skipped++; 1234 + account_page_redirty(page); 1232 1235 set_page_dirty(page); 1233 1236 return AOP_WRITEPAGE_ACTIVATE; 1234 1237 } 1235 1238 1236 - /* 1237 - * It is very important to gather dirty pages and write at once, so that we can 1238 - * submit a big bio without interfering other data writes. 1239 - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. 1240 - */ 1241 - #define COLLECT_DIRTY_NODES 1536 1242 1239 static int f2fs_write_node_pages(struct address_space *mapping, 1243 1240 struct writeback_control *wbc) 1244 1241 { 1245 1242 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1246 - long nr_to_write = wbc->nr_to_write; 1243 + long diff; 1247 1244 1248 1245 /* balancing f2fs's metadata in background */ 1249 1246 f2fs_balance_fs_bg(sbi); 1250 1247 1251 1248 /* collect a number of dirty node pages and write together */ 1252 - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) 1253 - return 0; 1249 + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) 1250 + goto skip_write; 1254 1251 1255 - /* if mounting is failed, skip writing node pages */ 1256 - wbc->nr_to_write = 3 * max_hw_blocks(sbi); 1252 + diff = nr_pages_to_write(sbi, NODE, wbc); 1257 1253 wbc->sync_mode = WB_SYNC_NONE; 1258 1254 sync_node_pages(sbi, 0, wbc); 1259 - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1260 - wbc->nr_to_write); 1255 + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); 1256 + return 0; 1257 + 1258 + skip_write: 1259 + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); 1261 1260 return 0; 1262 1261 } 1263 1262 ··· 1302 1307 .releasepage = f2fs_release_node_page, 1303 1308 }; 1304 1309 1305 - static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) 1310 + static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, 1311 + nid_t n) 1306 1312 { 1307 - struct list_head *this; 1308 - struct free_nid *i; 1309 - list_for_each(this, head) { 1310 - i = list_entry(this, struct free_nid, list); 1311 - if (i->nid == n) 1312 - return i; 1313 - } 1314 - return NULL; 1313 + return radix_tree_lookup(&nm_i->free_nid_root, n); 1315 1314 } 1316 1315 1317 - static void __del_from_free_nid_list(struct free_nid *i) 1316 + static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, 1317 + struct free_nid *i) 1318 1318 { 1319 1319 list_del(&i->list); 1320 - kmem_cache_free(free_nid_slab, i); 1320 + radix_tree_delete(&nm_i->free_nid_root, i->nid); 1321 1321 } 1322 1322 1323 1323 static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) ··· 1321 1331 struct nat_entry *ne; 1322 1332 bool allocated = false; 1323 1333 1324 - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) 1334 + if (!available_free_memory(nm_i, FREE_NIDS)) 1325 1335 return -1; 1326 1336 1327 1337 /* 0 nid should not be used */ ··· 1332 1342 /* do not add allocated nids */ 1333 1343 read_lock(&nm_i->nat_tree_lock); 1334 1344 ne = __lookup_nat_cache(nm_i, nid); 1335 - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) 1345 + if (ne && 1346 + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) 1336 1347 allocated = true; 1337 1348 read_unlock(&nm_i->nat_tree_lock); 1338 1349 if (allocated) ··· 1345 1354 i->state = NID_NEW; 1346 1355 1347 1356 spin_lock(&nm_i->free_nid_list_lock); 1348 - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { 1357 + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { 1349 1358 spin_unlock(&nm_i->free_nid_list_lock); 1350 1359 kmem_cache_free(free_nid_slab, i); 1351 1360 return 0; ··· 1359 1368 static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) 1360 1369 { 1361 1370 struct free_nid *i; 1371 + bool need_free = false; 1372 + 1362 1373 spin_lock(&nm_i->free_nid_list_lock); 1363 - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1374 + i = __lookup_free_nid_list(nm_i, nid); 1364 1375 if (i && i->state == NID_NEW) { 1365 - __del_from_free_nid_list(i); 1376 + __del_from_free_nid_list(nm_i, i); 1366 1377 nm_i->fcnt--; 1378 + need_free = true; 1367 1379 } 1368 1380 spin_unlock(&nm_i->free_nid_list_lock); 1381 + 1382 + if (need_free) 1383 + kmem_cache_free(free_nid_slab, i); 1369 1384 } 1370 1385 1371 1386 static void scan_nat_page(struct f2fs_nm_info *nm_i, ··· 1410 1413 return; 1411 1414 1412 1415 /* readahead nat pages to be scanned */ 1413 - ra_nat_pages(sbi, nid); 1416 + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); 1414 1417 1415 1418 while (1) { 1416 1419 struct page *page = get_current_nat_page(sbi, nid); ··· 1451 1454 { 1452 1455 struct f2fs_nm_info *nm_i = NM_I(sbi); 1453 1456 struct free_nid *i = NULL; 1454 - struct list_head *this; 1455 1457 retry: 1456 1458 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) 1457 1459 return false; ··· 1458 1462 spin_lock(&nm_i->free_nid_list_lock); 1459 1463 1460 1464 /* We should not use stale free nids created by build_free_nids */ 1461 - if (nm_i->fcnt && !sbi->on_build_free_nids) { 1465 + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { 1462 1466 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1463 - list_for_each(this, &nm_i->free_nid_list) { 1464 - i = list_entry(this, struct free_nid, list); 1467 + list_for_each_entry(i, &nm_i->free_nid_list, list) 1465 1468 if (i->state == NID_NEW) 1466 1469 break; 1467 - } 1468 1470 1469 1471 f2fs_bug_on(i->state != NID_NEW); 1470 1472 *nid = i->nid; ··· 1475 1481 1476 1482 /* Let's scan nat pages and its caches to get free nids */ 1477 1483 mutex_lock(&nm_i->build_lock); 1478 - sbi->on_build_free_nids = true; 1479 1484 build_free_nids(sbi); 1480 - sbi->on_build_free_nids = false; 1481 1485 mutex_unlock(&nm_i->build_lock); 1482 1486 goto retry; 1483 1487 } ··· 1489 1497 struct free_nid *i; 1490 1498 1491 1499 spin_lock(&nm_i->free_nid_list_lock); 1492 - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1500 + i = __lookup_free_nid_list(nm_i, nid); 1493 1501 f2fs_bug_on(!i || i->state != NID_ALLOC); 1494 - __del_from_free_nid_list(i); 1502 + __del_from_free_nid_list(nm_i, i); 1495 1503 spin_unlock(&nm_i->free_nid_list_lock); 1504 + 1505 + kmem_cache_free(free_nid_slab, i); 1496 1506 } 1497 1507 1498 1508 /* ··· 1504 1510 { 1505 1511 struct f2fs_nm_info *nm_i = NM_I(sbi); 1506 1512 struct free_nid *i; 1513 + bool need_free = false; 1507 1514 1508 1515 if (!nid) 1509 1516 return; 1510 1517 1511 1518 spin_lock(&nm_i->free_nid_list_lock); 1512 - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1519 + i = __lookup_free_nid_list(nm_i, nid); 1513 1520 f2fs_bug_on(!i || i->state != NID_ALLOC); 1514 - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { 1515 - __del_from_free_nid_list(i); 1521 + if (!available_free_memory(nm_i, FREE_NIDS)) { 1522 + __del_from_free_nid_list(nm_i, i); 1523 + need_free = true; 1516 1524 } else { 1517 1525 i->state = NID_NEW; 1518 1526 nm_i->fcnt++; 1519 1527 } 1520 1528 spin_unlock(&nm_i->free_nid_list_lock); 1529 + 1530 + if (need_free) 1531 + kmem_cache_free(free_nid_slab, i); 1521 1532 } 1522 1533 1523 1534 void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, ··· 1530 1531 block_t new_blkaddr) 1531 1532 { 1532 1533 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); 1533 - set_node_addr(sbi, ni, new_blkaddr); 1534 + set_node_addr(sbi, ni, new_blkaddr, false); 1534 1535 clear_node_page_dirty(page); 1536 + } 1537 + 1538 + void recover_inline_xattr(struct inode *inode, struct page *page) 1539 + { 1540 + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1541 + void *src_addr, *dst_addr; 1542 + size_t inline_size; 1543 + struct page *ipage; 1544 + struct f2fs_inode *ri; 1545 + 1546 + if (!f2fs_has_inline_xattr(inode)) 1547 + return; 1548 + 1549 + if (!IS_INODE(page)) 1550 + return; 1551 + 1552 + ri = F2FS_INODE(page); 1553 + if (!(ri->i_inline & F2FS_INLINE_XATTR)) 1554 + return; 1555 + 1556 + ipage = get_node_page(sbi, inode->i_ino); 1557 + f2fs_bug_on(IS_ERR(ipage)); 1558 + 1559 + dst_addr = inline_xattr_addr(ipage); 1560 + src_addr = inline_xattr_addr(page); 1561 + inline_size = inline_xattr_size(inode); 1562 + 1563 + memcpy(dst_addr, src_addr, inline_size); 1564 + 1565 + update_inode(inode, ipage); 1566 + f2fs_put_page(ipage, 1); 1567 + } 1568 + 1569 + bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) 1570 + { 1571 + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1572 + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; 1573 + nid_t new_xnid = nid_of_node(page); 1574 + struct node_info ni; 1575 + 1576 + recover_inline_xattr(inode, page); 1577 + 1578 + if (!f2fs_has_xattr_block(ofs_of_node(page))) 1579 + return false; 1580 + 1581 + /* 1: invalidate the previous xattr nid */ 1582 + if (!prev_xnid) 1583 + goto recover_xnid; 1584 + 1585 + /* Deallocate node address */ 1586 + get_node_info(sbi, prev_xnid, &ni); 1587 + f2fs_bug_on(ni.blk_addr == NULL_ADDR); 1588 + invalidate_blocks(sbi, ni.blk_addr); 1589 + dec_valid_node_count(sbi, inode); 1590 + set_node_addr(sbi, &ni, NULL_ADDR, false); 1591 + 1592 + recover_xnid: 1593 + /* 2: allocate new xattr nid */ 1594 + if (unlikely(!inc_valid_node_count(sbi, inode))) 1595 + f2fs_bug_on(1); 1596 + 1597 + remove_free_nid(NM_I(sbi), new_xnid); 1598 + get_node_info(sbi, new_xnid, &ni); 1599 + ni.ino = inode->i_ino; 1600 + set_node_addr(sbi, &ni, NEW_ADDR, false); 1601 + F2FS_I(inode)->i_xattr_nid = new_xnid; 1602 + 1603 + /* 3: update xattr blkaddr */ 1604 + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); 1605 + set_node_addr(sbi, &ni, blkaddr, false); 1606 + 1607 + update_inode_page(inode); 1608 + return true; 1535 1609 } 1536 1610 1537 1611 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) ··· 1639 1567 1640 1568 if (unlikely(!inc_valid_node_count(sbi, NULL))) 1641 1569 WARN_ON(1); 1642 - set_node_addr(sbi, &new_ni, NEW_ADDR); 1570 + set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1643 1571 inc_valid_inode_count(sbi); 1644 1572 f2fs_put_page(ipage, 1); 1645 1573 return 0; ··· 1662 1590 for (; page_idx < start + nrpages; page_idx++) { 1663 1591 /* alloc temporal page for read node summary info*/ 1664 1592 page = alloc_page(GFP_F2FS_ZERO); 1665 - if (!page) { 1666 - struct page *tmp; 1667 - list_for_each_entry_safe(page, tmp, pages, lru) { 1668 - list_del(&page->lru); 1669 - unlock_page(page); 1670 - __free_pages(page, 0); 1671 - } 1672 - return -ENOMEM; 1673 - } 1593 + if (!page) 1594 + break; 1674 1595 1675 1596 lock_page(page); 1676 1597 page->index = page_idx; ··· 1674 1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1675 1610 1676 1611 f2fs_submit_merged_bio(sbi, META, READ); 1677 - return 0; 1612 + 1613 + return page_idx - start; 1678 1614 } 1679 1615 1680 1616 int restore_node_summary(struct f2fs_sb_info *sbi, ··· 1694 1628 addr = START_BLOCK(sbi, segno); 1695 1629 sum_entry = &sum->entries[0]; 1696 1630 1697 - for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { 1631 + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1698 1632 nrpages = min(last_offset - i, bio_blocks); 1699 1633 1700 1634 /* read ahead node pages */ 1701 - err = ra_sum_pages(sbi, &page_list, addr, nrpages); 1702 - if (err) 1703 - return err; 1635 + nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); 1636 + if (!nrpages) 1637 + return -ENOMEM; 1704 1638 1705 1639 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1640 + if (err) 1641 + goto skip; 1706 1642 1707 1643 lock_page(page); 1708 1644 if (unlikely(!PageUptodate(page))) { ··· 1716 1648 sum_entry->ofs_in_node = 0; 1717 1649 sum_entry++; 1718 1650 } 1719 - 1720 - list_del(&page->lru); 1721 1651 unlock_page(page); 1652 + skip: 1653 + list_del(&page->lru); 1722 1654 __free_pages(page, 0); 1723 1655 } 1724 1656 } ··· 1777 1709 struct f2fs_nm_info *nm_i = NM_I(sbi); 1778 1710 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1779 1711 struct f2fs_summary_block *sum = curseg->sum_blk; 1780 - struct list_head *cur, *n; 1712 + struct nat_entry *ne, *cur; 1781 1713 struct page *page = NULL; 1782 1714 struct f2fs_nat_block *nat_blk = NULL; 1783 1715 nid_t start_nid = 0, end_nid = 0; ··· 1789 1721 mutex_lock(&curseg->curseg_mutex); 1790 1722 1791 1723 /* 1) flush dirty nat caches */ 1792 - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { 1793 - struct nat_entry *ne; 1724 + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { 1794 1725 nid_t nid; 1795 1726 struct f2fs_nat_entry raw_ne; 1796 1727 int offset = -1; 1797 1728 block_t new_blkaddr; 1798 1729 1799 - ne = list_entry(cur, struct nat_entry, list); 1800 - nid = nat_get_nid(ne); 1801 - 1802 1730 if (nat_get_blkaddr(ne) == NEW_ADDR) 1803 1731 continue; 1732 + 1733 + nid = nat_get_nid(ne); 1734 + 1804 1735 if (flushed) 1805 1736 goto to_nat_page; 1806 1737 ··· 1850 1783 } else { 1851 1784 write_lock(&nm_i->nat_tree_lock); 1852 1785 __clear_nat_cache_dirty(nm_i, ne); 1853 - ne->checkpointed = true; 1854 1786 write_unlock(&nm_i->nat_tree_lock); 1855 1787 } 1856 1788 } 1857 1789 if (!flushed) 1858 1790 mutex_unlock(&curseg->curseg_mutex); 1859 1791 f2fs_put_page(page, 1); 1860 - 1861 - /* 2) shrink nat caches if necessary */ 1862 - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); 1863 1792 } 1864 1793 1865 1794 static int init_node_manager(struct f2fs_sb_info *sbi) ··· 1870 1807 /* segment_count_nat includes pair segment so divide to 2. */ 1871 1808 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; 1872 1809 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); 1873 - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1810 + 1811 + /* not used nids: 0, node, meta, (and root counted as valid node) */ 1812 + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; 1874 1813 nm_i->fcnt = 0; 1875 1814 nm_i->nat_cnt = 0; 1815 + nm_i->ram_thresh = DEF_RAM_THRESHOLD; 1876 1816 1817 + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 1877 1818 INIT_LIST_HEAD(&nm_i->free_nid_list); 1878 1819 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1879 1820 INIT_LIST_HEAD(&nm_i->nat_entries); ··· 1931 1864 spin_lock(&nm_i->free_nid_list_lock); 1932 1865 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 1933 1866 f2fs_bug_on(i->state == NID_ALLOC); 1934 - __del_from_free_nid_list(i); 1867 + __del_from_free_nid_list(nm_i, i); 1935 1868 nm_i->fcnt--; 1869 + spin_unlock(&nm_i->free_nid_list_lock); 1870 + kmem_cache_free(free_nid_slab, i); 1871 + spin_lock(&nm_i->free_nid_list_lock); 1936 1872 } 1937 1873 f2fs_bug_on(nm_i->fcnt); 1938 1874 spin_unlock(&nm_i->free_nid_list_lock); ··· 1945 1875 while ((found = __gang_lookup_nat_cache(nm_i, 1946 1876 nid, NATVEC_SIZE, natvec))) { 1947 1877 unsigned idx; 1948 - for (idx = 0; idx < found; idx++) { 1949 - struct nat_entry *e = natvec[idx]; 1950 - nid = nat_get_nid(e) + 1; 1951 - __del_from_nat_cache(nm_i, e); 1952 - } 1878 + nid = nat_get_nid(natvec[found - 1]) + 1; 1879 + for (idx = 0; idx < found; idx++) 1880 + __del_from_nat_cache(nm_i, natvec[idx]); 1953 1881 } 1954 1882 f2fs_bug_on(nm_i->nat_cnt); 1955 1883 write_unlock(&nm_i->nat_tree_lock); ··· 1960 1892 int __init create_node_manager_caches(void) 1961 1893 { 1962 1894 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 1963 - sizeof(struct nat_entry), NULL); 1895 + sizeof(struct nat_entry)); 1964 1896 if (!nat_entry_slab) 1965 1897 return -ENOMEM; 1966 1898 1967 1899 free_nid_slab = f2fs_kmem_cache_create("free_nid", 1968 - sizeof(struct free_nid), NULL); 1900 + sizeof(struct free_nid)); 1969 1901 if (!free_nid_slab) { 1970 1902 kmem_cache_destroy(nat_entry_slab); 1971 1903 return -ENOMEM;

+17 -8

fs/f2fs/node.h

··· 17 17 /* # of pages to perform readahead before building free nids */ 18 18 #define FREE_NID_PAGES 4 19 19 20 - /* maximum # of free node ids to produce during build_free_nids */ 21 - #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) 22 - 23 20 /* maximum readahead size for node during getting data blocks */ 24 21 #define MAX_RA_NODE 128 25 22 26 - /* maximum cached nat entries to manage memory footprint */ 27 - #define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) 23 + /* control the memory footprint threshold (10MB per 1GB ram) */ 24 + #define DEF_RAM_THRESHOLD 10 28 25 29 26 /* vector size for gang look-up from nat cache that consists of radix tree */ 30 27 #define NATVEC_SIZE 64 ··· 42 45 struct nat_entry { 43 46 struct list_head list; /* for clean or dirty nat list */ 44 47 bool checkpointed; /* whether it is checkpointed or not */ 48 + bool fsync_done; /* whether the latest node has fsync mark */ 45 49 struct node_info ni; /* in-memory node information */ 46 50 }; 47 51 ··· 56 58 #define nat_set_version(nat, v) (nat->ni.version = v) 57 59 58 60 #define __set_nat_cache_dirty(nm_i, ne) \ 59 - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); 61 + do { \ 62 + ne->checkpointed = false; \ 63 + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ 64 + } while (0); 60 65 #define __clear_nat_cache_dirty(nm_i, ne) \ 61 - list_move_tail(&ne->list, &nm_i->nat_entries); 66 + do { \ 67 + ne->checkpointed = true; \ 68 + list_move_tail(&ne->list, &nm_i->nat_entries); \ 69 + } while (0); 62 70 #define inc_node_version(version) (++version) 63 71 64 72 static inline void node_info_from_raw_nat(struct node_info *ni, ··· 74 70 ni->blk_addr = le32_to_cpu(raw_ne->block_addr); 75 71 ni->version = raw_ne->version; 76 72 } 73 + 74 + enum nid_type { 75 + FREE_NIDS, /* indicates the free nid list */ 76 + NAT_ENTRIES /* indicates the cached nat entry */ 77 + }; 77 78 78 79 /* 79 80 * For free nid mangement ··· 245 236 { 246 237 unsigned int ofs = ofs_of_node(node_page); 247 238 248 - if (ofs == XATTR_NODE_OFFSET) 239 + if (f2fs_has_xattr_block(ofs)) 249 240 return false; 250 241 251 242 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||

+17 -20

fs/f2fs/recovery.c

··· 27 27 static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, 28 28 nid_t ino) 29 29 { 30 - struct list_head *this; 31 30 struct fsync_inode_entry *entry; 32 31 33 - list_for_each(this, head) { 34 - entry = list_entry(this, struct fsync_inode_entry, list); 32 + list_for_each_entry(entry, head, list) 35 33 if (entry->inode->i_ino == ino) 36 34 return entry; 37 - } 35 + 38 36 return NULL; 39 37 } 40 38 ··· 134 136 135 137 /* get node pages in the current segment */ 136 138 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 137 - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; 139 + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 138 140 139 141 /* read node page */ 140 142 page = alloc_page(GFP_F2FS_ZERO); ··· 216 218 { 217 219 struct seg_entry *sentry; 218 220 unsigned int segno = GET_SEGNO(sbi, blkaddr); 219 - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 220 - (sbi->blocks_per_seg - 1); 221 + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 222 + struct f2fs_summary_block *sum_node; 221 223 struct f2fs_summary sum; 224 + struct page *sum_page, *node_page; 222 225 nid_t ino, nid; 223 - void *kaddr; 224 226 struct inode *inode; 225 - struct page *node_page; 226 227 unsigned int offset; 227 228 block_t bidx; 228 229 int i; ··· 235 238 struct curseg_info *curseg = CURSEG_I(sbi, i); 236 239 if (curseg->segno == segno) { 237 240 sum = curseg->sum_blk->entries[blkoff]; 238 - break; 241 + goto got_it; 239 242 } 240 243 } 241 - if (i > CURSEG_COLD_DATA) { 242 - struct page *sum_page = get_sum_page(sbi, segno); 243 - struct f2fs_summary_block *sum_node; 244 - kaddr = page_address(sum_page); 245 - sum_node = (struct f2fs_summary_block *)kaddr; 246 - sum = sum_node->entries[blkoff]; 247 - f2fs_put_page(sum_page, 1); 248 - } 249 244 245 + sum_page = get_sum_page(sbi, segno); 246 + sum_node = (struct f2fs_summary_block *)page_address(sum_page); 247 + sum = sum_node->entries[blkoff]; 248 + f2fs_put_page(sum_page, 1); 249 + got_it: 250 250 /* Use the locked dnode page and inode */ 251 251 nid = le32_to_cpu(sum.nid); 252 252 if (dn->inode->i_ino == nid) { ··· 295 301 if (recover_inline_data(inode, page)) 296 302 goto out; 297 303 304 + if (recover_xattr_data(inode, page, blkaddr)) 305 + goto out; 306 + 298 307 start = start_bidx_of_node(ofs_of_node(page), fi); 299 308 if (IS_INODE(page)) 300 309 end = start + ADDRS_PER_INODE(fi); ··· 314 317 goto out; 315 318 } 316 319 317 - wait_on_page_writeback(dn.node_page); 320 + f2fs_wait_on_page_writeback(dn.node_page, NODE); 318 321 319 322 get_node_info(sbi, dn.nid, &ni); 320 323 f2fs_bug_on(ni.ino != ino_of_node(page)); ··· 434 437 bool need_writecp = false; 435 438 436 439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 437 - sizeof(struct fsync_inode_entry), NULL); 440 + sizeof(struct fsync_inode_entry)); 438 441 if (!fsync_entry_slab) 439 442 return -ENOMEM; 440 443

+147 -75

fs/f2fs/segment.c

··· 13 13 #include <linux/bio.h> 14 14 #include <linux/blkdev.h> 15 15 #include <linux/prefetch.h> 16 + #include <linux/kthread.h> 16 17 #include <linux/vmalloc.h> 17 18 #include <linux/swap.h> 18 19 ··· 25 24 #define __reverse_ffz(x) __reverse_ffs(~(x)) 26 25 27 26 static struct kmem_cache *discard_entry_slab; 27 + static struct kmem_cache *flush_cmd_slab; 28 28 29 29 /* 30 30 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since ··· 197 195 f2fs_sync_fs(sbi->sb, true); 198 196 } 199 197 198 + static int issue_flush_thread(void *data) 199 + { 200 + struct f2fs_sb_info *sbi = data; 201 + struct f2fs_sm_info *sm_i = SM_I(sbi); 202 + wait_queue_head_t *q = &sm_i->flush_wait_queue; 203 + repeat: 204 + if (kthread_should_stop()) 205 + return 0; 206 + 207 + spin_lock(&sm_i->issue_lock); 208 + if (sm_i->issue_list) { 209 + sm_i->dispatch_list = sm_i->issue_list; 210 + sm_i->issue_list = sm_i->issue_tail = NULL; 211 + } 212 + spin_unlock(&sm_i->issue_lock); 213 + 214 + if (sm_i->dispatch_list) { 215 + struct bio *bio = bio_alloc(GFP_NOIO, 0); 216 + struct flush_cmd *cmd, *next; 217 + int ret; 218 + 219 + bio->bi_bdev = sbi->sb->s_bdev; 220 + ret = submit_bio_wait(WRITE_FLUSH, bio); 221 + 222 + for (cmd = sm_i->dispatch_list; cmd; cmd = next) { 223 + cmd->ret = ret; 224 + next = cmd->next; 225 + complete(&cmd->wait); 226 + } 227 + sm_i->dispatch_list = NULL; 228 + } 229 + 230 + wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); 231 + goto repeat; 232 + } 233 + 234 + int f2fs_issue_flush(struct f2fs_sb_info *sbi) 235 + { 236 + struct f2fs_sm_info *sm_i = SM_I(sbi); 237 + struct flush_cmd *cmd; 238 + int ret; 239 + 240 + if (!test_opt(sbi, FLUSH_MERGE)) 241 + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 242 + 243 + cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); 244 + cmd->next = NULL; 245 + cmd->ret = 0; 246 + init_completion(&cmd->wait); 247 + 248 + spin_lock(&sm_i->issue_lock); 249 + if (sm_i->issue_list) 250 + sm_i->issue_tail->next = cmd; 251 + else 252 + sm_i->issue_list = cmd; 253 + sm_i->issue_tail = cmd; 254 + spin_unlock(&sm_i->issue_lock); 255 + 256 + if (!sm_i->dispatch_list) 257 + wake_up(&sm_i->flush_wait_queue); 258 + 259 + wait_for_completion(&cmd->wait); 260 + ret = cmd->ret; 261 + kmem_cache_free(flush_cmd_slab, cmd); 262 + return ret; 263 + } 264 + 200 265 static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 201 266 enum dirty_type dirty_type) 202 267 { ··· 409 340 void clear_prefree_segments(struct f2fs_sb_info *sbi) 410 341 { 411 342 struct list_head *head = &(SM_I(sbi)->discard_list); 412 - struct list_head *this, *next; 413 - struct discard_entry *entry; 343 + struct discard_entry *entry, *this; 414 344 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 415 345 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 416 346 unsigned int total_segs = TOTAL_SEGS(sbi); ··· 438 370 mutex_unlock(&dirty_i->seglist_lock); 439 371 440 372 /* send small discards */ 441 - list_for_each_safe(this, next, head) { 442 - entry = list_entry(this, struct discard_entry, list); 373 + list_for_each_entry_safe(entry, this, head, list) { 443 374 f2fs_issue_discard(sbi, entry->blkaddr, entry->len); 444 375 list_del(&entry->list); 445 376 SM_I(sbi)->nr_discards -= entry->len; ··· 472 405 473 406 se = get_seg_entry(sbi, segno); 474 407 new_vblocks = se->valid_blocks + del; 475 - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); 408 + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 476 409 477 410 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 478 411 (new_vblocks > sbi->blocks_per_seg))); ··· 501 434 get_sec_entry(sbi, segno)->valid_blocks += del; 502 435 } 503 436 504 - static void refresh_sit_entry(struct f2fs_sb_info *sbi, 505 - block_t old_blkaddr, block_t new_blkaddr) 437 + void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) 506 438 { 507 - update_sit_entry(sbi, new_blkaddr, 1); 508 - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 509 - update_sit_entry(sbi, old_blkaddr, -1); 439 + update_sit_entry(sbi, new, 1); 440 + if (GET_SEGNO(sbi, old) != NULL_SEGNO) 441 + update_sit_entry(sbi, old, -1); 442 + 443 + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); 444 + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); 510 445 } 511 446 512 447 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) ··· 950 881 951 882 stat_inc_block_count(sbi, curseg); 952 883 884 + if (!__has_curseg_space(sbi, type)) 885 + sit_i->s_ops->allocate_segment(sbi, type, false); 953 886 /* 954 887 * SIT information should be updated before segment allocation, 955 888 * since SSR needs latest valid block information. 956 889 */ 957 890 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 958 - 959 - if (!__has_curseg_space(sbi, type)) 960 - sit_i->s_ops->allocate_segment(sbi, type, false); 961 - 962 891 locate_dirty_segment(sbi, old_cursegno); 963 - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 892 + 964 893 mutex_unlock(&sit_i->sentry_lock); 965 894 966 895 if (page && IS_NODESEG(type)) ··· 1054 987 change_curseg(sbi, type, true); 1055 988 } 1056 989 1057 - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1058 - (sbi->blocks_per_seg - 1); 990 + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); 1059 991 __add_sum_entry(sbi, type, sum); 1060 992 1061 993 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1062 - 1063 994 locate_dirty_segment(sbi, old_cursegno); 1064 - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 1065 995 1066 996 mutex_unlock(&sit_i->sentry_lock); 1067 997 mutex_unlock(&curseg->curseg_mutex); ··· 1092 1028 curseg->next_segno = segno; 1093 1029 change_curseg(sbi, type, true); 1094 1030 } 1095 - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1096 - (sbi->blocks_per_seg - 1); 1031 + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); 1097 1032 __add_sum_entry(sbi, type, sum); 1098 1033 1099 1034 /* change the current log to the next block addr in advance */ ··· 1100 1037 curseg->next_segno = next_segno; 1101 1038 change_curseg(sbi, type, true); 1102 1039 } 1103 - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & 1104 - (sbi->blocks_per_seg - 1); 1040 + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); 1105 1041 1106 1042 /* rewrite node page */ 1107 1043 set_page_writeback(page); 1108 1044 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); 1109 1045 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1110 1046 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1111 - 1112 1047 locate_dirty_segment(sbi, old_cursegno); 1113 - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 1114 1048 1115 1049 mutex_unlock(&sit_i->sentry_lock); 1116 1050 mutex_unlock(&curseg->curseg_mutex); 1051 + } 1052 + 1053 + static inline bool is_merged_page(struct f2fs_sb_info *sbi, 1054 + struct page *page, enum page_type type) 1055 + { 1056 + enum page_type btype = PAGE_TYPE_OF_BIO(type); 1057 + struct f2fs_bio_info *io = &sbi->write_io[btype]; 1058 + struct bio_vec *bvec; 1059 + int i; 1060 + 1061 + down_read(&io->io_rwsem); 1062 + if (!io->bio) 1063 + goto out; 1064 + 1065 + bio_for_each_segment_all(bvec, io->bio, i) { 1066 + if (page == bvec->bv_page) { 1067 + up_read(&io->io_rwsem); 1068 + return true; 1069 + } 1070 + } 1071 + 1072 + out: 1073 + up_read(&io->io_rwsem); 1074 + return false; 1117 1075 } 1118 1076 1119 1077 void f2fs_wait_on_page_writeback(struct page *page, ··· 1142 1058 { 1143 1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1144 1060 if (PageWriteback(page)) { 1145 - f2fs_submit_merged_bio(sbi, type, WRITE); 1061 + if (is_merged_page(sbi, page, type)) 1062 + f2fs_submit_merged_bio(sbi, type, WRITE); 1146 1063 wait_on_page_writeback(page); 1147 1064 } 1148 1065 } ··· 1252 1167 ns->ofs_in_node = 0; 1253 1168 } 1254 1169 } else { 1255 - if (restore_node_summary(sbi, segno, sum)) { 1170 + int err; 1171 + 1172 + err = restore_node_summary(sbi, segno, sum); 1173 + if (err) { 1256 1174 f2fs_put_page(new, 1); 1257 - return -EINVAL; 1175 + return err; 1258 1176 } 1259 1177 } 1260 1178 } ··· 1278 1190 static int restore_curseg_summaries(struct f2fs_sb_info *sbi) 1279 1191 { 1280 1192 int type = CURSEG_HOT_DATA; 1193 + int err; 1281 1194 1282 1195 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1283 1196 /* restore for compacted data summary */ ··· 1287 1198 type = CURSEG_HOT_NODE; 1288 1199 } 1289 1200 1290 - for (; type <= CURSEG_COLD_NODE; type++) 1291 - if (read_normal_summaries(sbi, type)) 1292 - return -EINVAL; 1201 + for (; type <= CURSEG_COLD_NODE; type++) { 1202 + err = read_normal_summaries(sbi, type); 1203 + if (err) 1204 + return err; 1205 + } 1206 + 1293 1207 return 0; 1294 1208 } 1295 1209 ··· 1675 1583 return restore_curseg_summaries(sbi); 1676 1584 } 1677 1585 1678 - static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) 1679 - { 1680 - struct address_space *mapping = META_MAPPING(sbi); 1681 - struct page *page; 1682 - block_t blk_addr, prev_blk_addr = 0; 1683 - int sit_blk_cnt = SIT_BLK_CNT(sbi); 1684 - int blkno = start; 1685 - struct f2fs_io_info fio = { 1686 - .type = META, 1687 - .rw = READ_SYNC | REQ_META | REQ_PRIO 1688 - }; 1689 - 1690 - for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { 1691 - 1692 - blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); 1693 - 1694 - if (blkno != start && prev_blk_addr + 1 != blk_addr) 1695 - break; 1696 - prev_blk_addr = blk_addr; 1697 - repeat: 1698 - page = grab_cache_page(mapping, blk_addr); 1699 - if (!page) { 1700 - cond_resched(); 1701 - goto repeat; 1702 - } 1703 - if (PageUptodate(page)) { 1704 - mark_page_accessed(page); 1705 - f2fs_put_page(page, 1); 1706 - continue; 1707 - } 1708 - 1709 - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); 1710 - 1711 - mark_page_accessed(page); 1712 - f2fs_put_page(page, 0); 1713 - } 1714 - 1715 - f2fs_submit_merged_bio(sbi, META, READ); 1716 - return blkno - start; 1717 - } 1718 - 1719 1586 static void build_sit_entries(struct f2fs_sb_info *sbi) 1720 1587 { 1721 1588 struct sit_info *sit_i = SIT_I(sbi); ··· 1686 1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1687 1636 1688 1637 do { 1689 - readed = ra_sit_pages(sbi, start_blk, nrpages); 1638 + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); 1690 1639 1691 1640 start = start_blk * sit_i->sents_per_block; 1692 1641 end = (start_blk + readed) * sit_i->sents_per_block; ··· 1832 1781 { 1833 1782 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1834 1783 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1784 + dev_t dev = sbi->sb->s_bdev->bd_dev; 1835 1785 struct f2fs_sm_info *sm_info; 1836 1786 int err; 1837 1787 ··· 1851 1799 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); 1852 1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1853 1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1854 - sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1802 + sm_info->rec_prefree_segments = sm_info->main_segments * 1803 + DEF_RECLAIM_PREFREE_SEGMENTS / 100; 1855 1804 sm_info->ipu_policy = F2FS_IPU_DISABLE; 1856 1805 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 1857 1806 1858 1807 INIT_LIST_HEAD(&sm_info->discard_list); 1859 1808 sm_info->nr_discards = 0; 1860 1809 sm_info->max_discards = 0; 1810 + 1811 + if (test_opt(sbi, FLUSH_MERGE)) { 1812 + spin_lock_init(&sm_info->issue_lock); 1813 + init_waitqueue_head(&sm_info->flush_wait_queue); 1814 + 1815 + sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 1816 + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 1817 + if (IS_ERR(sm_info->f2fs_issue_flush)) 1818 + return PTR_ERR(sm_info->f2fs_issue_flush); 1819 + } 1861 1820 1862 1821 err = build_sit_info(sbi); 1863 1822 if (err) ··· 1978 1915 struct f2fs_sm_info *sm_info = SM_I(sbi); 1979 1916 if (!sm_info) 1980 1917 return; 1918 + if (sm_info->f2fs_issue_flush) 1919 + kthread_stop(sm_info->f2fs_issue_flush); 1981 1920 destroy_dirty_segmap(sbi); 1982 1921 destroy_curseg(sbi); 1983 1922 destroy_free_segmap(sbi); ··· 1991 1926 int __init create_segment_manager_caches(void) 1992 1927 { 1993 1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 1994 - sizeof(struct discard_entry), NULL); 1929 + sizeof(struct discard_entry)); 1995 1930 if (!discard_entry_slab) 1996 1931 return -ENOMEM; 1932 + flush_cmd_slab = f2fs_kmem_cache_create("flush_command", 1933 + sizeof(struct flush_cmd)); 1934 + if (!flush_cmd_slab) { 1935 + kmem_cache_destroy(discard_entry_slab); 1936 + return -ENOMEM; 1937 + } 1997 1938 return 0; 1998 1939 } 1999 1940 2000 1941 void destroy_segment_manager_caches(void) 2001 1942 { 2002 1943 kmem_cache_destroy(discard_entry_slab); 1944 + kmem_cache_destroy(flush_cmd_slab); 2003 1945 }

+50 -25

fs/f2fs/segment.h

··· 14 14 #define NULL_SEGNO ((unsigned int)(~0)) 15 15 #define NULL_SECNO ((unsigned int)(~0)) 16 16 17 - #define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ 17 + #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ 18 18 19 19 /* L: Logical segment # in volume, R: Relative segment # in main area */ 20 20 #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) ··· 57 57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr) 58 58 #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 59 59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 60 + #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ 61 + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) 62 + 60 63 #define GET_SEGNO(sbi, blk_addr) \ 61 64 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ 62 65 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ ··· 380 377 381 378 static inline block_t written_block_count(struct f2fs_sb_info *sbi) 382 379 { 383 - struct sit_info *sit_i = SIT_I(sbi); 384 - block_t vblocks; 385 - 386 - mutex_lock(&sit_i->sentry_lock); 387 - vblocks = sit_i->written_valid_blocks; 388 - mutex_unlock(&sit_i->sentry_lock); 389 - 390 - return vblocks; 380 + return SIT_I(sbi)->written_valid_blocks; 391 381 } 392 382 393 383 static inline unsigned int free_segments(struct f2fs_sb_info *sbi) 394 384 { 395 - struct free_segmap_info *free_i = FREE_I(sbi); 396 - unsigned int free_segs; 397 - 398 - read_lock(&free_i->segmap_lock); 399 - free_segs = free_i->free_segments; 400 - read_unlock(&free_i->segmap_lock); 401 - 402 - return free_segs; 385 + return FREE_I(sbi)->free_segments; 403 386 } 404 387 405 388 static inline int reserved_segments(struct f2fs_sb_info *sbi) ··· 395 406 396 407 static inline unsigned int free_sections(struct f2fs_sb_info *sbi) 397 408 { 398 - struct free_segmap_info *free_i = FREE_I(sbi); 399 - unsigned int free_secs; 400 - 401 - read_lock(&free_i->segmap_lock); 402 - free_secs = free_i->free_sections; 403 - read_unlock(&free_i->segmap_lock); 404 - 405 - return free_secs; 409 + return FREE_I(sbi)->free_sections; 406 410 } 407 411 408 412 static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) ··· 663 681 struct block_device *bdev = sbi->sb->s_bdev; 664 682 struct request_queue *q = bdev_get_queue(bdev); 665 683 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 684 + } 685 + 686 + /* 687 + * It is very important to gather dirty pages and write at once, so that we can 688 + * submit a big bio without interfering other data writes. 689 + * By default, 512 pages for directory data, 690 + * 512 pages (2MB) * 3 for three types of nodes, and 691 + * max_bio_blocks for meta are set. 692 + */ 693 + static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) 694 + { 695 + if (type == DATA) 696 + return sbi->blocks_per_seg; 697 + else if (type == NODE) 698 + return 3 * sbi->blocks_per_seg; 699 + else if (type == META) 700 + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 701 + else 702 + return 0; 703 + } 704 + 705 + /* 706 + * When writing pages, it'd better align nr_to_write for segment size. 707 + */ 708 + static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, 709 + struct writeback_control *wbc) 710 + { 711 + long nr_to_write, desired; 712 + 713 + if (wbc->sync_mode != WB_SYNC_NONE) 714 + return 0; 715 + 716 + nr_to_write = wbc->nr_to_write; 717 + 718 + if (type == DATA) 719 + desired = 4096; 720 + else if (type == NODE) 721 + desired = 3 * max_hw_blocks(sbi); 722 + else 723 + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 724 + 725 + wbc->nr_to_write = desired; 726 + return desired - nr_to_write; 666 727 }

+64 -33

fs/f2fs/super.c

··· 51 51 Opt_disable_ext_identify, 52 52 Opt_inline_xattr, 53 53 Opt_inline_data, 54 + Opt_flush_merge, 54 55 Opt_err, 55 56 }; 56 57 ··· 68 67 {Opt_disable_ext_identify, "disable_ext_identify"}, 69 68 {Opt_inline_xattr, "inline_xattr"}, 70 69 {Opt_inline_data, "inline_data"}, 70 + {Opt_flush_merge, "flush_merge"}, 71 71 {Opt_err, NULL}, 72 72 }; 73 73 ··· 76 74 enum { 77 75 GC_THREAD, /* struct f2fs_gc_thread */ 78 76 SM_INFO, /* struct f2fs_sm_info */ 77 + NM_INFO, /* struct f2fs_nm_info */ 79 78 F2FS_SBI, /* struct f2fs_sb_info */ 80 79 }; 81 80 ··· 95 92 return (unsigned char *)sbi->gc_thread; 96 93 else if (struct_type == SM_INFO) 97 94 return (unsigned char *)SM_I(sbi); 95 + else if (struct_type == NM_INFO) 96 + return (unsigned char *)NM_I(sbi); 98 97 else if (struct_type == F2FS_SBI) 99 98 return (unsigned char *)sbi; 100 99 return NULL; ··· 188 183 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 189 184 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 190 185 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 186 + F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 191 187 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 188 + F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 192 189 193 190 #define ATTR_LIST(name) (&f2fs_attr_##name.attr) 194 191 static struct attribute *f2fs_attrs[] = { ··· 203 196 ATTR_LIST(ipu_policy), 204 197 ATTR_LIST(min_ipu_util), 205 198 ATTR_LIST(max_victim_search), 199 + ATTR_LIST(dir_level), 200 + ATTR_LIST(ram_thresh), 206 201 NULL, 207 202 }; 208 203 ··· 265 256 266 257 if (!name) 267 258 return -ENOMEM; 268 - if (!strncmp(name, "on", 2)) 259 + if (strlen(name) == 2 && !strncmp(name, "on", 2)) 269 260 set_opt(sbi, BG_GC); 270 - else if (!strncmp(name, "off", 3)) 261 + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) 271 262 clear_opt(sbi, BG_GC); 272 263 else { 273 264 kfree(name); ··· 336 327 case Opt_inline_data: 337 328 set_opt(sbi, INLINE_DATA); 338 329 break; 330 + case Opt_flush_merge: 331 + set_opt(sbi, FLUSH_MERGE); 332 + break; 339 333 default: 340 334 f2fs_msg(sb, KERN_ERR, 341 335 "Unrecognized mount option \"%s\" or missing value", ··· 365 353 fi->i_current_depth = 1; 366 354 fi->i_advise = 0; 367 355 rwlock_init(&fi->ext.ext_lock); 356 + init_rwsem(&fi->i_sem); 368 357 369 358 set_inode_flag(fi, FI_NEW_INODE); 370 359 371 360 if (test_opt(F2FS_SB(sb), INLINE_XATTR)) 372 361 set_inode_flag(fi, FI_INLINE_XATTR); 362 + 363 + /* Will be used by directory only */ 364 + fi->i_dir_level = F2FS_SB(sb)->dir_level; 373 365 374 366 return &fi->vfs_inode; 375 367 } ··· 542 526 seq_puts(seq, ",disable_ext_identify"); 543 527 if (test_opt(sbi, INLINE_DATA)) 544 528 seq_puts(seq, ",inline_data"); 529 + if (test_opt(sbi, FLUSH_MERGE)) 530 + seq_puts(seq, ",flush_merge"); 545 531 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 546 532 547 533 return 0; ··· 557 539 le32_to_cpu(sbi->raw_super->segment_count_main); 558 540 int i; 559 541 542 + seq_puts(seq, "format: segment_type|valid_blocks\n" 543 + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); 544 + 560 545 for (i = 0; i < total_segs; i++) { 561 - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); 562 - if (i != 0 && (i % 10) == 0) 563 - seq_puts(seq, "\n"); 546 + struct seg_entry *se = get_seg_entry(sbi, i); 547 + 548 + if ((i % 10) == 0) 549 + seq_printf(seq, "%-5d", i); 550 + seq_printf(seq, "%d|%-3u", se->type, 551 + get_valid_blocks(sbi, i, 1)); 552 + if ((i % 10) == 9 || i == (total_segs - 1)) 553 + seq_putc(seq, '\n'); 564 554 else 565 - seq_puts(seq, " "); 555 + seq_putc(seq, ' '); 566 556 } 557 + 567 558 return 0; 568 559 } 569 560 ··· 666 639 struct inode *inode; 667 640 668 641 if (unlikely(ino < F2FS_ROOT_INO(sbi))) 642 + return ERR_PTR(-ESTALE); 643 + if (unlikely(ino >= NM_I(sbi)->max_nid)) 669 644 return ERR_PTR(-ESTALE); 670 645 671 646 /* ··· 816 787 817 788 for (i = 0; i < NR_COUNT_TYPE; i++) 818 789 atomic_set(&sbi->nr_pages[i], 0); 790 + 791 + sbi->dir_level = DEF_DIR_LEVEL; 819 792 } 820 793 821 794 /* ··· 929 898 sbi->por_doing = false; 930 899 spin_lock_init(&sbi->stat_lock); 931 900 932 - mutex_init(&sbi->read_io.io_mutex); 901 + init_rwsem(&sbi->read_io.io_rwsem); 933 902 sbi->read_io.sbi = sbi; 934 903 sbi->read_io.bio = NULL; 935 904 for (i = 0; i < NR_PAGE_TYPE; i++) { 936 - mutex_init(&sbi->write_io[i].io_mutex); 905 + init_rwsem(&sbi->write_io[i].io_rwsem); 937 906 sbi->write_io[i].sbi = sbi; 938 907 sbi->write_io[i].bio = NULL; 939 908 } ··· 1022 991 goto free_root_inode; 1023 992 } 1024 993 1025 - /* recover fsynced data */ 1026 - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1027 - err = recover_fsync_data(sbi); 1028 - if (err) 1029 - f2fs_msg(sb, KERN_ERR, 1030 - "Cannot recover all fsync data errno=%ld", err); 1031 - } 1032 - 1033 - /* 1034 - * If filesystem is not mounted as read-only then 1035 - * do start the gc_thread. 1036 - */ 1037 - if (!(sb->s_flags & MS_RDONLY)) { 1038 - /* After POR, we can run background GC thread.*/ 1039 - err = start_gc_thread(sbi); 1040 - if (err) 1041 - goto free_gc; 1042 - } 1043 - 1044 994 err = f2fs_build_stats(sbi); 1045 995 if (err) 1046 - goto free_gc; 996 + goto free_root_inode; 1047 997 1048 998 if (f2fs_proc_root) 1049 999 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); ··· 1046 1034 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, 1047 1035 "%s", sb->s_id); 1048 1036 if (err) 1049 - goto fail; 1037 + goto free_proc; 1050 1038 1039 + /* recover fsynced data */ 1040 + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1041 + err = recover_fsync_data(sbi); 1042 + if (err) 1043 + f2fs_msg(sb, KERN_ERR, 1044 + "Cannot recover all fsync data errno=%ld", err); 1045 + } 1046 + 1047 + /* 1048 + * If filesystem is not mounted as read-only then 1049 + * do start the gc_thread. 1050 + */ 1051 + if (!(sb->s_flags & MS_RDONLY)) { 1052 + /* After POR, we can run background GC thread.*/ 1053 + err = start_gc_thread(sbi); 1054 + if (err) 1055 + goto free_kobj; 1056 + } 1051 1057 return 0; 1052 - fail: 1058 + 1059 + free_kobj: 1060 + kobject_del(&sbi->s_kobj); 1061 + free_proc: 1053 1062 if (sbi->s_proc) { 1054 1063 remove_proc_entry("segment_info", sbi->s_proc); 1055 1064 remove_proc_entry(sb->s_id, f2fs_proc_root); 1056 1065 } 1057 1066 f2fs_destroy_stats(sbi); 1058 - free_gc: 1059 - stop_gc_thread(sbi); 1060 1067 free_root_inode: 1061 1068 dput(sb->s_root); 1062 1069 sb->s_root = NULL; ··· 1115 1084 static int __init init_inodecache(void) 1116 1085 { 1117 1086 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1118 - sizeof(struct f2fs_inode_info), NULL); 1087 + sizeof(struct f2fs_inode_info)); 1119 1088 if (!f2fs_inode_cachep) 1120 1089 return -ENOMEM; 1121 1090 return 0;

+6 -1

fs/f2fs/xattr.c

··· 275 275 276 276 inline_size = inline_xattr_size(inode); 277 277 278 - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); 278 + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); 279 279 if (!txattr_addr) 280 280 return NULL; 281 281 ··· 407 407 if (name == NULL) 408 408 return -EINVAL; 409 409 name_len = strlen(name); 410 + if (name_len > F2FS_NAME_LEN) 411 + return -ERANGE; 410 412 411 413 base_addr = read_all_xattrs(inode, NULL); 412 414 if (!base_addr) ··· 592 590 f2fs_balance_fs(sbi); 593 591 594 592 f2fs_lock_op(sbi); 593 + /* protect xattr_ver */ 594 + down_write(&F2FS_I(inode)->i_sem); 595 595 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); 596 + up_write(&F2FS_I(inode)->i_sem); 596 597 f2fs_unlock_op(sbi); 597 598 598 599 return err;

+1 -1

include/linux/f2fs_fs.h

··· 183 183 __le32 i_pino; /* parent inode number */ 184 184 __le32 i_namelen; /* file name length */ 185 185 __u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */ 186 - __u8 i_reserved2; /* for backward compatibility */ 186 + __u8 i_dir_level; /* dentry_level for large dir */ 187 187 188 188 struct f2fs_extent i_ext; /* caching a largest extent */ 189 189