Merge branch '2.6.36-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev

* '2.6.36-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/xfsdev:
xfs: do not discard page cache data on EAGAIN
xfs: don't do memory allocation under the CIL context lock
xfs: Reduce log force overhead for delayed logging
xfs: dummy transactions should not dirty VFS state
xfs: ensure f_ffree returned by statfs() is non-negative
xfs: handle negative wbc->nr_to_write during sync writeback
writeback: write_cache_pages doesn't terminate at nr_to_write <= 0
xfs: fix untrusted inode number lookup
xfs: ensure we mark all inodes in a freed cluster XFS_ISTALE
xfs: unlock items before allowing the CIL to commit

+261 -218
+8 -5
fs/xfs/linux-2.6/xfs_aops.c
··· 852 SetPageUptodate(page); 853 854 if (count) { 855 - wbc->nr_to_write--; 856 - if (wbc->nr_to_write <= 0) 857 done = 1; 858 } 859 xfs_start_page_writeback(page, !page_dirty, count); ··· 1068 * by themselves. 1069 */ 1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 1071 - goto out_fail; 1072 1073 /* 1074 * We need a transaction if there are delalloc or unwritten buffers ··· 1080 */ 1081 xfs_count_page_state(page, &delalloc, &unwritten); 1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) 1083 - goto out_fail; 1084 1085 /* Is this page beyond the end of the file? */ 1086 offset = i_size_read(inode); ··· 1245 if (iohead) 1246 xfs_cancel_ioend(iohead); 1247 1248 xfs_aops_discard_page(page); 1249 ClearPageUptodate(page); 1250 unlock_page(page); 1251 return err; 1252 1253 - out_fail: 1254 redirty_page_for_writepage(wbc, page); 1255 unlock_page(page); 1256 return 0;
··· 852 SetPageUptodate(page); 853 854 if (count) { 855 + if (--wbc->nr_to_write <= 0 && 856 + wbc->sync_mode == WB_SYNC_NONE) 857 done = 1; 858 } 859 xfs_start_page_writeback(page, !page_dirty, count); ··· 1068 * by themselves. 1069 */ 1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 1071 + goto redirty; 1072 1073 /* 1074 * We need a transaction if there are delalloc or unwritten buffers ··· 1080 */ 1081 xfs_count_page_state(page, &delalloc, &unwritten); 1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) 1083 + goto redirty; 1084 1085 /* Is this page beyond the end of the file? */ 1086 offset = i_size_read(inode); ··· 1245 if (iohead) 1246 xfs_cancel_ioend(iohead); 1247 1248 + if (err == -EAGAIN) 1249 + goto redirty; 1250 + 1251 xfs_aops_discard_page(page); 1252 ClearPageUptodate(page); 1253 unlock_page(page); 1254 return err; 1255 1256 + redirty: 1257 redirty_page_for_writepage(wbc, page); 1258 unlock_page(page); 1259 return 0;
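
The xfs_aops.c change above separates transient failures from hard ones: an EAGAIN from the mapping path now redirties the page for a later writeback pass instead of discarding cached data, and only non-integrity writeback terminates when nr_to_write runs out. A minimal userspace sketch of that error-handling shape (struct fake_page and map_blocks() are invented for illustration, not kernel APIs):

#include <errno.h>
#include <stdio.h>

struct fake_page {
	int dirty;
	int uptodate;
};

/* stub that pretends the block mapping path could not get its locks */
static int map_blocks(struct fake_page *page)
{
	(void)page;
	return -EAGAIN;
}

/*
 * Transient errors redirty the page so the data survives for a later
 * writeback pass; hard errors drop the cached data and are reported.
 */
static int example_writepage(struct fake_page *page)
{
	int err = map_blocks(page);

	if (!err)
		return 0;
	if (err == -EAGAIN) {
		page->dirty = 1;
		return 0;
	}
	page->uptodate = 0;
	return err;
}

int main(void)
{
	struct fake_page page = { .dirty = 0, .uptodate = 1 };

	printf("ret=%d dirty=%d uptodate=%d\n",
	       example_writepage(&page), page.dirty, page.uptodate);
	return 0;
}
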
+7 -2
fs/xfs/linux-2.6/xfs_super.c
··· 1226 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1227 __uint64_t fakeinos, id; 1228 xfs_extlen_t lsize; 1229 1230 statp->f_type = XFS_SB_MAGIC; 1231 statp->f_namelen = MAXNAMELEN - 1; ··· 1250 statp->f_files = min_t(typeof(statp->f_files), 1251 statp->f_files, 1252 mp->m_maxicount); 1253 - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1254 spin_unlock(&mp->m_sb_lock); 1255 1256 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || ··· 1407 1408 xfs_save_resvblks(mp); 1409 xfs_quiesce_attr(mp); 1410 - return -xfs_fs_log_dummy(mp); 1411 } 1412 1413 STATIC int
··· 1226 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1227 __uint64_t fakeinos, id; 1228 xfs_extlen_t lsize; 1229 + __int64_t ffree; 1230 1231 statp->f_type = XFS_SB_MAGIC; 1232 statp->f_namelen = MAXNAMELEN - 1; ··· 1249 statp->f_files = min_t(typeof(statp->f_files), 1250 statp->f_files, 1251 mp->m_maxicount); 1252 + 1253 + /* make sure statp->f_ffree does not underflow */ 1254 + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1255 + statp->f_ffree = max_t(__int64_t, ffree, 0); 1256 + 1257 spin_unlock(&mp->m_sb_lock); 1258 1259 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || ··· 1402 1403 xfs_save_resvblks(mp); 1404 xfs_quiesce_attr(mp); 1405 + return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1406 } 1407 1408 STATIC int
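
The statfs() part of the xfs_super.c change above computes the free inode count in a signed temporary and clamps it at zero, so the unsigned f_ffree field can never wrap to a huge bogus value when more inodes are in use than f_files reports. The same arithmetic in a self-contained form (the counts below are made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* clamp files - (icount - ifree) so the unsigned result cannot underflow */
static uint64_t safe_ffree(uint64_t files, uint64_t icount, uint64_t ifree)
{
	int64_t ffree = (int64_t)files - (int64_t)(icount - ifree);

	return ffree > 0 ? (uint64_t)ffree : 0;
}

int main(void)
{
	/* in-use inodes (1500 - 100) exceed the reported file slots */
	printf("%" PRIu64 "\n", safe_ffree(1000, 1500, 100));	/* 0, not ~2^64 */
	printf("%" PRIu64 "\n", safe_ffree(1000, 500, 100));	/* 600 */
	return 0;
}
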
+6 -36
fs/xfs/linux-2.6/xfs_sync.c
··· 34 #include "xfs_inode_item.h" 35 #include "xfs_quota.h" 36 #include "xfs_trace.h" 37 38 #include <linux/kthread.h> 39 #include <linux/freezer.h> ··· 342 } 343 344 STATIC int 345 - xfs_commit_dummy_trans( 346 - struct xfs_mount *mp, 347 - uint flags) 348 - { 349 - struct xfs_inode *ip = mp->m_rootip; 350 - struct xfs_trans *tp; 351 - int error; 352 - 353 - /* 354 - * Put a dummy transaction in the log to tell recovery 355 - * that all others are OK. 356 - */ 357 - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 358 - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 359 - if (error) { 360 - xfs_trans_cancel(tp, 0); 361 - return error; 362 - } 363 - 364 - xfs_ilock(ip, XFS_ILOCK_EXCL); 365 - 366 - xfs_trans_ijoin(tp, ip); 367 - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 368 - error = xfs_trans_commit(tp, 0); 369 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 370 - 371 - /* the log force ensures this transaction is pushed to disk */ 372 - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); 373 - return error; 374 - } 375 - 376 - STATIC int 377 xfs_sync_fsdata( 378 struct xfs_mount *mp) 379 { ··· 401 402 /* mark the log as covered if needed */ 403 if (xfs_log_need_covered(mp)) 404 - error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); 405 406 /* flush data-only devices */ 407 if (mp->m_rtdev_targp) ··· 532 /* 533 * Every sync period we need to unpin all items, reclaim inodes and sync 534 * disk quotas. We might need to cover the log to indicate that the 535 - * filesystem is idle. 536 */ 537 STATIC void 538 xfs_sync_worker( ··· 546 xfs_reclaim_inodes(mp, 0); 547 /* dgc: errors ignored here */ 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 549 - if (xfs_log_need_covered(mp)) 550 - error = xfs_commit_dummy_trans(mp, 0); 551 } 552 mp->m_sync_seq++; 553 wake_up(&mp->m_wait_single_sync_task);
··· 34 #include "xfs_inode_item.h" 35 #include "xfs_quota.h" 36 #include "xfs_trace.h" 37 + #include "xfs_fsops.h" 38 39 #include <linux/kthread.h> 40 #include <linux/freezer.h> ··· 341 } 342 343 STATIC int 344 xfs_sync_fsdata( 345 struct xfs_mount *mp) 346 { ··· 432 433 /* mark the log as covered if needed */ 434 if (xfs_log_need_covered(mp)) 435 + error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 436 437 /* flush data-only devices */ 438 if (mp->m_rtdev_targp) ··· 563 /* 564 * Every sync period we need to unpin all items, reclaim inodes and sync 565 * disk quotas. We might need to cover the log to indicate that the 566 + * filesystem is idle and not frozen. 567 */ 568 STATIC void 569 xfs_sync_worker( ··· 577 xfs_reclaim_inodes(mp, 0); 578 /* dgc: errors ignored here */ 579 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 580 + if (mp->m_super->s_frozen == SB_UNFROZEN && 581 + xfs_log_need_covered(mp)) 582 + error = xfs_fs_log_dummy(mp, 0); 583 } 584 mp->m_sync_seq++; 585 wake_up(&mp->m_wait_single_sync_task);
+18 -13
fs/xfs/xfs_fsops.c
··· 604 return 0; 605 } 606 607 int 608 xfs_fs_log_dummy( 609 - xfs_mount_t *mp) 610 { 611 xfs_trans_t *tp; 612 - xfs_inode_t *ip; 613 int error; 614 615 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 616 - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 617 if (error) { 618 xfs_trans_cancel(tp, 0); 619 return error; 620 } 621 622 - ip = mp->m_rootip; 623 - xfs_ilock(ip, XFS_ILOCK_EXCL); 624 - 625 - xfs_trans_ijoin(tp, ip); 626 - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 627 - xfs_trans_set_sync(tp); 628 - error = xfs_trans_commit(tp, 0); 629 - 630 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 631 - return error; 632 } 633 634 int
··· 604 return 0; 605 } 606 607 + /* 608 + * Dump a transaction into the log that contains no real change. This is needed 609 + * to be able to make the log dirty or stamp the current tail LSN into the log 610 + * during the covering operation. 611 + * 612 + * We cannot use an inode here for this - that will push dirty state back up 613 + * into the VFS and then periodic inode flushing will prevent log covering from 614 + * making progress. Hence we log a field in the superblock instead. 615 + */ 616 int 617 xfs_fs_log_dummy( 618 + xfs_mount_t *mp, 619 + int flags) 620 { 621 xfs_trans_t *tp; 622 int error; 623 624 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 625 + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, 626 + XFS_DEFAULT_LOG_COUNT); 627 if (error) { 628 xfs_trans_cancel(tp, 0); 629 return error; 630 } 631 632 + /* log the UUID because it is an unchanging field */ 633 + xfs_mod_sb(tp, XFS_SB_UUID); 634 + if (flags & SYNC_WAIT) 635 + xfs_trans_set_sync(tp); 636 + return xfs_trans_commit(tp, 0); 637 } 638 639 int
+1 -1
fs/xfs/xfs_fsops.h
··· 25 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 26 xfs_fsop_resblks_t *outval); 27 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 28 - extern int xfs_fs_log_dummy(xfs_mount_t *mp); 29 30 #endif /* __XFS_FSOPS_H__ */
··· 25 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 26 xfs_fsop_resblks_t *outval); 27 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 28 + extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 29 30 #endif /* __XFS_FSOPS_H__ */
+10 -6
fs/xfs/xfs_ialloc.c
··· 1213 struct xfs_inobt_rec_incore rec; 1214 struct xfs_btree_cur *cur; 1215 struct xfs_buf *agbp; 1216 - xfs_agino_t startino; 1217 int error; 1218 int i; 1219 ··· 1226 } 1227 1228 /* 1229 - * derive and lookup the exact inode record for the given agino. If the 1230 - * record cannot be found, then it's an invalid inode number and we 1231 - * should abort. 1232 */ 1233 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1234 - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); 1235 - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); 1236 if (!error) { 1237 if (i) 1238 error = xfs_inobt_get_rec(cur, &rec, &i); ··· 1244 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1245 if (error) 1246 return error; 1247 1248 /* for untrusted inodes check it is allocated first */ 1249 if ((flags & XFS_IGET_UNTRUSTED) &&
··· 1213 struct xfs_inobt_rec_incore rec; 1214 struct xfs_btree_cur *cur; 1215 struct xfs_buf *agbp; 1216 int error; 1217 int i; 1218 ··· 1227 } 1228 1229 /* 1230 + * Lookup the inode record for the given agino. If the record cannot be 1231 + * found, then it's an invalid inode number and we should abort. Once 1232 + * we have a record, we need to ensure it contains the inode number 1233 + * we are looking up. 1234 */ 1235 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1236 + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); 1237 if (!error) { 1238 if (i) 1239 error = xfs_inobt_get_rec(cur, &rec, &i); ··· 1245 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1246 if (error) 1247 return error; 1248 + 1249 + /* check that the returned record contains the required inode */ 1250 + if (rec.ir_startino > agino || 1251 + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) 1252 + return EINVAL; 1253 1254 /* for untrusted inodes check it is allocated first */ 1255 if ((flags & XFS_IGET_UNTRUSTED) &&
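
The xfs_ialloc.c change above switches the untrusted lookup to a less-than-or-equal btree search and then verifies that the returned chunk record actually contains the requested inode, returning EINVAL otherwise. The containment test in isolation (64 inodes per chunk is assumed purely for this example; the real geometry comes from the superblock):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK 64	/* assumed chunk size for this example */

/* does the chunk record starting at startino cover agino? */
static bool chunk_contains(uint32_t startino, uint32_t agino)
{
	return startino <= agino && agino < startino + INODES_PER_CHUNK;
}

int main(void)
{
	/* a <= lookup for inode 200 that lands on a chunk starting at 128 */
	printf("%d\n", chunk_contains(128, 200));	/* 0 -> reject as invalid */
	printf("%d\n", chunk_contains(192, 200));	/* 1 -> valid inode number */
	return 0;
}
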
+26 -23
fs/xfs/xfs_inode.c
··· 1914 return 0; 1915 } 1916 1917 STATIC void 1918 xfs_ifree_cluster( 1919 xfs_inode_t *free_ip, ··· 1950 } 1951 1952 for (j = 0; j < nbufs; j++, inum += ninodes) { 1953 - int found = 0; 1954 - 1955 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1956 XFS_INO_TO_AGBNO(mp, inum)); 1957 ··· 1968 /* 1969 * Walk the inodes already attached to the buffer and mark them 1970 * stale. These will all have the flush locks held, so an 1971 - * in-memory inode walk can't lock them. 1972 */ 1973 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1974 while (lip) { ··· 1982 &iip->ili_flush_lsn, 1983 &iip->ili_item.li_lsn); 1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1985 - found++; 1986 } 1987 lip = lip->li_bio_list; 1988 } 1989 1990 /* 1991 * For each inode in memory attempt to add it to the inode ··· 1998 * even trying to lock them. 1999 */ 2000 for (i = 0; i < ninodes; i++) { 2001 read_lock(&pag->pag_ici_lock); 2002 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 XFS_INO_TO_AGINO(mp, (inum + i))); ··· 2009 continue; 2010 } 2011 2012 - /* don't try to lock/unlock the current inode */ 2013 if (ip != free_ip && 2014 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2015 read_unlock(&pag->pag_ici_lock); 2016 - continue; 2017 } 2018 read_unlock(&pag->pag_ici_lock); 2019 2020 - if (!xfs_iflock_nowait(ip)) { 2021 - if (ip != free_ip) 2022 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 2023 - continue; 2024 - } 2025 - 2026 xfs_iflags_set(ip, XFS_ISTALE); 2027 - if (xfs_inode_clean(ip)) { 2028 - ASSERT(ip != free_ip); 2029 - xfs_ifunlock(ip); 2030 - xfs_iunlock(ip, XFS_ILOCK_EXCL); 2031 - continue; 2032 - } 2033 2034 iip = ip->i_itemp; 2035 - if (!iip) { 2036 - /* inode with unlogged changes only */ 2037 ASSERT(ip != free_ip); 2038 ip->i_update_core = 0; 2039 xfs_ifunlock(ip); 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2041 continue; 2042 } 2043 - found++; 2044 2045 iip->ili_last_fields = iip->ili_format.ilf_fields; 2046 iip->ili_format.ilf_fields = 0; ··· 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2054 } 2055 2056 - if (found) 2057 - xfs_trans_stale_inode_buf(tp, bp); 2058 xfs_trans_binval(tp, bp); 2059 } 2060
··· 1914 return 0; 1915 } 1916 1917 + /* 1918 + * A big issue when freeing the inode cluster is is that we _cannot_ skip any 1919 + * inodes that are in memory - they all must be marked stale and attached to 1920 + * the cluster buffer. 1921 + */ 1922 STATIC void 1923 xfs_ifree_cluster( 1924 xfs_inode_t *free_ip, ··· 1945 } 1946 1947 for (j = 0; j < nbufs; j++, inum += ninodes) { 1948 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1949 XFS_INO_TO_AGBNO(mp, inum)); 1950 ··· 1965 /* 1966 * Walk the inodes already attached to the buffer and mark them 1967 * stale. These will all have the flush locks held, so an 1968 + * in-memory inode walk can't lock them. By marking them all 1969 + * stale first, we will not attempt to lock them in the loop 1970 + * below as the XFS_ISTALE flag will be set. 1971 */ 1972 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1973 while (lip) { ··· 1977 &iip->ili_flush_lsn, 1978 &iip->ili_item.li_lsn); 1979 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1980 } 1981 lip = lip->li_bio_list; 1982 } 1983 + 1984 1985 /* 1986 * For each inode in memory attempt to add it to the inode ··· 1993 * even trying to lock them. 1994 */ 1995 for (i = 0; i < ninodes; i++) { 1996 + retry: 1997 read_lock(&pag->pag_ici_lock); 1998 ip = radix_tree_lookup(&pag->pag_ici_root, 1999 XFS_INO_TO_AGINO(mp, (inum + i))); ··· 2003 continue; 2004 } 2005 2006 + /* 2007 + * Don't try to lock/unlock the current inode, but we 2008 + * _cannot_ skip the other inodes that we did not find 2009 + * in the list attached to the buffer and are not 2010 + * already marked stale. If we can't lock it, back off 2011 + * and retry. 2012 + */ 2013 if (ip != free_ip && 2014 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2015 read_unlock(&pag->pag_ici_lock); 2016 + delay(1); 2017 + goto retry; 2018 } 2019 read_unlock(&pag->pag_ici_lock); 2020 2021 + xfs_iflock(ip); 2022 xfs_iflags_set(ip, XFS_ISTALE); 2023 2024 + /* 2025 + * we don't need to attach clean inodes or those only 2026 + * with unlogged changes (which we throw away, anyway). 2027 + */ 2028 iip = ip->i_itemp; 2029 + if (!iip || xfs_inode_clean(ip)) { 2030 ASSERT(ip != free_ip); 2031 ip->i_update_core = 0; 2032 xfs_ifunlock(ip); 2033 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2034 continue; 2035 } 2036 2037 iip->ili_last_fields = iip->ili_format.ilf_fields; 2038 iip->ili_format.ilf_fields = 0; ··· 2049 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 } 2051 2052 + xfs_trans_stale_inode_buf(tp, bp); 2053 xfs_trans_binval(tp, bp); 2054 } 2055
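
The xfs_ifree_cluster() change above can no longer skip an in-memory inode it fails to trylock; it backs off briefly and retries until every inode in the cluster has been marked stale. A small pthread sketch of that back-off-and-retry idiom (struct obj and mark_stale() are invented for illustration, not the XFS code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct obj {
	pthread_mutex_t lock;
	int stale;
};

/*
 * Every object must end up marked; on lock contention we back off and
 * retry rather than silently skipping the object.
 */
static void mark_stale(struct obj *o)
{
	while (pthread_mutex_trylock(&o->lock) != 0)
		usleep(1000);	/* give the current holder time to finish */

	o->stale = 1;
	pthread_mutex_unlock(&o->lock);
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0 };

	mark_stale(&o);
	printf("stale=%d\n", o.stale);
	return 0;
}

Build with -pthread; only the control flow is the point here, not the locking granularity.
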
+4 -3
fs/xfs/xfs_log.c
··· 3015 3016 XFS_STATS_INC(xs_log_force); 3017 3018 - xlog_cil_push(log, 1); 3019 3020 spin_lock(&log->l_icloglock); 3021 ··· 3168 XFS_STATS_INC(xs_log_force); 3169 3170 if (log->l_cilp) { 3171 - lsn = xlog_cil_push_lsn(log, lsn); 3172 if (lsn == NULLCOMMITLSN) 3173 return 0; 3174 } ··· 3725 * call below. 3726 */ 3727 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) 3728 - xlog_cil_push(log, 1); 3729 3730 /* 3731 * We must hold both the GRANT lock and the LOG lock,
··· 3015 3016 XFS_STATS_INC(xs_log_force); 3017 3018 + if (log->l_cilp) 3019 + xlog_cil_force(log); 3020 3021 spin_lock(&log->l_icloglock); 3022 ··· 3167 XFS_STATS_INC(xs_log_force); 3168 3169 if (log->l_cilp) { 3170 + lsn = xlog_cil_force_lsn(log, lsn); 3171 if (lsn == NULLCOMMITLSN) 3172 return 0; 3173 } ··· 3724 * call below. 3725 */ 3726 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) 3727 + xlog_cil_force(log); 3728 3729 /* 3730 * We must hold both the GRANT lock and the LOG lock,
+157 -106
fs/xfs/xfs_log_cil.c
··· 68 ctx->sequence = 1; 69 ctx->cil = cil; 70 cil->xc_ctx = ctx; 71 72 cil->xc_log = log; 73 log->l_cilp = cil; ··· 270 static void 271 xlog_cil_format_items( 272 struct log *log, 273 - struct xfs_log_vec *log_vector, 274 - struct xlog_ticket *ticket, 275 - xfs_lsn_t *start_lsn) 276 { 277 struct xfs_log_vec *lv; 278 - 279 - if (start_lsn) 280 - *start_lsn = log->l_cilp->xc_ctx->sequence; 281 282 ASSERT(log_vector); 283 for (lv = log_vector; lv; lv = lv->lv_next) { ··· 297 ptr += vec->i_len; 298 } 299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); 300 - 301 - xlog_cil_insert(log, ticket, lv->lv_item, lv); 302 } 303 } 304 305 static void ··· 329 kmem_free(lv); 330 lv = next; 331 } 332 - } 333 - 334 - /* 335 - * Commit a transaction with the given vector to the Committed Item List. 336 - * 337 - * To do this, we need to format the item, pin it in memory if required and 338 - * account for the space used by the transaction. Once we have done that we 339 - * need to release the unused reservation for the transaction, attach the 340 - * transaction to the checkpoint context so we carry the busy extents through 341 - * to checkpoint completion, and then unlock all the items in the transaction. 342 - * 343 - * For more specific information about the order of operations in 344 - * xfs_log_commit_cil() please refer to the comments in 345 - * xfs_trans_commit_iclog(). 346 - * 347 - * Called with the context lock already held in read mode to lock out 348 - * background commit, returns without it held once background commits are 349 - * allowed again. 350 - */ 351 - int 352 - xfs_log_commit_cil( 353 - struct xfs_mount *mp, 354 - struct xfs_trans *tp, 355 - struct xfs_log_vec *log_vector, 356 - xfs_lsn_t *commit_lsn, 357 - int flags) 358 - { 359 - struct log *log = mp->m_log; 360 - int log_flags = 0; 361 - int push = 0; 362 - 363 - if (flags & XFS_TRANS_RELEASE_LOG_RES) 364 - log_flags = XFS_LOG_REL_PERM_RESERV; 365 - 366 - if (XLOG_FORCED_SHUTDOWN(log)) { 367 - xlog_cil_free_logvec(log_vector); 368 - return XFS_ERROR(EIO); 369 - } 370 - 371 - /* lock out background commit */ 372 - down_read(&log->l_cilp->xc_ctx_lock); 373 - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); 374 - 375 - /* check we didn't blow the reservation */ 376 - if (tp->t_ticket->t_curr_res < 0) 377 - xlog_print_tic_res(log->l_mp, tp->t_ticket); 378 - 379 - /* attach the transaction to the CIL if it has any busy extents */ 380 - if (!list_empty(&tp->t_busy)) { 381 - spin_lock(&log->l_cilp->xc_cil_lock); 382 - list_splice_init(&tp->t_busy, 383 - &log->l_cilp->xc_ctx->busy_extents); 384 - spin_unlock(&log->l_cilp->xc_cil_lock); 385 - } 386 - 387 - tp->t_commit_lsn = *commit_lsn; 388 - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 389 - xfs_trans_unreserve_and_mod_sb(tp); 390 - 391 - /* check for background commit before unlock */ 392 - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) 393 - push = 1; 394 - up_read(&log->l_cilp->xc_ctx_lock); 395 - 396 - /* 397 - * We need to push CIL every so often so we don't cache more than we 398 - * can fit in the log. The limit really is that a checkpoint can't be 399 - * more than half the log (the current checkpoint is not allowed to 400 - * overwrite the previous checkpoint), but commit latency and memory 401 - * usage limit this to a smaller size in most cases. 402 - */ 403 - if (push) 404 - xlog_cil_push(log, 0); 405 - return 0; 406 } 407 408 /* ··· 364 } 365 366 /* 367 - * Push the Committed Item List to the log. If the push_now flag is not set, 368 - * then it is a background flush and so we can chose to ignore it. 369 */ 370 - int 371 xlog_cil_push( 372 struct log *log, 373 - int push_now) 374 { 375 struct xfs_cil *cil = log->l_cilp; 376 struct xfs_log_vec *lv; ··· 400 if (!cil) 401 return 0; 402 403 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 404 new_ctx->ticket = xlog_cil_ticket_alloc(log); 405 406 /* lock out transaction commit, but don't block on background push */ 407 if (!down_write_trylock(&cil->xc_ctx_lock)) { 408 - if (!push_now) 409 goto out_free_ticket; 410 down_write(&cil->xc_ctx_lock); 411 } ··· 418 goto out_skip; 419 420 /* check for spurious background flush */ 421 - if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 422 goto out_skip; 423 424 /* ··· 466 new_ctx->sequence = ctx->sequence + 1; 467 new_ctx->cil = cil; 468 cil->xc_ctx = new_ctx; 469 470 /* 471 * The switch is now done, so we can drop the context lock and move out ··· 586 } 587 588 /* 589 * Conditionally push the CIL based on the sequence passed in. 590 * 591 * We only need to push if we haven't already pushed the sequence ··· 695 * commit lsn is there. It'll be empty, so this is broken for now. 696 */ 697 xfs_lsn_t 698 - xlog_cil_push_lsn( 699 struct log *log, 700 - xfs_lsn_t push_seq) 701 { 702 struct xfs_cil *cil = log->l_cilp; 703 struct xfs_cil_ctx *ctx; 704 xfs_lsn_t commit_lsn = NULLCOMMITLSN; 705 706 - restart: 707 - down_write(&cil->xc_ctx_lock); 708 - ASSERT(push_seq <= cil->xc_ctx->sequence); 709 710 - /* check to see if we need to force out the current context */ 711 - if (push_seq == cil->xc_ctx->sequence) { 712 - up_write(&cil->xc_ctx_lock); 713 - xlog_cil_push(log, 1); 714 - goto restart; 715 - } 716 717 /* 718 * See if we can find a previous sequence still committing. 719 - * We can drop the flush lock as soon as we have the cil lock 720 - * because we are now only comparing contexts protected by 721 - * the cil lock. 722 - * 723 * We need to wait for all previous sequence commits to complete 724 * before allowing the force of push_seq to go ahead. Hence block 725 * on commits for those as well. 726 */ 727 spin_lock(&cil->xc_cil_lock); 728 - up_write(&cil->xc_ctx_lock); 729 list_for_each_entry(ctx, &cil->xc_committing, committing) { 730 - if (ctx->sequence > push_seq) 731 continue; 732 if (!ctx->commit_lsn) { 733 /* ··· 732 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 733 goto restart; 734 } 735 - if (ctx->sequence != push_seq) 736 continue; 737 /* found it! */ 738 commit_lsn = ctx->commit_lsn;
··· 68 ctx->sequence = 1; 69 ctx->cil = cil; 70 cil->xc_ctx = ctx; 71 + cil->xc_current_sequence = ctx->sequence; 72 73 cil->xc_log = log; 74 log->l_cilp = cil; ··· 269 static void 270 xlog_cil_format_items( 271 struct log *log, 272 + struct xfs_log_vec *log_vector) 273 { 274 struct xfs_log_vec *lv; 275 276 ASSERT(log_vector); 277 for (lv = log_vector; lv; lv = lv->lv_next) { ··· 301 ptr += vec->i_len; 302 } 303 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); 304 } 305 + } 306 + 307 + static void 308 + xlog_cil_insert_items( 309 + struct log *log, 310 + struct xfs_log_vec *log_vector, 311 + struct xlog_ticket *ticket, 312 + xfs_lsn_t *start_lsn) 313 + { 314 + struct xfs_log_vec *lv; 315 + 316 + if (start_lsn) 317 + *start_lsn = log->l_cilp->xc_ctx->sequence; 318 + 319 + ASSERT(log_vector); 320 + for (lv = log_vector; lv; lv = lv->lv_next) 321 + xlog_cil_insert(log, ticket, lv->lv_item, lv); 322 } 323 324 static void ··· 318 kmem_free(lv); 319 lv = next; 320 } 321 } 322 323 /* ··· 427 } 428 429 /* 430 + * Push the Committed Item List to the log. If @push_seq flag is zero, then it 431 + * is a background flush and so we can chose to ignore it. Otherwise, if the 432 + * current sequence is the same as @push_seq we need to do a flush. If 433 + * @push_seq is less than the current sequence, then it has already been 434 + * flushed and we don't need to do anything - the caller will wait for it to 435 + * complete if necessary. 436 + * 437 + * @push_seq is a value rather than a flag because that allows us to do an 438 + * unlocked check of the sequence number for a match. Hence we can allows log 439 + * forces to run racily and not issue pushes for the same sequence twice. If we 440 + * get a race between multiple pushes for the same sequence they will block on 441 + * the first one and then abort, hence avoiding needless pushes. 442 + */ 443 + STATIC int 444 xlog_cil_push( 445 struct log *log, 446 + xfs_lsn_t push_seq) 447 { 448 struct xfs_cil *cil = log->l_cilp; 449 struct xfs_log_vec *lv; ··· 453 if (!cil) 454 return 0; 455 456 + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); 457 + 458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 459 new_ctx->ticket = xlog_cil_ticket_alloc(log); 460 461 /* lock out transaction commit, but don't block on background push */ 462 if (!down_write_trylock(&cil->xc_ctx_lock)) { 463 + if (!push_seq) 464 goto out_free_ticket; 465 down_write(&cil->xc_ctx_lock); 466 } ··· 469 goto out_skip; 470 471 /* check for spurious background flush */ 472 + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 473 + goto out_skip; 474 475 + /* check for a previously pushed seqeunce */ 476 + if (push_seq < cil->xc_ctx->sequence) 477 goto out_skip; 478 479 /* ··· 513 new_ctx->sequence = ctx->sequence + 1; 514 new_ctx->cil = cil; 515 cil->xc_ctx = new_ctx; 516 + 517 + /* 518 + * mirror the new sequence into the cil structure so that we can do 519 + * unlocked checks against the current sequence in log forces without 520 + * risking deferencing a freed context pointer. 521 + */ 522 + cil->xc_current_sequence = new_ctx->sequence; 523 524 /* 525 * The switch is now done, so we can drop the context lock and move out ··· 626 } 627 628 /* 629 + * Commit a transaction with the given vector to the Committed Item List. 630 + * 631 + * To do this, we need to format the item, pin it in memory if required and 632 + * account for the space used by the transaction. Once we have done that we 633 + * need to release the unused reservation for the transaction, attach the 634 + * transaction to the checkpoint context so we carry the busy extents through 635 + * to checkpoint completion, and then unlock all the items in the transaction. 636 + * 637 + * For more specific information about the order of operations in 638 + * xfs_log_commit_cil() please refer to the comments in 639 + * xfs_trans_commit_iclog(). 640 + * 641 + * Called with the context lock already held in read mode to lock out 642 + * background commit, returns without it held once background commits are 643 + * allowed again. 644 + */ 645 + int 646 + xfs_log_commit_cil( 647 + struct xfs_mount *mp, 648 + struct xfs_trans *tp, 649 + struct xfs_log_vec *log_vector, 650 + xfs_lsn_t *commit_lsn, 651 + int flags) 652 + { 653 + struct log *log = mp->m_log; 654 + int log_flags = 0; 655 + int push = 0; 656 + 657 + if (flags & XFS_TRANS_RELEASE_LOG_RES) 658 + log_flags = XFS_LOG_REL_PERM_RESERV; 659 + 660 + if (XLOG_FORCED_SHUTDOWN(log)) { 661 + xlog_cil_free_logvec(log_vector); 662 + return XFS_ERROR(EIO); 663 + } 664 + 665 + /* 666 + * do all the hard work of formatting items (including memory 667 + * allocation) outside the CIL context lock. This prevents stalling CIL 668 + * pushes when we are low on memory and a transaction commit spends a 669 + * lot of time in memory reclaim. 670 + */ 671 + xlog_cil_format_items(log, log_vector); 672 + 673 + /* lock out background commit */ 674 + down_read(&log->l_cilp->xc_ctx_lock); 675 + xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 676 + 677 + /* check we didn't blow the reservation */ 678 + if (tp->t_ticket->t_curr_res < 0) 679 + xlog_print_tic_res(log->l_mp, tp->t_ticket); 680 + 681 + /* attach the transaction to the CIL if it has any busy extents */ 682 + if (!list_empty(&tp->t_busy)) { 683 + spin_lock(&log->l_cilp->xc_cil_lock); 684 + list_splice_init(&tp->t_busy, 685 + &log->l_cilp->xc_ctx->busy_extents); 686 + spin_unlock(&log->l_cilp->xc_cil_lock); 687 + } 688 + 689 + tp->t_commit_lsn = *commit_lsn; 690 + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 691 + xfs_trans_unreserve_and_mod_sb(tp); 692 + 693 + /* 694 + * Once all the items of the transaction have been copied to the CIL, 695 + * the items can be unlocked and freed. 696 + * 697 + * This needs to be done before we drop the CIL context lock because we 698 + * have to update state in the log items and unlock them before they go 699 + * to disk. If we don't, then the CIL checkpoint can race with us and 700 + * we can run checkpoint completion before we've updated and unlocked 701 + * the log items. This affects (at least) processing of stale buffers, 702 + * inodes and EFIs. 703 + */ 704 + xfs_trans_free_items(tp, *commit_lsn, 0); 705 + 706 + /* check for background commit before unlock */ 707 + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) 708 + push = 1; 709 + 710 + up_read(&log->l_cilp->xc_ctx_lock); 711 + 712 + /* 713 + * We need to push CIL every so often so we don't cache more than we 714 + * can fit in the log. The limit really is that a checkpoint can't be 715 + * more than half the log (the current checkpoint is not allowed to 716 + * overwrite the previous checkpoint), but commit latency and memory 717 + * usage limit this to a smaller size in most cases. 718 + */ 719 + if (push) 720 + xlog_cil_push(log, 0); 721 + return 0; 722 + } 723 + 724 + /* 725 + * Conditionally push the CIL based on the sequence passed in. 726 * 727 * We only need to push if we haven't already pushed the sequence ··· 639 * commit lsn is there. It'll be empty, so this is broken for now. 640 */ 641 xfs_lsn_t 642 + xlog_cil_force_lsn( 643 struct log *log, 644 + xfs_lsn_t sequence) 645 { 646 struct xfs_cil *cil = log->l_cilp; 647 struct xfs_cil_ctx *ctx; 648 xfs_lsn_t commit_lsn = NULLCOMMITLSN; 649 650 + ASSERT(sequence <= cil->xc_current_sequence); 651 652 + /* 653 + * check to see if we need to force out the current context. 654 + * xlog_cil_push() handles racing pushes for the same sequence, 655 + * so no need to deal with it here. 656 + */ 657 + if (sequence == cil->xc_current_sequence) 658 + xlog_cil_push(log, sequence); 659 660 /* 661 * See if we can find a previous sequence still committing. 662 * We need to wait for all previous sequence commits to complete 663 * before allowing the force of push_seq to go ahead. Hence block 664 * on commits for those as well. 665 */ 666 + restart: 667 spin_lock(&cil->xc_cil_lock); 668 list_for_each_entry(ctx, &cil->xc_committing, committing) { 669 + if (ctx->sequence > sequence) 670 continue; 671 if (!ctx->commit_lsn) { 672 /* ··· 681 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 682 goto restart; 683 } 684 + if (ctx->sequence != sequence) 685 continue; 686 /* found it! */ 687 commit_lsn = ctx->commit_lsn;
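
The delayed-logging rework above keys CIL pushes on a sequence number: xc_current_sequence mirrors the active context so a log force can do an unlocked comparison, and a racing push for an already-flushed sequence simply backs out under the lock. A compact sketch of that idea using a plain counter and a mutex (the names and structure are illustrative only, not the XFS implementation):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long current_seq = 1;	/* mirrored, read without the lock */
static unsigned long pushes;

/*
 * Push the given sequence; a racing push for the same sequence finds the
 * counter already advanced and backs out without doing duplicate work.
 */
static void push(unsigned long seq)
{
	pthread_mutex_lock(&ctx_lock);
	if (seq < current_seq) {	/* already pushed by someone else */
		pthread_mutex_unlock(&ctx_lock);
		return;
	}
	pushes++;
	current_seq = seq + 1;		/* open the next sequence */
	pthread_mutex_unlock(&ctx_lock);
}

static void force(unsigned long seq)
{
	/* unlocked check: only push if the target is still the current one */
	if (seq == current_seq)
		push(seq);
}

int main(void)
{
	force(1);
	force(1);	/* second force of the same sequence is a no-op */
	printf("pushes=%lu current_seq=%lu\n", pushes, current_seq);
	return 0;
}

Build with -pthread; the sketch only shows why duplicate forces for one sequence collapse into a single push.
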
+11 -2
fs/xfs/xfs_log_priv.h
··· 422 struct rw_semaphore xc_ctx_lock; 423 struct list_head xc_committing; 424 sv_t xc_commit_wait; 425 }; 426 427 /* ··· 563 void xlog_cil_init_post_recovery(struct log *log); 564 void xlog_cil_destroy(struct log *log); 565 566 - int xlog_cil_push(struct log *log, int push_now); 567 - xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); 568 569 /* 570 * Unmount record type is used as a pseudo transaction type for the ticket.
··· 422 struct rw_semaphore xc_ctx_lock; 423 struct list_head xc_committing; 424 sv_t xc_commit_wait; 425 + xfs_lsn_t xc_current_sequence; 426 }; 427 428 /* ··· 562 void xlog_cil_init_post_recovery(struct log *log); 563 void xlog_cil_destroy(struct log *log); 564 565 + /* 566 + * CIL force routines 567 + */ 568 + xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); 569 + 570 + static inline void 571 + xlog_cil_force(struct log *log) 572 + { 573 + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); 574 + } 575 576 /* 577 * Unmount record type is used as a pseudo transaction type for the ticket.
+1 -4
fs/xfs/xfs_trans.c
··· 1167 * Unlock all of the items of a transaction and free all the descriptors 1168 * of that transaction. 1169 */ 1170 - STATIC void 1171 xfs_trans_free_items( 1172 struct xfs_trans *tp, 1173 xfs_lsn_t commit_lsn, ··· 1653 return error; 1654 1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1656 - 1657 - /* xfs_trans_free_items() unlocks them first */ 1658 - xfs_trans_free_items(tp, *commit_lsn, 0); 1659 xfs_trans_free(tp); 1660 return 0; 1661 }
··· 1167 * Unlock all of the items of a transaction and free all the descriptors 1168 * of that transaction. 1169 */ 1170 + void 1171 xfs_trans_free_items( 1172 struct xfs_trans *tp, 1173 xfs_lsn_t commit_lsn, ··· 1653 return error; 1654 1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1656 xfs_trans_free(tp); 1657 return 0; 1658 }
+2 -1
fs/xfs/xfs_trans_priv.h
··· 25 26 void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 27 void xfs_trans_del_item(struct xfs_log_item *); 28 - 29 void xfs_trans_item_committed(struct xfs_log_item *lip, 30 xfs_lsn_t commit_lsn, int aborted); 31 void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
··· 25 26 void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 27 void xfs_trans_del_item(struct xfs_log_item *); 28 + void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 29 + int flags); 30 void xfs_trans_item_committed(struct xfs_log_item *lip, 31 xfs_lsn_t commit_lsn, int aborted); 32 void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+10 -16
mm/page-writeback.c
··· 985 } 986 } 987 988 - if (wbc->nr_to_write > 0) { 989 - if (--wbc->nr_to_write == 0 && 990 - wbc->sync_mode == WB_SYNC_NONE) { 991 - /* 992 - * We stop writing back only if we are 993 - * not doing integrity sync. In case of 994 - * integrity sync we have to keep going 995 - * because someone may be concurrently 996 - * dirtying pages, and we might have 997 - * synced a lot of newly appeared dirty 998 - * pages, but have not synced all of the 999 - * old dirty pages. 1000 - */ 1001 - done = 1; 1002 - break; 1003 - } 1004 } 1005 } 1006 pagevec_release(&pvec);
··· 985 } 986 } 987 988 + /* 989 + * We stop writing back only if we are not doing 990 + * integrity sync. In case of integrity sync we have to 991 + * keep going until we have written all the pages 992 + * we tagged for writeback prior to entering this loop. 993 + */ 994 + if (--wbc->nr_to_write <= 0 && 995 + wbc->sync_mode == WB_SYNC_NONE) { 996 + done = 1; 997 + break; 998 } 999 } 1000 pagevec_release(&pvec);
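
The write_cache_pages() change above moves the nr_to_write accounting out of the old "> 0" guard so the counter may legitimately go negative during an integrity sync, and only WB_SYNC_NONE writeback terminates when the budget is exhausted. The loop logic in isolation (the page counts are arbitrary):

#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

/* returns how many pages get written given a budget and a sync mode */
static int write_pages(int dirty_pages, long nr_to_write, enum sync_mode mode)
{
	int written = 0;

	while (dirty_pages-- > 0) {
		written++;		/* "write" one page */
		if (--nr_to_write <= 0 && mode == WB_SYNC_NONE)
			break;		/* background writeback stops here */
	}
	return written;
}

int main(void)
{
	printf("%d\n", write_pages(16, 4, WB_SYNC_NONE));	/* 4: stops at the budget */
	printf("%d\n", write_pages(16, 4, WB_SYNC_ALL));	/* 16: integrity sync keeps going */
	return 0;
}
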