Merge git://oss.sgi.com:8090/xfs/linux-2.6

* git://oss.sgi.com:8090/xfs/linux-2.6:
[XFS] Don't do I/O beyond eof when unreserving space
[XFS] Fix use-after-free with buffers
[XFS] Prevent lockdep false positives when locking two inodes.
[XFS] Fix barrier status change detection.
[XFS] Prevent direct I/O from mapping extents beyond eof
[XFS] Fix regression introduced by remount fixup
[XFS] Move memory allocations for log tracing out of the critical path

+119 -47
+4
fs/xfs/linux-2.6/xfs_aops.c
··· 1338 offset = (xfs_off_t)iblock << inode->i_blkbits; 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1340 size = bh_result->b_size; 1341 error = xfs_iomap(XFS_I(inode), offset, size, 1342 create ? flags : BMAPI_READ, &iomap, &niomap); 1343 if (error)
··· 1338 offset = (xfs_off_t)iblock << inode->i_blkbits; 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1340 size = bh_result->b_size; 1341 + 1342 + if (!create && direct && offset >= i_size_read(inode)) 1343 + return 0; 1344 + 1345 error = xfs_iomap(XFS_I(inode), offset, size, 1346 create ? flags : BMAPI_READ, &iomap, &niomap); 1347 if (error)
+20
fs/xfs/linux-2.6/xfs_super.c
··· 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1303 break; 1304 default: 1305 printk(KERN_INFO 1306 "XFS: mount option \"%s\" not supported for remount\n", p); 1307 return -EINVAL; 1308 } 1309 } 1310
··· 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1303 break; 1304 default: 1305 + /* 1306 + * Logically we would return an error here to prevent 1307 + * users from believing they might have changed 1308 + * mount options using remount which can't be changed. 1309 + * 1310 + * But unfortunately mount(8) adds all options from 1311 + * mtab and fstab to the mount arguments in some cases 1312 + * so we can't blindly reject options, but have to 1313 + * check for each specified option if it actually 1314 + * differs from the currently set option and only 1315 + * reject it if that's the case. 1316 + * 1317 + * Until that is implemented we return success for 1318 + * every remount request, and silently ignore all 1319 + * options that we can't actually change. 1320 + */ 1321 + #if 0 1322 printk(KERN_INFO 1323 "XFS: mount option \"%s\" not supported for remount\n", p); 1324 return -EINVAL; 1325 + #else 1326 + return 0; 1327 + #endif 1328 } 1329 } 1330
+20 -24
fs/xfs/xfs_buf_item.c
··· 732 bip->bli_item.li_ops = &xfs_buf_item_ops; 733 bip->bli_item.li_mountp = mp; 734 bip->bli_buf = bp; 735 bip->bli_format.blf_type = XFS_LI_BUF; 736 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 737 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); ··· 868 return (bip->bli_flags & XFS_BLI_DIRTY); 869 } 870 871 /* 872 * This is called when the buf log item is no longer needed. It should 873 * free the buf log item associated with the given buffer and clear ··· 903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 904 XFS_BUF_CLR_IODONE_FUNC(bp); 905 } 906 - 907 - #ifdef XFS_TRANS_DEBUG 908 - kmem_free(bip->bli_orig); 909 - bip->bli_orig = NULL; 910 - kmem_free(bip->bli_logged); 911 - bip->bli_logged = NULL; 912 - #endif /* XFS_TRANS_DEBUG */ 913 - 914 - #ifdef XFS_BLI_TRACE 915 - ktrace_free(bip->bli_trace); 916 - #endif 917 - kmem_zone_free(xfs_buf_item_zone, bip); 918 } 919 920 ··· 1126 1127 ASSERT(bip->bli_buf == bp); 1128 1129 mp = bip->bli_item.li_mountp; 1130 1131 /* ··· 1143 * xfs_trans_delete_ail() drops the AIL lock. 1144 */ 1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1146 - 1147 - #ifdef XFS_TRANS_DEBUG 1148 - kmem_free(bip->bli_orig); 1149 - bip->bli_orig = NULL; 1150 - kmem_free(bip->bli_logged); 1151 - bip->bli_logged = NULL; 1152 - #endif /* XFS_TRANS_DEBUG */ 1153 - 1154 - #ifdef XFS_BLI_TRACE 1155 - ktrace_free(bip->bli_trace); 1156 - #endif 1157 - kmem_zone_free(xfs_buf_item_zone, bip); 1158 } 1159 1160 #if defined(XFS_BLI_TRACE)
··· 732 bip->bli_item.li_ops = &xfs_buf_item_ops; 733 bip->bli_item.li_mountp = mp; 734 bip->bli_buf = bp; 735 + xfs_buf_hold(bp); 736 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 738 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); ··· 867 return (bip->bli_flags & XFS_BLI_DIRTY); 868 } 869 870 + STATIC void 871 + xfs_buf_item_free( 872 + xfs_buf_log_item_t *bip) 873 + { 874 + #ifdef XFS_TRANS_DEBUG 875 + kmem_free(bip->bli_orig); 876 + kmem_free(bip->bli_logged); 877 + #endif /* XFS_TRANS_DEBUG */ 878 + 879 + #ifdef XFS_BLI_TRACE 880 + ktrace_free(bip->bli_trace); 881 + #endif 882 + kmem_zone_free(xfs_buf_item_zone, bip); 883 + } 884 + 885 /* 886 * This is called when the buf log item is no longer needed. It should 887 * free the buf log item associated with the given buffer and clear ··· 887 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 888 XFS_BUF_CLR_IODONE_FUNC(bp); 889 } 890 + xfs_buf_rele(bp); 891 + xfs_buf_item_free(bip); 892 } 893 894 ··· 1120 1121 ASSERT(bip->bli_buf == bp); 1122 1123 + xfs_buf_rele(bp); 1124 mp = bip->bli_item.li_mountp; 1125 1126 /* ··· 1136 * xfs_trans_delete_ail() drops the AIL lock. 1137 */ 1138 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1139 + xfs_buf_item_free(bip); 1140 } 1141 1142 #if defined(XFS_BLI_TRACE)
+8 -1
fs/xfs/xfs_dfrag.c
··· 149 150 sbp = &sxp->sx_stat; 151 152 - xfs_lock_two_inodes(ip, tip, lock_flags); 153 locked = 1; 154 155 /* Verify that both files have the same format */
··· 149 150 sbp = &sxp->sx_stat; 151 152 + /* 153 + * we have to do two separate lock calls here to keep lockdep 154 + * happy. If we try to get all the locks in one call, lockdep will 155 + * report false positives when we drop the ILOCK and regain them 156 + * below. 157 + */ 158 + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 159 + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 160 locked = 1; 161 162 /* Verify that both files have the same format */
+41 -21
fs/xfs/xfs_log.c
··· 124 STATIC int xlog_iclogs_empty(xlog_t *log); 125 126 #if defined(XFS_LOG_TRACE) 127 void 128 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 129 { 130 unsigned long cnts; 131 132 - if (!log->l_grant_trace) { 133 - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); 134 - if (!log->l_grant_trace) 135 - return; 136 - } 137 /* ticket counts are 1 byte each */ 138 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 139 ··· 168 } 169 170 void 171 xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 172 { 173 - if (!iclog->ic_trace) 174 - iclog->ic_trace = ktrace_alloc(256, KM_NOFS); 175 ktrace_enter(iclog->ic_trace, 176 (void *)((unsigned long)state), 177 (void *)((unsigned long)current_pid()), ··· 191 (void *)NULL, (void *)NULL); 192 } 193 #else 194 #define xlog_trace_loggrant(log,tic,string) 195 #define xlog_trace_iclog(iclog,state) 196 #endif /* XFS_LOG_TRACE */ 197 198 ··· 1037 * layer, it means the underlying device no longer supports 1038 * barrier I/O. Warn loudly and turn off barriers. 
1039 */ 1040 - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { 1041 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1042 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1043 "xlog_iodone: Barriers are no longer supported" ··· 1259 spin_lock_init(&log->l_grant_lock); 1260 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1261 1262 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1263 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1264 ··· 1313 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1314 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1315 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1316 1317 iclogp = &iclog->ic_next; 1318 } ··· 1596 sv_destroy(&iclog->ic_force_wait); 1597 sv_destroy(&iclog->ic_write_wait); 1598 xfs_buf_free(iclog->ic_bp); 1599 - #ifdef XFS_LOG_TRACE 1600 - if (iclog->ic_trace != NULL) { 1601 - ktrace_free(iclog->ic_trace); 1602 - } 1603 - #endif 1604 next_iclog = iclog->ic_next; 1605 kmem_free(iclog); 1606 iclog = next_iclog; ··· 1605 spinlock_destroy(&log->l_grant_lock); 1606 1607 xfs_buf_free(log->l_xbuf); 1608 - #ifdef XFS_LOG_TRACE 1609 - if (log->l_trace != NULL) { 1610 - ktrace_free(log->l_trace); 1611 - } 1612 - if (log->l_grant_trace != NULL) { 1613 - ktrace_free(log->l_grant_trace); 1614 - } 1615 - #endif 1616 log->l_mp->m_log = NULL; 1617 kmem_free(log); 1618 } /* xlog_dealloc_log */
··· 124 STATIC int xlog_iclogs_empty(xlog_t *log); 125 126 #if defined(XFS_LOG_TRACE) 127 + 128 + #define XLOG_TRACE_LOGGRANT_SIZE 2048 129 + #define XLOG_TRACE_ICLOG_SIZE 256 130 + 131 + void 132 + xlog_trace_loggrant_alloc(xlog_t *log) 133 + { 134 + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); 135 + } 136 + 137 + void 138 + xlog_trace_loggrant_dealloc(xlog_t *log) 139 + { 140 + ktrace_free(log->l_grant_trace); 141 + } 142 + 143 void 144 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 145 { 146 unsigned long cnts; 147 148 /* ticket counts are 1 byte each */ 149 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 150 ··· 157 } 158 159 void 160 + xlog_trace_iclog_alloc(xlog_in_core_t *iclog) 161 + { 162 + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); 163 + } 164 + 165 + void 166 + xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) 167 + { 168 + ktrace_free(iclog->ic_trace); 169 + } 170 + 171 + void 172 xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 173 { 174 ktrace_enter(iclog->ic_trace, 175 (void *)((unsigned long)state), 176 (void *)((unsigned long)current_pid()), ··· 170 (void *)NULL, (void *)NULL); 171 } 172 #else 173 + 174 + #define xlog_trace_loggrant_alloc(log) 175 + #define xlog_trace_loggrant_dealloc(log) 176 #define xlog_trace_loggrant(log,tic,string) 177 + 178 + #define xlog_trace_iclog_alloc(iclog) 179 + #define xlog_trace_iclog_dealloc(iclog) 180 #define xlog_trace_iclog(iclog,state) 181 + 182 #endif /* XFS_LOG_TRACE */ 183 184 ··· 1009 * layer, it means the underlying device no longer supports 1010 * barrier I/O. Warn loudly and turn off barriers. 
1011 */ 1012 + if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) { 1013 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1014 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1015 "xlog_iodone: Barriers are no longer supported" ··· 1231 spin_lock_init(&log->l_grant_lock); 1232 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1233 1234 + xlog_trace_loggrant_alloc(log); 1235 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1236 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1237 ··· 1284 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1285 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1286 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1287 + 1288 + xlog_trace_iclog_alloc(iclog); 1289 1290 iclogp = &iclog->ic_next; 1291 } ··· 1565 sv_destroy(&iclog->ic_force_wait); 1566 sv_destroy(&iclog->ic_write_wait); 1567 xfs_buf_free(iclog->ic_bp); 1568 + xlog_trace_iclog_dealloc(iclog); 1569 next_iclog = iclog->ic_next; 1570 kmem_free(iclog); 1571 iclog = next_iclog; ··· 1578 spinlock_destroy(&log->l_grant_lock); 1579 1580 xfs_buf_free(log->l_xbuf); 1581 + xlog_trace_loggrant_dealloc(log); 1582 log->l_mp->m_log = NULL; 1583 kmem_free(log); 1584 } /* xlog_dealloc_log */
-1
fs/xfs/xfs_log_priv.h
··· 448 int l_grant_write_bytes; 449 450 #ifdef XFS_LOG_TRACE 451 - struct ktrace *l_trace; 452 struct ktrace *l_grant_trace; 453 #endif 454
··· 448 int l_grant_write_bytes; 449 450 #ifdef XFS_LOG_TRACE 451 struct ktrace *l_grant_trace; 452 #endif 453
+26
fs/xfs/xfs_vnodeops.c
··· 1838 #endif 1839 } 1840 1841 void 1842 xfs_lock_two_inodes( 1843 xfs_inode_t *ip0, ··· 1854 int attempts = 0; 1855 xfs_log_item_t *lp; 1856 1857 ASSERT(ip0->i_ino != ip1->i_ino); 1858 1859 if (ip0->i_ino > ip1->i_ino) { ··· 3160 /* 3161 * Zero file bytes between startoff and endoff inclusive. 3162 * The iolock is held exclusive and no blocks are buffered. 3163 */ 3164 STATIC int 3165 xfs_zero_remaining_bytes( ··· 3182 xfs_mount_t *mp = ip->i_mount; 3183 int nimap; 3184 int error = 0; 3185 3186 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3187 XFS_IS_REALTIME_INODE(ip) ?
··· 1838 #endif 1839 } 1840 1841 + /* 1842 + * xfs_lock_two_inodes() can only be used to lock one type of lock 1843 + * at a time - the iolock or the ilock, but not both at once. If 1844 + * we lock both at once, lockdep will report false positives saying 1845 + * we have violated locking orders. 1846 + */ 1847 void 1848 xfs_lock_two_inodes( 1849 xfs_inode_t *ip0, ··· 1848 int attempts = 0; 1849 xfs_log_item_t *lp; 1850 1851 + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 1852 + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); 1853 ASSERT(ip0->i_ino != ip1->i_ino); 1854 1855 if (ip0->i_ino > ip1->i_ino) { ··· 3152 /* 3153 * Zero file bytes between startoff and endoff inclusive. 3154 * The iolock is held exclusive and no blocks are buffered. 3155 + * 3156 + * This function is used by xfs_free_file_space() to zero 3157 + * partial blocks when the range to free is not block aligned. 3158 + * When unreserving space with boundaries that are not block 3159 + * aligned we round up the start and round down the end 3160 + * boundaries and then use this function to zero the parts of 3161 + * the blocks that got dropped during the rounding. 3162 */ 3163 STATIC int 3164 xfs_zero_remaining_bytes( ··· 3167 xfs_mount_t *mp = ip->i_mount; 3168 int nimap; 3169 int error = 0; 3170 + 3171 + /* 3172 + * Avoid doing I/O beyond eof - it's not necessary 3173 + * since nothing can read beyond eof. The space will 3174 + * be zeroed when the file is extended anyway. 3175 + */ 3176 + if (startoff >= ip->i_size) 3177 + return 0; 3178 + 3179 + if (endoff > ip->i_size) 3180 + endoff = ip->i_size; 3181 3182 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3183 XFS_IS_REALTIME_INODE(ip) ?