Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext3: avoid false EIO errors

Sometimes block_write_begin() can map buffers in a page but later we
fail to copy data into those buffers (because the source page has been
paged out in the meantime). We then end up with !uptodate mapped
buffers. To add a bit more to the confusion, block_write_end() does
not commit any data (and thus does not mark any buffers as uptodate) if
we didn't succeed with copying all the data.

Commit f4fc66a894546bdc88a775d0e83ad20a65210bcb (ext3: convert to new
aops) missed these cases and thus we were inserting non-uptodate
buffers into the transaction's list, which confuses the JBD code: it reports
IO errors, aborts the transaction and generally makes users afraid for
their data ;-P.

This patch fixes the problem by reorganizing ext3_..._write_end() code
to first call block_write_end() to mark buffers with valid data
uptodate and after that we file only uptodate buffers to transaction's
lists.

We also fix a problem where we could leave blocks allocated beyond i_size
(i_disksize in fact) because of a failed write. We now add the inode to the
orphan list when a write fails (to be safe in case we crash) and then truncate
blocks beyond i_size in a separate transaction.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Jan Kara and committed by
Linus Torvalds
695f6ae0 de18f3b2

+74 -65
+74 -65
fs/ext3/inode.c
··· 1149 1149 struct page **pagep, void **fsdata) 1150 1150 { 1151 1151 struct inode *inode = mapping->host; 1152 - int ret, needed_blocks = ext3_writepage_trans_blocks(inode); 1152 + int ret; 1153 1153 handle_t *handle; 1154 1154 int retries = 0; 1155 1155 struct page *page; 1156 1156 pgoff_t index; 1157 1157 unsigned from, to; 1158 + /* Reserve one block more for addition to orphan list in case 1159 + * we allocate blocks but write fails for some reason */ 1160 + int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; 1158 1161 1159 1162 index = pos >> PAGE_CACHE_SHIFT; 1160 1163 from = pos & (PAGE_CACHE_SIZE - 1); ··· 1187 1184 } 1188 1185 write_begin_failed: 1189 1186 if (ret) { 1190 - ext3_journal_stop(handle); 1191 - unlock_page(page); 1192 - page_cache_release(page); 1193 1187 /* 1194 1188 * block_write_begin may have instantiated a few blocks 1195 1189 * outside i_size. Trim these off again. Don't need 1196 1190 * i_size_read because we hold i_mutex. 1191 + * 1192 + * Add inode to orphan list in case we crash before truncate 1193 + * finishes. 1197 1194 */ 1195 + if (pos + len > inode->i_size) 1196 + ext3_orphan_add(handle, inode); 1197 + ext3_journal_stop(handle); 1198 + unlock_page(page); 1199 + page_cache_release(page); 1198 1200 if (pos + len > inode->i_size) 1199 1201 vmtruncate(inode, inode->i_size); 1200 1202 } ··· 1219 1211 return err; 1220 1212 } 1221 1213 1214 + /* For ordered writepage and write_end functions */ 1215 + static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1216 + { 1217 + /* 1218 + * Write could have mapped the buffer but it didn't copy the data in 1219 + * yet. So avoid filing such buffer into a transaction. 
1220 + */ 1221 + if (buffer_mapped(bh) && buffer_uptodate(bh)) 1222 + return ext3_journal_dirty_data(handle, bh); 1223 + return 0; 1224 + } 1225 + 1222 1226 /* For write_end() in data=journal mode */ 1223 1227 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1224 1228 { ··· 1241 1221 } 1242 1222 1243 1223 /* 1244 - * Generic write_end handler for ordered and writeback ext3 journal modes. 1245 - * We can't use generic_write_end, because that unlocks the page and we need to 1246 - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run 1247 - * after block_write_end. 1224 + * This is nasty and subtle: ext3_write_begin() could have allocated blocks 1225 + * for the whole page but later we failed to copy the data in. Update inode 1226 + * size according to what we managed to copy. The rest is going to be 1227 + * truncated in write_end function. 1248 1228 */ 1249 - static int ext3_generic_write_end(struct file *file, 1250 - struct address_space *mapping, 1251 - loff_t pos, unsigned len, unsigned copied, 1252 - struct page *page, void *fsdata) 1229 + static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) 1253 1230 { 1254 - struct inode *inode = file->f_mapping->host; 1255 - 1256 - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1257 - 1258 - if (pos+copied > inode->i_size) { 1259 - i_size_write(inode, pos+copied); 1231 + /* What matters to us is i_disksize. 
We don't write i_size anywhere */ 1232 + if (pos + copied > inode->i_size) 1233 + i_size_write(inode, pos + copied); 1234 + if (pos + copied > EXT3_I(inode)->i_disksize) { 1235 + EXT3_I(inode)->i_disksize = pos + copied; 1260 1236 mark_inode_dirty(inode); 1261 1237 } 1262 - 1263 - return copied; 1264 1238 } 1265 1239 1266 1240 /* ··· 1274 1260 unsigned from, to; 1275 1261 int ret = 0, ret2; 1276 1262 1263 + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1264 + 1277 1265 from = pos & (PAGE_CACHE_SIZE - 1); 1278 - to = from + len; 1279 - 1266 + to = from + copied; 1280 1267 ret = walk_page_buffers(handle, page_buffers(page), 1281 - from, to, NULL, ext3_journal_dirty_data); 1268 + from, to, NULL, journal_dirty_data_fn); 1282 1269 1283 - if (ret == 0) { 1284 - /* 1285 - * generic_write_end() will run mark_inode_dirty() if i_size 1286 - * changes. So let's piggyback the i_disksize mark_inode_dirty 1287 - * into that. 1288 - */ 1289 - loff_t new_i_size; 1290 - 1291 - new_i_size = pos + copied; 1292 - if (new_i_size > EXT3_I(inode)->i_disksize) 1293 - EXT3_I(inode)->i_disksize = new_i_size; 1294 - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, 1295 - page, fsdata); 1296 - copied = ret2; 1297 - if (ret2 < 0) 1298 - ret = ret2; 1299 - } 1270 + if (ret == 0) 1271 + update_file_sizes(inode, pos, copied); 1272 + /* 1273 + * There may be allocated blocks outside of i_size because 1274 + * we failed to copy some data. Prepare for truncate. 1275 + */ 1276 + if (pos + len > inode->i_size) 1277 + ext3_orphan_add(handle, inode); 1300 1278 ret2 = ext3_journal_stop(handle); 1301 1279 if (!ret) 1302 1280 ret = ret2; 1303 1281 unlock_page(page); 1304 1282 page_cache_release(page); 1305 1283 1284 + if (pos + len > inode->i_size) 1285 + vmtruncate(inode, inode->i_size); 1306 1286 return ret ? 
ret : copied; 1307 1287 } 1308 1288 ··· 1307 1299 { 1308 1300 handle_t *handle = ext3_journal_current_handle(); 1309 1301 struct inode *inode = file->f_mapping->host; 1310 - int ret = 0, ret2; 1311 - loff_t new_i_size; 1302 + int ret; 1312 1303 1313 - new_i_size = pos + copied; 1314 - if (new_i_size > EXT3_I(inode)->i_disksize) 1315 - EXT3_I(inode)->i_disksize = new_i_size; 1316 - 1317 - ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, 1318 - page, fsdata); 1319 - copied = ret2; 1320 - if (ret2 < 0) 1321 - ret = ret2; 1322 - 1323 - ret2 = ext3_journal_stop(handle); 1324 - if (!ret) 1325 - ret = ret2; 1304 + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1305 + update_file_sizes(inode, pos, copied); 1306 + /* 1307 + * There may be allocated blocks outside of i_size because 1308 + * we failed to copy some data. Prepare for truncate. 1309 + */ 1310 + if (pos + len > inode->i_size) 1311 + ext3_orphan_add(handle, inode); 1312 + ret = ext3_journal_stop(handle); 1326 1313 unlock_page(page); 1327 1314 page_cache_release(page); 1328 1315 1316 + if (pos + len > inode->i_size) 1317 + vmtruncate(inode, inode->i_size); 1329 1318 return ret ? ret : copied; 1330 1319 } 1331 1320 ··· 1343 1338 if (copied < len) { 1344 1339 if (!PageUptodate(page)) 1345 1340 copied = 0; 1346 - page_zero_new_buffers(page, from+copied, to); 1341 + page_zero_new_buffers(page, from + copied, to); 1342 + to = from + copied; 1347 1343 } 1348 1344 1349 1345 ret = walk_page_buffers(handle, page_buffers(page), from, 1350 1346 to, &partial, write_end_fn); 1351 1347 if (!partial) 1352 1348 SetPageUptodate(page); 1353 - if (pos+copied > inode->i_size) 1354 - i_size_write(inode, pos+copied); 1349 + 1350 + if (pos + copied > inode->i_size) 1351 + i_size_write(inode, pos + copied); 1352 + /* 1353 + * There may be allocated blocks outside of i_size because 1354 + * we failed to copy some data. Prepare for truncate. 
1355 + */ 1356 + if (pos + len > inode->i_size) 1357 + ext3_orphan_add(handle, inode); 1355 1358 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1356 1359 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1357 1360 EXT3_I(inode)->i_disksize = inode->i_size; ··· 1374 1361 unlock_page(page); 1375 1362 page_cache_release(page); 1376 1363 1364 + if (pos + len > inode->i_size) 1365 + vmtruncate(inode, inode->i_size); 1377 1366 return ret ? ret : copied; 1378 1367 } 1379 1368 ··· 1443 1428 return 0; 1444 1429 } 1445 1430 1446 - static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1447 - { 1448 - if (buffer_mapped(bh)) 1449 - return ext3_journal_dirty_data(handle, bh); 1450 - return 0; 1451 - } 1452 - 1453 1431 static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) 1454 1432 { 1455 1433 return !buffer_mapped(bh); 1456 1434 } 1435 + 1457 1436 /* 1458 1437 * Note that we always start a transaction even if we're not journalling 1459 1438 * data. This is to preserve ordering: any hole instantiation within