Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
xfs: stop using the page cache to back the buffer cache
xfs: register the inode cache shrinker before quotachecks
xfs: xfs_trans_read_buf() should return an error on failure
xfs: introduce inode cluster buffer trylocks for xfs_iflush
vmap: flush vmap aliases when mapping fails
xfs: preallocation transactions do not need to be synchronous

Fix up trivial conflicts in fs/xfs/linux-2.6/xfs_buf.c due to plug removal.

+160 -315
+86 -258
fs/xfs/linux-2.6/xfs_buf.c
···
}

/*
- * Page Region interfaces.
- *
- * For pages in filesystems where the blocksize is smaller than the
- * pagesize, we use the page->private field (long) to hold a bitmap
- * of uptodate regions within the page.
- *
- * Each such region is "bytes per page / bits per long" bytes long.
- *
- * NBPPR == number-of-bytes-per-page-region
- * BTOPR == bytes-to-page-region (rounded up)
- * BTOPRT == bytes-to-page-region-truncated (rounded down)
- */
- #if (BITS_PER_LONG == 32)
- #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
- #elif (BITS_PER_LONG == 64)
- #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
- #else
- #error BITS_PER_LONG must be 32 or 64
- #endif
- #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
- #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
- #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
-
- STATIC unsigned long
- page_region_mask(
- size_t offset,
- size_t length)
- {
- unsigned long mask;
- int first, final;
-
- first = BTOPR(offset);
- final = BTOPRT(offset + length - 1);
- first = min(first, final);
-
- mask = ~0UL;
- mask <<= BITS_PER_LONG - (final - first);
- mask >>= BITS_PER_LONG - (final);
-
- ASSERT(offset + length <= PAGE_CACHE_SIZE);
- ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
-
- return mask;
- }
-
- STATIC void
- set_page_region(
- struct page *page,
- size_t offset,
- size_t length)
- {
- set_page_private(page,
- page_private(page) | page_region_mask(offset, length));
- if (page_private(page) == ~0UL)
- SetPageUptodate(page);
- }
-
- STATIC int
- test_page_region(
- struct page *page,
- size_t offset,
- size_t length)
- {
- unsigned long mask = page_region_mask(offset, length);
-
- return (mask && (page_private(page) & mask) == mask);
- }
-
- /*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
···

ASSERT(list_empty(&bp->b_lru));

- if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
+ if (bp->b_flags & _XBF_PAGES) {
uint i;

if (xfs_buf_is_vmapped(bp))
···
for (i = 0; i < bp->b_page_count; i++) {
struct page *page = bp->b_pages[i];

- if (bp->b_flags & _XBF_PAGE_CACHE)
- ASSERT(!PagePrivate(page));
- page_cache_release(page);
+ __free_page(page);
}
- }
+ } else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
xfs_buf_deallocate(bp);
}

/*
- * Finds all pages for buffer in question and builds it's page list.
+ * Allocates all the pages for buffer in question and builds it's page list.
*/
STATIC int
- _xfs_buf_lookup_pages(
+ xfs_buf_allocate_memory(
xfs_buf_t *bp,
uint flags)
{
- struct address_space *mapping = bp->b_target->bt_mapping;
- size_t blocksize = bp->b_target->bt_bsize;
size_t size = bp->b_count_desired;
size_t nbytes, offset;
gfp_t gfp_mask = xb_to_gfp(flags);
···
xfs_off_t end;
int error;

+ /*
+ * for buffers that are contained within a single page, just allocate
+ * the memory from the heap - there's no need for the complexity of
+ * page arrays to keep allocation down to order 0.
+ */
+ if (bp->b_buffer_length < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+ if (!bp->b_addr) {
+ /* low memory - use alloc_page loop instead */
+ goto use_alloc_page;
+ }
+
+ if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
+ PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ goto use_alloc_page;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+ return 0;
+ }
+
+ use_alloc_page:
end = bp->b_file_offset + bp->b_buffer_length;
page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
return error;
- bp->b_flags |= _XBF_PAGE_CACHE;

offset = bp->b_offset;
- first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
+ first = bp->b_file_offset >> PAGE_SHIFT;
+ bp->b_flags |= _XBF_PAGES;

for (i = 0; i < bp->b_page_count; i++) {
struct page *page;
uint retries = 0;
-
- retry:
- page = find_or_create_page(mapping, first + i, gfp_mask);
+ retry:
+ page = alloc_page(gfp_mask);
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- return -ENOMEM;
+ error = ENOMEM;
+ goto out_free_pages;
}

/*
···

XFS_STATS_INC(xb_page_found);

- nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
+ nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
-
- ASSERT(!PagePrivate(page));
- if (!PageUptodate(page)) {
- page_count--;
- if (blocksize >= PAGE_CACHE_SIZE) {
- if (flags & XBF_READ)
- bp->b_flags |= _XBF_PAGE_LOCKED;
- } else if (!PagePrivate(page)) {
- if (test_page_region(page, offset, nbytes))
- page_count++;
- }
- }
-
bp->b_pages[i] = page;
offset = 0;
}
+ return 0;

- if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- }
-
- if (page_count == bp->b_page_count)
- bp->b_flags |= XBF_DONE;
-
+ out_free_pages:
+ for (i = 0; i < bp->b_page_count; i++)
+ __free_page(bp->b_pages[i]);
return error;
}

···
xfs_buf_t *bp,
uint flags)
{
- /* A single page buffer is always mappable */
+ ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
+ /* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
bp->b_flags |= XBF_MAPPED;
} else if (flags & XBF_MAPPED) {
- bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1, PAGE_KERNEL);
- if (unlikely(bp->b_addr == NULL))
+ int retried = 0;
+
+ do {
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1, PAGE_KERNEL);
+ if (bp->b_addr)
+ break;
+ vm_unmap_aliases();
+ } while (retried++ <= 1);
+
+ if (!bp->b_addr)
return -ENOMEM;
bp->b_addr += bp->b_offset;
bp->b_flags |= XBF_MAPPED;
···
}
}

+ /*
+ * if the buffer is stale, clear all the external state associated with
+ * it. We need to keep flags such as how we allocated the buffer memory
+ * intact here.
+ */
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= XBF_MAPPED;
+ bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
}

trace_xfs_buf_find(bp, flags, _RET_IP_);
···
xfs_buf_flags_t flags)
{
xfs_buf_t *bp, *new_bp;
- int error = 0, i;
+ int error = 0;

new_bp = xfs_buf_allocate(flags);
if (unlikely(!new_bp))
···

bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
if (bp == new_bp) {
- error = _xfs_buf_lookup_pages(bp, flags);
+ error = xfs_buf_allocate_memory(bp, flags);
if (error)
goto no_buffer;
} else {
···
if (unlikely(bp == NULL))
return NULL;
}
-
- for (i = 0; i < bp->b_page_count; i++)
- mark_page_accessed(bp->b_pages[i]);

if (!(bp->b_flags & XBF_MAPPED)) {
error = _xfs_buf_map_pages(bp, flags);
···
{
struct backing_dev_info *bdi;

- bdi = target->bt_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
+ if (bdi_read_congested(target->bt_bdi))
return;

xfs_buf_read(target, ioff, isize,
···
size_t buflen;
int page_count;

- pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
+ pageaddr = (unsigned long)mem & PAGE_MASK;
offset = (unsigned long)mem - pageaddr;
- buflen = PAGE_CACHE_ALIGN(len + offset);
- page_count = buflen >> PAGE_CACHE_SHIFT;
+ buflen = PAGE_ALIGN(len + offset);
+ page_count = buflen >> PAGE_SHIFT;

/* Free any previous set of page pointers */
if (bp->b_pages)
···

for (i = 0; i < bp->b_page_count; i++) {
bp->b_pages[i] = mem_to_page((void *)pageaddr);
- pageaddr += PAGE_CACHE_SIZE;
+ pageaddr += PAGE_SIZE;
}

bp->b_count_desired = len;
bp->b_buffer_length = buflen;
bp->b_flags |= XBF_MAPPED;
- bp->b_flags &= ~_XBF_PAGE_LOCKED;

return 0;
}
···


/*
- * Mutual exclusion on buffers. Locking model:
- *
- * Buffers associated with inodes for which buffer locking
- * is not enabled are not protected by semaphores, and are
- * assumed to be exclusively owned by the caller. There is a
- * spinlock in the buffer, used by the caller when concurrent
- * access is possible.
- */
-
- /*
- * Locks a buffer object, if it is not already locked. Note that this in
- * no way locks the underlying pages, so it is only useful for
- * synchronizing concurrent use of buffer objects, not for synchronizing
- * independent access to the underlying pages.
+ * Lock a buffer object, if it is not already locked.
*
* If we come across a stale, pinned, locked buffer, we know that we are
* being asked to lock a buffer that has been reallocated. Because it is
···
}

/*
- * Locks a buffer object.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
+ * Lock a buffer object.
*
* If we come across a stale, pinned, locked buffer, we know that we
* are being asked to lock a buffer that has been reallocated. Because
···
xfs_buf_t *bp,
int schedule)
{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
- bp->b_flags &= ~_XBF_PAGE_LOCKED;
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
xfs_buf_ioend(bp, schedule);
- }
}

STATIC void
···
int error)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
- unsigned int blocksize = bp->b_target->bt_bsize;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

xfs_buf_ioerror(bp, -error);

if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
-
- do {
- struct page *page = bvec->bv_page;
-
- ASSERT(!PagePrivate(page));
- if (unlikely(bp->b_error)) {
- if (bp->b_flags & XBF_READ)
- ClearPageUptodate(page);
- } else if (blocksize >= PAGE_CACHE_SIZE) {
- SetPageUptodate(page);
- } else if (!PagePrivate(page) &&
- (bp->b_flags & _XBF_PAGE_CACHE)) {
- set_page_region(page, bvec->bv_offset, bvec->bv_len);
- }
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (bp->b_flags & _XBF_PAGE_LOCKED)
- unlock_page(page);
- } while (bvec >= bio->bi_io_vec);

_xfs_buf_ioend(bp, 1);
bio_put(bio);
···
int offset = bp->b_offset;
int size = bp->b_count_desired;
sector_t sector = bp->b_bn;
- unsigned int blocksize = bp->b_target->bt_bsize;

total_nr_pages = bp->b_page_count;
map_i = 0;
···
(bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
}

- /* Special code path for reading a sub page size buffer in --
- * we populate up the whole page, and hence the other metadata
- * in the same page. This optimization is only valid when the
- * filesystem block size is not smaller than the page size.
- */
- if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
- ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
- (XBF_READ|_XBF_PAGE_LOCKED)) &&
- (blocksize >= PAGE_CACHE_SIZE)) {
- bio = bio_alloc(GFP_NOIO, 1);
-
- bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector - (offset >> BBSHIFT);
- bio->bi_end_io = xfs_buf_bio_end_io;
- bio->bi_private = bp;
-
- bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
- size = 0;
-
- atomic_inc(&bp->b_io_remaining);
-
- goto submit_io;
- }

next_chunk:
atomic_inc(&bp->b_io_remaining);
···
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;

+
for (; size && nr_pages; nr_pages--, map_i++) {
- int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
+ int rbytes, nbytes = PAGE_SIZE - offset;

if (nbytes > size)
nbytes = size;
···
total_nr_pages--;
}

- submit_io:
if (likely(bio->bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
···
if (size)
goto next_chunk;
} else {
- /*
- * if we get here, no pages were added to the bio. However,
- * we can't just error out here - if the pages are locked then
- * we have to unlock them otherwise we can hang on a later
- * access to the page.
- */
xfs_buf_ioerror(bp, EIO);
- if (bp->b_flags & _XBF_PAGE_LOCKED) {
- int i;
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- }
bio_put(bio);
}
}
···
return XFS_BUF_PTR(bp) + offset;

offset += bp->b_offset;
- page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
+ page = bp->b_pages[offset >> PAGE_SHIFT];
+ return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
···
page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
cpoff = xfs_buf_poff(boff + bp->b_offset);
csize = min_t(size_t,
- PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
+ PAGE_SIZE-cpoff, bp->b_count_desired-boff);

- ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
+ ASSERT(((csize + cpoff) <= PAGE_SIZE));

switch (mode) {
case XBRW_ZERO:
···
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- iput(btp->bt_mapping->host);

kthread_stop(btp->bt_task);
kmem_free(btp);
···
return EINVAL;
}

- if (verbose &&
- (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
- printk(KERN_WARNING
- "XFS: %u byte sectors in use on device %s. "
- "This is suboptimal; %u or greater is ideal.\n",
- sectorsize, XFS_BUFTARG_NAME(btp),
- (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
- }
-
return 0;
}
···
struct block_device *bdev)
{
return xfs_setsize_buftarg_flags(btp,
- PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
+ PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
···
unsigned int sectorsize)
{
return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
- }
-
- STATIC int
- xfs_mapping_buftarg(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
- {
- struct backing_dev_info *bdi;
- struct inode *inode;
- struct address_space *mapping;
- static const struct address_space_operations mapping_aops = {
- .migratepage = fail_migrate_page,
- };
-
- inode = new_inode(bdev->bd_inode->i_sb);
- if (!inode) {
- printk(KERN_WARNING
- "XFS: Cannot allocate mapping inode for device %s\n",
- XFS_BUFTARG_NAME(btp));
- return ENOMEM;
- }
- inode->i_ino = get_next_ino();
- inode->i_mode = S_IFBLK;
- inode->i_bdev = bdev;
- inode->i_rdev = bdev->bd_dev;
- bdi = blk_get_backing_dev_info(bdev);
- if (!bdi)
- bdi = &default_backing_dev_info;
- mapping = &inode->i_data;
- mapping->a_ops = &mapping_aops;
- mapping->backing_dev_info = bdi;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- btp->bt_mapping = mapping;
- return 0;
}

STATIC int
···
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
+ btp->bt_bdi = blk_get_backing_dev_info(bdev);
+ if (!btp->bt_bdi)
+ goto error;
+
INIT_LIST_HEAD(&btp->bt_lru);
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
- goto error;
- if (xfs_mapping_buftarg(btp, bdev))
goto error;
if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
+4 -36
fs/xfs/linux-2.6/xfs_buf.h
···
#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */

/* flags used only internally */
- #define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
+ #define _XBF_KMEM (1 << 20)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
-
- /*
- * Special flag for supporting metadata blocks smaller than a FSB.
- *
- * In this case we can have multiple xfs_buf_t on a single page and
- * need to lock out concurrent xfs_buf_t readers as they only
- * serialise access to the buffer.
- *
- * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
- * between reads of the page. Hence we can have one thread read the
- * page and modify it, but then race with another thread that thinks
- * the page is not up-to-date and hence reads it again.
- *
- * The result is that the first modifcation to the page is lost.
- * This sort of AGF/AGI reading race can happen when unlinking inodes
- * that require truncation and results in the AGI unlinked list
- * modifications being lost.
- */
- #define _XBF_PAGE_LOCKED (1 << 22)

typedef unsigned int xfs_buf_flags_t;

···
{ XBF_LOCK, "LOCK" }, /* should never be set */\
{ XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
{ XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
- { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
-
+ { _XBF_KMEM, "KMEM" }, \
+ { _XBF_DELWRI_Q, "DELWRI_Q" }

typedef enum {
XBT_FORCE_SLEEP = 0,
···
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
- struct address_space *bt_mapping;
+ struct backing_dev_info *bt_bdi;
struct xfs_mount *bt_mount;
unsigned int bt_bsize;
unsigned int bt_sshift;
···
spinlock_t bt_lru_lock;
unsigned int bt_lru_nr;
} xfs_buftarg_t;
-
- /*
- * xfs_buf_t: Buffer structure for pagecache-based buffers
- *
- * This buffer structure is used by the pagecache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.
- *
- * The buffer structure is used on a temporary basis only, and discarded when
- * released. The real data storage is recorded in the pagecache. Buffers are
- * hashed to the block device on which the file system resides.
- */

struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
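One detail of the new _XBF_KMEM path above is worth spelling out: a buffer smaller than a page is now allocated from the heap with kmem_alloc(), but only kept if the allocation does not straddle a page boundary; otherwise xfs_buf_allocate_memory() frees it and falls back to the alloc_page() loop. As a rough userspace illustration of that boundary test (a sketch, not kernel code: the page size comes from sysconf() and the helper name is ours, not XFS's):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /*
     * Mirror of the PAGE_MASK comparison in xfs_buf_allocate_memory():
     * the buffer is only usable as a single-page buffer if its first
     * and last bytes land in the same page.
     */
    static int spans_two_pages(const void *addr, size_t len)
    {
        unsigned long mask = ~((unsigned long)sysconf(_SC_PAGESIZE) - 1);
        unsigned long first = (unsigned long)addr & mask;
        unsigned long last = ((unsigned long)addr + len - 1) & mask;

        return first != last;
    }

    int main(void)
    {
        size_t len = 512;
        void *buf = malloc(len);

        if (!buf)
            return 1;
        printf("%zu-byte buffer at %p %s a page boundary\n", len, buf,
               spans_two_pages(buf, len) ? "straddles" : "stays within");
        free(buf);
        return 0;
    }

The design point is that sub-page metadata buffers avoid both page-array bookkeeping and order-0 page allocation entirely, at the cost of this one extra check.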
+5 -1
fs/xfs/linux-2.6/xfs_file.c
···
xfs_flock64_t bf;
xfs_inode_t *ip = XFS_I(inode);
int cmd = XFS_IOC_RESVSP;
+ int attr_flags = XFS_ATTR_NOLOCK;

if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
···
goto out_unlock;
}

- error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+ if (file->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
if (error)
goto out_unlock;

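For context on the preallocation change: after this series a fallocate() caller only pays for a synchronous transaction when the descriptor was opened with O_DSYNC, which the kernel translates into XFS_ATTR_SYNC as shown above. A minimal userspace sketch of that usage (file name and size below are arbitrary, not taken from the patch):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* O_DSYNC at open time is what ends up setting XFS_ATTR_SYNC. */
        int fd = open("/tmp/prealloc-test", O_CREAT | O_WRONLY | O_DSYNC, 0644);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (fallocate(fd, 0, 0, 1024 * 1024) < 0)   /* preallocate 1 MiB */
            perror("fallocate");
        close(fd);
        return 0;
    }

Without O_DSYNC the same call now completes with an asynchronous transaction, which is the speedup the "preallocation transactions do not need to be synchronous" commit is after.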
+4
fs/xfs/linux-2.6/xfs_ioctl.c
···

if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
attr_flags |= XFS_ATTR_NONBLOCK;
+
+ if (filp->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
if (ioflags & IO_INVIS)
attr_flags |= XFS_ATTR_DMI;

+25 -11
fs/xfs/linux-2.6/xfs_super.c
···
error = 0;
goto out_unlock;
}
- error = xfs_iflush(ip, 0);
+ error = xfs_iflush(ip, SYNC_TRYLOCK);
}

out_unlock:
···
if (error)
goto out_free_sb;

- error = xfs_mountfs(mp);
- if (error)
- goto out_filestream_unmount;
-
+ /*
+ * we must configure the block size in the superblock before we run the
+ * full mount process as the mount process can lookup and cache inodes.
+ * For the same reason we must also initialise the syncd and register
+ * the inode cache shrinker so that inodes can be reclaimed during
+ * operations like a quotacheck that iterate all inodes in the
+ * filesystem.
+ */
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+
+ error = xfs_syncd_init(mp);
+ if (error)
+ goto out_filestream_unmount;
+
+ xfs_inode_shrinker_register(mp);
+
+ error = xfs_mountfs(mp);
+ if (error)
+ goto out_syncd_stop;

root = igrab(VFS_I(mp->m_rootip));
if (!root) {
···
goto fail_vnrele;
}

- error = xfs_syncd_init(mp);
- if (error)
- goto fail_vnrele;
-
- xfs_inode_shrinker_register(mp);
-
return 0;

+ out_syncd_stop:
+ xfs_inode_shrinker_unregister(mp);
+ xfs_syncd_stop(mp);
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
···
}

fail_unmount:
+ xfs_inode_shrinker_unregister(mp);
+ xfs_syncd_stop(mp);
+
/*
* Blow away any referenced inode in the filestreams cache.
* This can and will cause log traffic as inodes go inactive
+27 -3
fs/xfs/linux-2.6/xfs_sync.c
···
struct xfs_perag *pag,
int sync_mode)
{
- int error = 0;
+ int error;

+ restart:
+ error = 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
···
if (xfs_inode_clean(ip))
goto reclaim;

- /* Now we have an inode that needs flushing */
- error = xfs_iflush(ip, sync_mode);
+ /*
+ * Now we have an inode that needs flushing.
+ *
+ * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+ * reclaim as we can deadlock with inode cluster removal.
+ * xfs_ifree_cluster() can lock the inode buffer before it locks the
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_itobp() to get the cluster buffer will result
+ * in an ABBA deadlock with xfs_ifree_cluster().
+ *
+ * As xfs_ifree_cluser() must gather all inodes that are active in the
+ * cache to mark them stale, if we hit this case we don't actually want
+ * to do IO here - we want the inode marked stale so we can simply
+ * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+ * just unlock the inode, back off and try again. Hopefully the next
+ * pass through will see the stale flag set on the inode.
+ */
+ error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
if (sync_mode & SYNC_WAIT) {
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
+ }
xfs_iflock(ip);
goto reclaim;
}
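The comment added above describes the ABBA avoidance: instead of blocking on the inode cluster buffer while holding the inode lock, the flush is attempted with a trylock, and on EAGAIN under SYNC_WAIT everything is dropped, the task backs off briefly and restarts. A generic pthread sketch of that trylock-and-back-off pattern (illustrative only; the names and the usleep() backoff are ours, the kernel uses delay(2) and EAGAIN):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t cluster_buf_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Take the first lock, then only *try* the second. On conflict, drop
     * everything, back off and restart rather than blocking in the wrong
     * lock order against a thread that takes the locks the other way.
     */
    static void flush_inode(void)
    {
        for (;;) {
            pthread_mutex_lock(&inode_lock);
            if (pthread_mutex_trylock(&cluster_buf_lock) == 0) {
                /* ... flush the inode into the cluster buffer ... */
                pthread_mutex_unlock(&cluster_buf_lock);
                pthread_mutex_unlock(&inode_lock);
                return;
            }
            pthread_mutex_unlock(&inode_lock);  /* avoid the ABBA deadlock */
            usleep(2000);                       /* back off, then retry */
        }
    }

    int main(void)
    {
        flush_inode();
        printf("flushed without deadlocking\n");
        return 0;
    }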
+1 -1
fs/xfs/xfs_inode.c
···
* Get the buffer containing the on-disk inode.
*/
error = xfs_itobp(mp, NULL, ip, &dip, &bp,
- (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
+ (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
if (error || !bp) {
xfs_ifunlock(ip);
return error;
+3 -3
fs/xfs/xfs_inode_item.c
···
* Push the inode to it's backing buffer. This will not remove the
* inode from the AIL - a further push will be required to trigger a
* buffer push. However, this allows all the dirty inodes to be pushed
- * to the buffer before it is pushed to disk. THe buffer IO completion
- * will pull th einode from the AIL, mark it clean and unlock the flush
+ * to the buffer before it is pushed to disk. The buffer IO completion
+ * will pull the inode from the AIL, mark it clean and unlock the flush
* lock.
*/
- (void) xfs_iflush(ip, 0);
+ (void) xfs_iflush(ip, SYNC_TRYLOCK);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
}

+2 -1
fs/xfs/xfs_trans_buf.c
···
bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
if (bp == NULL) {
*bpp = NULL;
- return 0;
+ return (flags & XBF_TRYLOCK) ?
+ 0 : XFS_ERROR(ENOMEM);
}
if (XFS_BUF_GETERROR(bp) != 0) {
XFS_BUF_SUPER_STALE(bp);
+2 -1
fs/xfs/xfs_vnodeops.c
···
ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
+ if (attr_flags & XFS_ATTR_SYNC)
+ xfs_trans_set_sync(tp);

error = xfs_trans_commit(tp, 0);

+1
fs/xfs/xfs_vnodeops.h
···
#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
+ #define XFS_ATTR_SYNC 0x10 /* synchronous operation required */

int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_release(struct xfs_inode *ip);