Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
xfs: stop using the page cache to back the buffer cache
xfs: register the inode cache shrinker before quotachecks
xfs: xfs_trans_read_buf() should return an error on failure
xfs: introduce inode cluster buffer trylocks for xfs_iflush
vmap: flush vmap aliases when mapping fails
xfs: preallocation transactions do not need to be synchronous

Fix up trivial conflicts in fs/xfs/linux-2.6/xfs_buf.c due to plug removal.

11 files changed, 160 insertions(+), 315 deletions(-)
+86 -258
fs/xfs/linux-2.6/xfs_buf.c
··· 94 } 95 96 /* 97 - * Page Region interfaces. 98 - * 99 - * For pages in filesystems where the blocksize is smaller than the 100 - * pagesize, we use the page->private field (long) to hold a bitmap 101 - * of uptodate regions within the page. 102 - * 103 - * Each such region is "bytes per page / bits per long" bytes long. 104 - * 105 - * NBPPR == number-of-bytes-per-page-region 106 - * BTOPR == bytes-to-page-region (rounded up) 107 - * BTOPRT == bytes-to-page-region-truncated (rounded down) 108 - */ 109 - #if (BITS_PER_LONG == 32) 110 - #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 111 - #elif (BITS_PER_LONG == 64) 112 - #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ 113 - #else 114 - #error BITS_PER_LONG must be 32 or 64 115 - #endif 116 - #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) 117 - #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) 118 - #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) 119 - 120 - STATIC unsigned long 121 - page_region_mask( 122 - size_t offset, 123 - size_t length) 124 - { 125 - unsigned long mask; 126 - int first, final; 127 - 128 - first = BTOPR(offset); 129 - final = BTOPRT(offset + length - 1); 130 - first = min(first, final); 131 - 132 - mask = ~0UL; 133 - mask <<= BITS_PER_LONG - (final - first); 134 - mask >>= BITS_PER_LONG - (final); 135 - 136 - ASSERT(offset + length <= PAGE_CACHE_SIZE); 137 - ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); 138 - 139 - return mask; 140 - } 141 - 142 - STATIC void 143 - set_page_region( 144 - struct page *page, 145 - size_t offset, 146 - size_t length) 147 - { 148 - set_page_private(page, 149 - page_private(page) | page_region_mask(offset, length)); 150 - if (page_private(page) == ~0UL) 151 - SetPageUptodate(page); 152 - } 153 - 154 - STATIC int 155 - test_page_region( 156 - struct page *page, 157 - size_t offset, 158 - size_t length) 159 - { 160 - unsigned long mask = page_region_mask(offset, length); 161 - 162 - return (mask && (page_private(page) & mask) == mask); 163 - } 164 - 165 - /* 166 * xfs_buf_lru_add - add a buffer to the LRU. 167 * 168 * The LRU takes a new reference to the buffer so that it will only be freed ··· 263 264 ASSERT(list_empty(&bp->b_lru)); 265 266 - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 267 uint i; 268 269 if (xfs_buf_is_vmapped(bp)) ··· 273 for (i = 0; i < bp->b_page_count; i++) { 274 struct page *page = bp->b_pages[i]; 275 276 - if (bp->b_flags & _XBF_PAGE_CACHE) 277 - ASSERT(!PagePrivate(page)); 278 - page_cache_release(page); 279 } 280 - } 281 _xfs_buf_free_pages(bp); 282 xfs_buf_deallocate(bp); 283 } 284 285 /* 286 - * Finds all pages for buffer in question and builds it's page list. 
287 */ 288 STATIC int 289 - _xfs_buf_lookup_pages( 290 xfs_buf_t *bp, 291 uint flags) 292 { 293 - struct address_space *mapping = bp->b_target->bt_mapping; 294 - size_t blocksize = bp->b_target->bt_bsize; 295 size_t size = bp->b_count_desired; 296 size_t nbytes, offset; 297 gfp_t gfp_mask = xb_to_gfp(flags); ··· 297 xfs_off_t end; 298 int error; 299 300 end = bp->b_file_offset + bp->b_buffer_length; 301 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 302 - 303 error = _xfs_buf_get_pages(bp, page_count, flags); 304 if (unlikely(error)) 305 return error; 306 - bp->b_flags |= _XBF_PAGE_CACHE; 307 308 offset = bp->b_offset; 309 - first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 310 311 for (i = 0; i < bp->b_page_count; i++) { 312 struct page *page; 313 uint retries = 0; 314 - 315 - retry: 316 - page = find_or_create_page(mapping, first + i, gfp_mask); 317 if (unlikely(page == NULL)) { 318 if (flags & XBF_READ_AHEAD) { 319 bp->b_page_count = i; 320 - for (i = 0; i < bp->b_page_count; i++) 321 - unlock_page(bp->b_pages[i]); 322 - return -ENOMEM; 323 } 324 325 /* ··· 366 367 XFS_STATS_INC(xb_page_found); 368 369 - nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 370 size -= nbytes; 371 - 372 - ASSERT(!PagePrivate(page)); 373 - if (!PageUptodate(page)) { 374 - page_count--; 375 - if (blocksize >= PAGE_CACHE_SIZE) { 376 - if (flags & XBF_READ) 377 - bp->b_flags |= _XBF_PAGE_LOCKED; 378 - } else if (!PagePrivate(page)) { 379 - if (test_page_region(page, offset, nbytes)) 380 - page_count++; 381 - } 382 - } 383 - 384 bp->b_pages[i] = page; 385 offset = 0; 386 } 387 388 - if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 389 - for (i = 0; i < bp->b_page_count; i++) 390 - unlock_page(bp->b_pages[i]); 391 - } 392 - 393 - if (page_count == bp->b_page_count) 394 - bp->b_flags |= XBF_DONE; 395 - 396 return error; 397 } 398 ··· 387 xfs_buf_t *bp, 388 uint flags) 389 { 390 - /* A single page buffer is always mappable */ 391 if (bp->b_page_count == 1) { 392 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 393 bp->b_flags |= XBF_MAPPED; 394 } else if (flags & XBF_MAPPED) { 395 - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 396 - -1, PAGE_KERNEL); 397 - if (unlikely(bp->b_addr == NULL)) 398 return -ENOMEM; 399 bp->b_addr += bp->b_offset; 400 bp->b_flags |= XBF_MAPPED; ··· 514 } 515 } 516 517 if (bp->b_flags & XBF_STALE) { 518 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 519 - bp->b_flags &= XBF_MAPPED; 520 } 521 522 trace_xfs_buf_find(bp, flags, _RET_IP_); ··· 542 xfs_buf_flags_t flags) 543 { 544 xfs_buf_t *bp, *new_bp; 545 - int error = 0, i; 546 547 new_bp = xfs_buf_allocate(flags); 548 if (unlikely(!new_bp)) ··· 550 551 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 552 if (bp == new_bp) { 553 - error = _xfs_buf_lookup_pages(bp, flags); 554 if (error) 555 goto no_buffer; 556 } else { ··· 558 if (unlikely(bp == NULL)) 559 return NULL; 560 } 561 - 562 - for (i = 0; i < bp->b_page_count; i++) 563 - mark_page_accessed(bp->b_pages[i]); 564 565 if (!(bp->b_flags & XBF_MAPPED)) { 566 error = _xfs_buf_map_pages(bp, flags); ··· 659 { 660 struct backing_dev_info *bdi; 661 662 - bdi = target->bt_mapping->backing_dev_info; 663 - if (bdi_read_congested(bdi)) 664 return; 665 666 xfs_buf_read(target, ioff, isize, ··· 737 size_t buflen; 738 int page_count; 739 740 - pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 741 offset = (unsigned long)mem - pageaddr; 742 - buflen = PAGE_CACHE_ALIGN(len + offset); 743 - page_count = buflen >> PAGE_CACHE_SHIFT; 744 745 /* Free any previous set 
of page pointers */ 746 if (bp->b_pages) ··· 757 758 for (i = 0; i < bp->b_page_count; i++) { 759 bp->b_pages[i] = mem_to_page((void *)pageaddr); 760 - pageaddr += PAGE_CACHE_SIZE; 761 } 762 763 bp->b_count_desired = len; 764 bp->b_buffer_length = buflen; 765 bp->b_flags |= XBF_MAPPED; 766 - bp->b_flags &= ~_XBF_PAGE_LOCKED; 767 768 return 0; 769 } ··· 869 870 871 /* 872 - * Mutual exclusion on buffers. Locking model: 873 - * 874 - * Buffers associated with inodes for which buffer locking 875 - * is not enabled are not protected by semaphores, and are 876 - * assumed to be exclusively owned by the caller. There is a 877 - * spinlock in the buffer, used by the caller when concurrent 878 - * access is possible. 879 - */ 880 - 881 - /* 882 - * Locks a buffer object, if it is not already locked. Note that this in 883 - * no way locks the underlying pages, so it is only useful for 884 - * synchronizing concurrent use of buffer objects, not for synchronizing 885 - * independent access to the underlying pages. 886 * 887 * If we come across a stale, pinned, locked buffer, we know that we are 888 * being asked to lock a buffer that has been reallocated. Because it is ··· 903 } 904 905 /* 906 - * Locks a buffer object. 907 - * Note that this in no way locks the underlying pages, so it is only 908 - * useful for synchronizing concurrent use of buffer objects, not for 909 - * synchronizing independent access to the underlying pages. 910 * 911 * If we come across a stale, pinned, locked buffer, we know that we 912 * are being asked to lock a buffer that has been reallocated. Because ··· 1176 xfs_buf_t *bp, 1177 int schedule) 1178 { 1179 - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1180 - bp->b_flags &= ~_XBF_PAGE_LOCKED; 1181 xfs_buf_ioend(bp, schedule); 1182 - } 1183 } 1184 1185 STATIC void ··· 1186 int error) 1187 { 1188 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1189 - unsigned int blocksize = bp->b_target->bt_bsize; 1190 - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1191 1192 xfs_buf_ioerror(bp, -error); 1193 1194 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1195 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1196 - 1197 - do { 1198 - struct page *page = bvec->bv_page; 1199 - 1200 - ASSERT(!PagePrivate(page)); 1201 - if (unlikely(bp->b_error)) { 1202 - if (bp->b_flags & XBF_READ) 1203 - ClearPageUptodate(page); 1204 - } else if (blocksize >= PAGE_CACHE_SIZE) { 1205 - SetPageUptodate(page); 1206 - } else if (!PagePrivate(page) && 1207 - (bp->b_flags & _XBF_PAGE_CACHE)) { 1208 - set_page_region(page, bvec->bv_offset, bvec->bv_len); 1209 - } 1210 - 1211 - if (--bvec >= bio->bi_io_vec) 1212 - prefetchw(&bvec->bv_page->flags); 1213 - 1214 - if (bp->b_flags & _XBF_PAGE_LOCKED) 1215 - unlock_page(page); 1216 - } while (bvec >= bio->bi_io_vec); 1217 1218 _xfs_buf_ioend(bp, 1); 1219 bio_put(bio); ··· 1205 int offset = bp->b_offset; 1206 int size = bp->b_count_desired; 1207 sector_t sector = bp->b_bn; 1208 - unsigned int blocksize = bp->b_target->bt_bsize; 1209 1210 total_nr_pages = bp->b_page_count; 1211 map_i = 0; ··· 1225 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1226 } 1227 1228 - /* Special code path for reading a sub page size buffer in -- 1229 - * we populate up the whole page, and hence the other metadata 1230 - * in the same page. This optimization is only valid when the 1231 - * filesystem block size is not smaller than the page size. 
1232 - */ 1233 - if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1234 - ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == 1235 - (XBF_READ|_XBF_PAGE_LOCKED)) && 1236 - (blocksize >= PAGE_CACHE_SIZE)) { 1237 - bio = bio_alloc(GFP_NOIO, 1); 1238 - 1239 - bio->bi_bdev = bp->b_target->bt_bdev; 1240 - bio->bi_sector = sector - (offset >> BBSHIFT); 1241 - bio->bi_end_io = xfs_buf_bio_end_io; 1242 - bio->bi_private = bp; 1243 - 1244 - bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); 1245 - size = 0; 1246 - 1247 - atomic_inc(&bp->b_io_remaining); 1248 - 1249 - goto submit_io; 1250 - } 1251 1252 next_chunk: 1253 atomic_inc(&bp->b_io_remaining); ··· 1238 bio->bi_end_io = xfs_buf_bio_end_io; 1239 bio->bi_private = bp; 1240 1241 for (; size && nr_pages; nr_pages--, map_i++) { 1242 - int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1243 1244 if (nbytes > size) 1245 nbytes = size; ··· 1255 total_nr_pages--; 1256 } 1257 1258 - submit_io: 1259 if (likely(bio->bi_size)) { 1260 if (xfs_buf_is_vmapped(bp)) { 1261 flush_kernel_vmap_range(bp->b_addr, ··· 1264 if (size) 1265 goto next_chunk; 1266 } else { 1267 - /* 1268 - * if we get here, no pages were added to the bio. However, 1269 - * we can't just error out here - if the pages are locked then 1270 - * we have to unlock them otherwise we can hang on a later 1271 - * access to the page. 1272 - */ 1273 xfs_buf_ioerror(bp, EIO); 1274 - if (bp->b_flags & _XBF_PAGE_LOCKED) { 1275 - int i; 1276 - for (i = 0; i < bp->b_page_count; i++) 1277 - unlock_page(bp->b_pages[i]); 1278 - } 1279 bio_put(bio); 1280 } 1281 } ··· 1328 return XFS_BUF_PTR(bp) + offset; 1329 1330 offset += bp->b_offset; 1331 - page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1332 - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1333 } 1334 1335 /* ··· 1351 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1352 cpoff = xfs_buf_poff(boff + bp->b_offset); 1353 csize = min_t(size_t, 1354 - PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1355 1356 - ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1357 1358 switch (mode) { 1359 case XBRW_ZERO: ··· 1466 xfs_flush_buftarg(btp, 1); 1467 if (mp->m_flags & XFS_MOUNT_BARRIER) 1468 xfs_blkdev_issue_flush(btp); 1469 - iput(btp->bt_mapping->host); 1470 1471 kthread_stop(btp->bt_task); 1472 kmem_free(btp); ··· 1489 return EINVAL; 1490 } 1491 1492 - if (verbose && 1493 - (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { 1494 - printk(KERN_WARNING 1495 - "XFS: %u byte sectors in use on device %s. 
" 1496 - "This is suboptimal; %u or greater is ideal.\n", 1497 - sectorsize, XFS_BUFTARG_NAME(btp), 1498 - (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); 1499 - } 1500 - 1501 return 0; 1502 } 1503 ··· 1503 struct block_device *bdev) 1504 { 1505 return xfs_setsize_buftarg_flags(btp, 1506 - PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1507 } 1508 1509 int ··· 1513 unsigned int sectorsize) 1514 { 1515 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1516 - } 1517 - 1518 - STATIC int 1519 - xfs_mapping_buftarg( 1520 - xfs_buftarg_t *btp, 1521 - struct block_device *bdev) 1522 - { 1523 - struct backing_dev_info *bdi; 1524 - struct inode *inode; 1525 - struct address_space *mapping; 1526 - static const struct address_space_operations mapping_aops = { 1527 - .migratepage = fail_migrate_page, 1528 - }; 1529 - 1530 - inode = new_inode(bdev->bd_inode->i_sb); 1531 - if (!inode) { 1532 - printk(KERN_WARNING 1533 - "XFS: Cannot allocate mapping inode for device %s\n", 1534 - XFS_BUFTARG_NAME(btp)); 1535 - return ENOMEM; 1536 - } 1537 - inode->i_ino = get_next_ino(); 1538 - inode->i_mode = S_IFBLK; 1539 - inode->i_bdev = bdev; 1540 - inode->i_rdev = bdev->bd_dev; 1541 - bdi = blk_get_backing_dev_info(bdev); 1542 - if (!bdi) 1543 - bdi = &default_backing_dev_info; 1544 - mapping = &inode->i_data; 1545 - mapping->a_ops = &mapping_aops; 1546 - mapping->backing_dev_info = bdi; 1547 - mapping_set_gfp_mask(mapping, GFP_NOFS); 1548 - btp->bt_mapping = mapping; 1549 - return 0; 1550 } 1551 1552 STATIC int ··· 1543 btp->bt_mount = mp; 1544 btp->bt_dev = bdev->bd_dev; 1545 btp->bt_bdev = bdev; 1546 INIT_LIST_HEAD(&btp->bt_lru); 1547 spin_lock_init(&btp->bt_lru_lock); 1548 if (xfs_setsize_buftarg_early(btp, bdev)) 1549 - goto error; 1550 - if (xfs_mapping_buftarg(btp, bdev)) 1551 goto error; 1552 if (xfs_alloc_delwrite_queue(btp, fsname)) 1553 goto error;
··· 94 } 95 96 /* 97 * xfs_buf_lru_add - add a buffer to the LRU. 98 * 99 * The LRU takes a new reference to the buffer so that it will only be freed ··· 332 333 ASSERT(list_empty(&bp->b_lru)); 334 335 + if (bp->b_flags & _XBF_PAGES) { 336 uint i; 337 338 if (xfs_buf_is_vmapped(bp)) ··· 342 for (i = 0; i < bp->b_page_count; i++) { 343 struct page *page = bp->b_pages[i]; 344 345 + __free_page(page); 346 } 347 + } else if (bp->b_flags & _XBF_KMEM) 348 + kmem_free(bp->b_addr); 349 _xfs_buf_free_pages(bp); 350 xfs_buf_deallocate(bp); 351 } 352 353 /* 354 + * Allocates all the pages for buffer in question and builds it's page list. 355 */ 356 STATIC int 357 + xfs_buf_allocate_memory( 358 xfs_buf_t *bp, 359 uint flags) 360 { 361 size_t size = bp->b_count_desired; 362 size_t nbytes, offset; 363 gfp_t gfp_mask = xb_to_gfp(flags); ··· 369 xfs_off_t end; 370 int error; 371 372 + /* 373 + * for buffers that are contained within a single page, just allocate 374 + * the memory from the heap - there's no need for the complexity of 375 + * page arrays to keep allocation down to order 0. 376 + */ 377 + if (bp->b_buffer_length < PAGE_SIZE) { 378 + bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); 379 + if (!bp->b_addr) { 380 + /* low memory - use alloc_page loop instead */ 381 + goto use_alloc_page; 382 + } 383 + 384 + if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & 385 + PAGE_MASK) != 386 + ((unsigned long)bp->b_addr & PAGE_MASK)) { 387 + /* b_addr spans two pages - use alloc_page instead */ 388 + kmem_free(bp->b_addr); 389 + bp->b_addr = NULL; 390 + goto use_alloc_page; 391 + } 392 + bp->b_offset = offset_in_page(bp->b_addr); 393 + bp->b_pages = bp->b_page_array; 394 + bp->b_pages[0] = virt_to_page(bp->b_addr); 395 + bp->b_page_count = 1; 396 + bp->b_flags |= XBF_MAPPED | _XBF_KMEM; 397 + return 0; 398 + } 399 + 400 + use_alloc_page: 401 end = bp->b_file_offset + bp->b_buffer_length; 402 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 403 error = _xfs_buf_get_pages(bp, page_count, flags); 404 if (unlikely(error)) 405 return error; 406 407 offset = bp->b_offset; 408 + first = bp->b_file_offset >> PAGE_SHIFT; 409 + bp->b_flags |= _XBF_PAGES; 410 411 for (i = 0; i < bp->b_page_count; i++) { 412 struct page *page; 413 uint retries = 0; 414 + retry: 415 + page = alloc_page(gfp_mask); 416 if (unlikely(page == NULL)) { 417 if (flags & XBF_READ_AHEAD) { 418 bp->b_page_count = i; 419 + error = ENOMEM; 420 + goto out_free_pages; 421 } 422 423 /* ··· 412 413 XFS_STATS_INC(xb_page_found); 414 415 + nbytes = min_t(size_t, size, PAGE_SIZE - offset); 416 size -= nbytes; 417 bp->b_pages[i] = page; 418 offset = 0; 419 } 420 + return 0; 421 422 + out_free_pages: 423 + for (i = 0; i < bp->b_page_count; i++) 424 + __free_page(bp->b_pages[i]); 425 return error; 426 } 427 ··· 450 xfs_buf_t *bp, 451 uint flags) 452 { 453 + ASSERT(bp->b_flags & _XBF_PAGES); 454 if (bp->b_page_count == 1) { 455 + /* A single page buffer is always mappable */ 456 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 457 bp->b_flags |= XBF_MAPPED; 458 } else if (flags & XBF_MAPPED) { 459 + int retried = 0; 460 + 461 + do { 462 + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 463 + -1, PAGE_KERNEL); 464 + if (bp->b_addr) 465 + break; 466 + vm_unmap_aliases(); 467 + } while (retried++ <= 1); 468 + 469 + if (!bp->b_addr) 470 return -ENOMEM; 471 bp->b_addr += bp->b_offset; 472 bp->b_flags |= XBF_MAPPED; ··· 568 } 569 } 570 571 + /* 572 + * if the buffer is stale, clear all the external state 
associated with 573 + * it. We need to keep flags such as how we allocated the buffer memory 574 + * intact here. 575 + */ 576 if (bp->b_flags & XBF_STALE) { 577 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 578 + bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; 579 } 580 581 trace_xfs_buf_find(bp, flags, _RET_IP_); ··· 591 xfs_buf_flags_t flags) 592 { 593 xfs_buf_t *bp, *new_bp; 594 + int error = 0; 595 596 new_bp = xfs_buf_allocate(flags); 597 if (unlikely(!new_bp)) ··· 599 600 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 601 if (bp == new_bp) { 602 + error = xfs_buf_allocate_memory(bp, flags); 603 if (error) 604 goto no_buffer; 605 } else { ··· 607 if (unlikely(bp == NULL)) 608 return NULL; 609 } 610 611 if (!(bp->b_flags & XBF_MAPPED)) { 612 error = _xfs_buf_map_pages(bp, flags); ··· 711 { 712 struct backing_dev_info *bdi; 713 714 + if (bdi_read_congested(target->bt_bdi)) 715 return; 716 717 xfs_buf_read(target, ioff, isize, ··· 790 size_t buflen; 791 int page_count; 792 793 + pageaddr = (unsigned long)mem & PAGE_MASK; 794 offset = (unsigned long)mem - pageaddr; 795 + buflen = PAGE_ALIGN(len + offset); 796 + page_count = buflen >> PAGE_SHIFT; 797 798 /* Free any previous set of page pointers */ 799 if (bp->b_pages) ··· 810 811 for (i = 0; i < bp->b_page_count; i++) { 812 bp->b_pages[i] = mem_to_page((void *)pageaddr); 813 + pageaddr += PAGE_SIZE; 814 } 815 816 bp->b_count_desired = len; 817 bp->b_buffer_length = buflen; 818 bp->b_flags |= XBF_MAPPED; 819 820 return 0; 821 } ··· 923 924 925 /* 926 + * Lock a buffer object, if it is not already locked. 927 * 928 * If we come across a stale, pinned, locked buffer, we know that we are 929 * being asked to lock a buffer that has been reallocated. Because it is ··· 970 } 971 972 /* 973 + * Lock a buffer object. 974 * 975 * If we come across a stale, pinned, locked buffer, we know that we 976 * are being asked to lock a buffer that has been reallocated. Because ··· 1246 xfs_buf_t *bp, 1247 int schedule) 1248 { 1249 + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1250 xfs_buf_ioend(bp, schedule); 1251 } 1252 1253 STATIC void ··· 1258 int error) 1259 { 1260 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1261 1262 xfs_buf_ioerror(bp, -error); 1263 1264 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1265 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1266 1267 _xfs_buf_ioend(bp, 1); 1268 bio_put(bio); ··· 1300 int offset = bp->b_offset; 1301 int size = bp->b_count_desired; 1302 sector_t sector = bp->b_bn; 1303 1304 total_nr_pages = bp->b_page_count; 1305 map_i = 0; ··· 1321 (bp->b_flags & XBF_READ_AHEAD) ? 
READA : READ; 1322 } 1323 1324 1325 next_chunk: 1326 atomic_inc(&bp->b_io_remaining); ··· 1357 bio->bi_end_io = xfs_buf_bio_end_io; 1358 bio->bi_private = bp; 1359 1360 + 1361 for (; size && nr_pages; nr_pages--, map_i++) { 1362 + int rbytes, nbytes = PAGE_SIZE - offset; 1363 1364 if (nbytes > size) 1365 nbytes = size; ··· 1373 total_nr_pages--; 1374 } 1375 1376 if (likely(bio->bi_size)) { 1377 if (xfs_buf_is_vmapped(bp)) { 1378 flush_kernel_vmap_range(bp->b_addr, ··· 1383 if (size) 1384 goto next_chunk; 1385 } else { 1386 xfs_buf_ioerror(bp, EIO); 1387 bio_put(bio); 1388 } 1389 } ··· 1458 return XFS_BUF_PTR(bp) + offset; 1459 1460 offset += bp->b_offset; 1461 + page = bp->b_pages[offset >> PAGE_SHIFT]; 1462 + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); 1463 } 1464 1465 /* ··· 1481 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1482 cpoff = xfs_buf_poff(boff + bp->b_offset); 1483 csize = min_t(size_t, 1484 + PAGE_SIZE-cpoff, bp->b_count_desired-boff); 1485 1486 + ASSERT(((csize + cpoff) <= PAGE_SIZE)); 1487 1488 switch (mode) { 1489 case XBRW_ZERO: ··· 1596 xfs_flush_buftarg(btp, 1); 1597 if (mp->m_flags & XFS_MOUNT_BARRIER) 1598 xfs_blkdev_issue_flush(btp); 1599 1600 kthread_stop(btp->bt_task); 1601 kmem_free(btp); ··· 1620 return EINVAL; 1621 } 1622 1623 return 0; 1624 } 1625 ··· 1643 struct block_device *bdev) 1644 { 1645 return xfs_setsize_buftarg_flags(btp, 1646 + PAGE_SIZE, bdev_logical_block_size(bdev), 0); 1647 } 1648 1649 int ··· 1653 unsigned int sectorsize) 1654 { 1655 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1656 } 1657 1658 STATIC int ··· 1717 btp->bt_mount = mp; 1718 btp->bt_dev = bdev->bd_dev; 1719 btp->bt_bdev = bdev; 1720 + btp->bt_bdi = blk_get_backing_dev_info(bdev); 1721 + if (!btp->bt_bdi) 1722 + goto error; 1723 + 1724 INIT_LIST_HEAD(&btp->bt_lru); 1725 spin_lock_init(&btp->bt_lru_lock); 1726 if (xfs_setsize_buftarg_early(btp, bdev)) 1727 goto error; 1728 if (xfs_alloc_delwrite_queue(btp, fsname)) 1729 goto error;
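The largest hunk above drops the page-cache backing for the buffer cache: buffers are now built from alloc_page() pages (or plain heap memory for sub-page buffers, tagged _XBF_KMEM), and per the "vmap: flush vmap aliases when mapping fails" commit, _xfs_buf_map_pages() retries vm_map_ram() after purging lazily-freed vmap aliases. Below is a minimal sketch of that retry idiom only; the helper name map_pages_with_retry is invented, while vm_map_ram() and vm_unmap_aliases() are the real interfaces from <linux/vmalloc.h>.

    /* Hypothetical helper, not from the patch: retry vm_map_ram() after
     * flushing lazily-freed vmap aliases, which can exhaust vmap space. */
    #include <linux/mm.h>
    #include <linux/vmalloc.h>

    static void *map_pages_with_retry(struct page **pages, unsigned int count)
    {
            void *addr;
            int retried = 0;

            do {
                    addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
                    if (addr)
                            return addr;        /* mapped successfully */
                    /* purge lazy vmap aliases to reclaim vmap address space */
                    vm_unmap_aliases();
            } while (retried++ <= 1);

            return NULL;                        /* caller falls back to ENOMEM */
    }

The bound of two attempts mirrors the merged code; when the second attempt still fails the real caller returns -ENOMEM.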
+4 -36
fs/xfs/linux-2.6/xfs_buf.h
··· 61 #define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 62 63 /* flags used only internally */ 64 - #define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */ 65 #define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 66 #define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 67 #define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 68 - 69 - /* 70 - * Special flag for supporting metadata blocks smaller than a FSB. 71 - * 72 - * In this case we can have multiple xfs_buf_t on a single page and 73 - * need to lock out concurrent xfs_buf_t readers as they only 74 - * serialise access to the buffer. 75 - * 76 - * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 77 - * between reads of the page. Hence we can have one thread read the 78 - * page and modify it, but then race with another thread that thinks 79 - * the page is not up-to-date and hence reads it again. 80 - * 81 - * The result is that the first modifcation to the page is lost. 82 - * This sort of AGF/AGI reading race can happen when unlinking inodes 83 - * that require truncation and results in the AGI unlinked list 84 - * modifications being lost. 85 - */ 86 - #define _XBF_PAGE_LOCKED (1 << 22) 87 88 typedef unsigned int xfs_buf_flags_t; 89 ··· 81 { XBF_LOCK, "LOCK" }, /* should never be set */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 84 - { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \ 85 { _XBF_PAGES, "PAGES" }, \ 86 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 87 - { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 88 - { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 89 - 90 91 typedef enum { 92 XBT_FORCE_SLEEP = 0, ··· 99 typedef struct xfs_buftarg { 100 dev_t bt_dev; 101 struct block_device *bt_bdev; 102 - struct address_space *bt_mapping; 103 struct xfs_mount *bt_mount; 104 unsigned int bt_bsize; 105 unsigned int bt_sshift; ··· 117 spinlock_t bt_lru_lock; 118 unsigned int bt_lru_nr; 119 } xfs_buftarg_t; 120 - 121 - /* 122 - * xfs_buf_t: Buffer structure for pagecache-based buffers 123 - * 124 - * This buffer structure is used by the pagecache buffer management routines 125 - * to refer to an assembly of pages forming a logical buffer. 126 - * 127 - * The buffer structure is used on a temporary basis only, and discarded when 128 - * released. The real data storage is recorded in the pagecache. Buffers are 129 - * hashed to the block device on which the file system resides. 130 - */ 131 132 struct xfs_buf; 133 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
··· 61 #define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 62 63 /* flags used only internally */ 64 #define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 65 #define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 66 + #define _XBF_KMEM (1 << 20)/* backed by heap memory */ 67 #define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 68 69 typedef unsigned int xfs_buf_flags_t; 70 ··· 100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 103 { _XBF_PAGES, "PAGES" }, \ 104 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 105 + { _XBF_KMEM, "KMEM" }, \ 106 + { _XBF_DELWRI_Q, "DELWRI_Q" } 107 108 typedef enum { 109 XBT_FORCE_SLEEP = 0, ··· 120 typedef struct xfs_buftarg { 121 dev_t bt_dev; 122 struct block_device *bt_bdev; 123 + struct backing_dev_info *bt_bdi; 124 struct xfs_mount *bt_mount; 125 unsigned int bt_bsize; 126 unsigned int bt_sshift; ··· 138 spinlock_t bt_lru_lock; 139 unsigned int bt_lru_nr; 140 } xfs_buftarg_t; 141 142 struct xfs_buf; 143 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+5 -1
fs/xfs/linux-2.6/xfs_file.c
··· 896 xfs_flock64_t bf; 897 xfs_inode_t *ip = XFS_I(inode); 898 int cmd = XFS_IOC_RESVSP; 899 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 901 return -EOPNOTSUPP; ··· 919 goto out_unlock; 920 } 921 922 - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); 923 if (error) 924 goto out_unlock; 925
··· 896 xfs_flock64_t bf; 897 xfs_inode_t *ip = XFS_I(inode); 898 int cmd = XFS_IOC_RESVSP; 899 + int attr_flags = XFS_ATTR_NOLOCK; 900 901 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 902 return -EOPNOTSUPP; ··· 918 goto out_unlock; 919 } 920 921 + if (file->f_flags & O_DSYNC) 922 + attr_flags |= XFS_ATTR_SYNC; 923 + 924 + error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); 925 if (error) 926 goto out_unlock; 927
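For context on the xfs_file.c hunk ("preallocation transactions do not need to be synchronous"): fallocate()-based preallocation is now only forced to disk when the file descriptor carries O_DSYNC. A purely illustrative userspace sketch follows; the path "testfile" and the 16 MiB size are arbitrary.

    /* Illustrative only: opening with O_DSYNC is what now makes the
     * preallocation transaction synchronous; without it the commit is async. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int err;
            int fd = open("testfile", O_CREAT | O_WRONLY | O_DSYNC, 0644);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* mode-0 fallocate: allocate 16 MiB and extend the file size */
            err = posix_fallocate(fd, 0, 16 * 1024 * 1024);
            if (err)
                    fprintf(stderr, "posix_fallocate: %s\n", strerror(err));
            close(fd);
            return err ? 1 : 0;
    }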
+4
fs/xfs/linux-2.6/xfs_ioctl.c
··· 624 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 626 attr_flags |= XFS_ATTR_NONBLOCK; 627 if (ioflags & IO_INVIS) 628 attr_flags |= XFS_ATTR_DMI; 629
··· 624 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 626 attr_flags |= XFS_ATTR_NONBLOCK; 627 + 628 + if (filp->f_flags & O_DSYNC) 629 + attr_flags |= XFS_ATTR_SYNC; 630 + 631 if (ioflags & IO_INVIS) 632 attr_flags |= XFS_ATTR_DMI; 633
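The same rule now applies to the legacy space-reservation ioctls handled in xfs_ioctl.c (XFS_IOC_RESVSP and friends). A hedged sketch using the xfsprogs xfsctl(3) wrapper; it assumes the usual <xfs/xfs.h> header and an existing file path, and the helper name reserve_space is invented.

    /* Illustrative only: XFS_IOC_RESVSP64 reserves space without changing the
     * file size; with this merge the transaction is synchronous only because
     * the descriptor was opened with O_DSYNC. */
    #include <fcntl.h>
    #include <unistd.h>
    #include <xfs/xfs.h>        /* xfsctl(), XFS_IOC_RESVSP64, xfs_flock64_t */

    static int reserve_space(const char *path, off_t len)
    {
            xfs_flock64_t fl = { .l_whence = SEEK_SET, .l_start = 0, .l_len = len };
            int fd = open(path, O_WRONLY | O_DSYNC);

            if (fd < 0)
                    return -1;
            if (xfsctl(path, fd, XFS_IOC_RESVSP64, &fl) < 0) {
                    close(fd);
                    return -1;
            }
            return close(fd);
    }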
+25 -11
fs/xfs/linux-2.6/xfs_super.c
··· 1078 error = 0; 1079 goto out_unlock; 1080 } 1081 - error = xfs_iflush(ip, 0); 1082 } 1083 1084 out_unlock: ··· 1539 if (error) 1540 goto out_free_sb; 1541 1542 - error = xfs_mountfs(mp); 1543 - if (error) 1544 - goto out_filestream_unmount; 1545 - 1546 sb->s_magic = XFS_SB_MAGIC; 1547 sb->s_blocksize = mp->m_sb.sb_blocksize; 1548 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1549 sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); 1550 sb->s_time_gran = 1; 1551 set_posix_acl_flag(sb); 1552 1553 root = igrab(VFS_I(mp->m_rootip)); 1554 if (!root) { ··· 1579 goto fail_vnrele; 1580 } 1581 1582 - error = xfs_syncd_init(mp); 1583 - if (error) 1584 - goto fail_vnrele; 1585 - 1586 - xfs_inode_shrinker_register(mp); 1587 - 1588 return 0; 1589 1590 out_filestream_unmount: 1591 xfs_filestream_unmount(mp); 1592 out_free_sb: ··· 1607 } 1608 1609 fail_unmount: 1610 /* 1611 * Blow away any referenced inode in the filestreams cache. 1612 * This can and will cause log traffic as inodes go inactive
··· 1078 error = 0; 1079 goto out_unlock; 1080 } 1081 + error = xfs_iflush(ip, SYNC_TRYLOCK); 1082 } 1083 1084 out_unlock: ··· 1539 if (error) 1540 goto out_free_sb; 1541 1542 + /* 1543 + * we must configure the block size in the superblock before we run the 1544 + * full mount process as the mount process can lookup and cache inodes. 1545 + * For the same reason we must also initialise the syncd and register 1546 + * the inode cache shrinker so that inodes can be reclaimed during 1547 + * operations like a quotacheck that iterate all inodes in the 1548 + * filesystem. 1549 + */ 1550 sb->s_magic = XFS_SB_MAGIC; 1551 sb->s_blocksize = mp->m_sb.sb_blocksize; 1552 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1553 sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); 1554 sb->s_time_gran = 1; 1555 set_posix_acl_flag(sb); 1556 + 1557 + error = xfs_syncd_init(mp); 1558 + if (error) 1559 + goto out_filestream_unmount; 1560 + 1561 + xfs_inode_shrinker_register(mp); 1562 + 1563 + error = xfs_mountfs(mp); 1564 + if (error) 1565 + goto out_syncd_stop; 1566 1567 root = igrab(VFS_I(mp->m_rootip)); 1568 if (!root) { ··· 1565 goto fail_vnrele; 1566 } 1567 1568 return 0; 1569 1570 + out_syncd_stop: 1571 + xfs_inode_shrinker_unregister(mp); 1572 + xfs_syncd_stop(mp); 1573 out_filestream_unmount: 1574 xfs_filestream_unmount(mp); 1575 out_free_sb: ··· 1596 } 1597 1598 fail_unmount: 1599 + xfs_inode_shrinker_unregister(mp); 1600 + xfs_syncd_stop(mp); 1601 + 1602 /* 1603 * Blow away any referenced inode in the filestreams cache. 1604 * This can and will cause log traffic as inodes go inactive
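A condensed sketch (not verbatim kernel code) of the ordering xfs_fs_fill_super() ends up with after the "register the inode cache shrinker before quotachecks" change: the block size, syncd and shrinker must all exist before xfs_mountfs() runs, because quotacheck walks (and may need to reclaim) every inode in the filesystem, and the error path unwinds in reverse order. The function name fill_super_sketch is invented; the calls it makes are the ones visible in the diff.

    /* Condensed ordering sketch; the real function does considerably more
     * setup around these calls. */
    static int fill_super_sketch(struct super_block *sb, struct xfs_mount *mp)
    {
            int error;

            sb->s_blocksize = mp->m_sb.sb_blocksize;        /* geometry first */

            error = xfs_syncd_init(mp);                     /* reclaim workers */
            if (error)
                    return error;
            xfs_inode_shrinker_register(mp);                /* memory reclaim hook */

            error = xfs_mountfs(mp);                        /* may run quotacheck */
            if (error)
                    goto out_syncd_stop;
            return 0;

    out_syncd_stop:
            xfs_inode_shrinker_unregister(mp);              /* unwind in reverse */
            xfs_syncd_stop(mp);
            return error;
    }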
+27 -3
fs/xfs/linux-2.6/xfs_sync.c
··· 761 struct xfs_perag *pag, 762 int sync_mode) 763 { 764 - int error = 0; 765 766 xfs_ilock(ip, XFS_ILOCK_EXCL); 767 if (!xfs_iflock_nowait(ip)) { 768 if (!(sync_mode & SYNC_WAIT)) ··· 790 if (xfs_inode_clean(ip)) 791 goto reclaim; 792 793 - /* Now we have an inode that needs flushing */ 794 - error = xfs_iflush(ip, sync_mode); 795 if (sync_mode & SYNC_WAIT) { 796 xfs_iflock(ip); 797 goto reclaim; 798 }
··· 761 struct xfs_perag *pag, 762 int sync_mode) 763 { 764 + int error; 765 766 + restart: 767 + error = 0; 768 xfs_ilock(ip, XFS_ILOCK_EXCL); 769 if (!xfs_iflock_nowait(ip)) { 770 if (!(sync_mode & SYNC_WAIT)) ··· 788 if (xfs_inode_clean(ip)) 789 goto reclaim; 790 791 + /* 792 + * Now we have an inode that needs flushing. 793 + * 794 + * We do a nonblocking flush here even if we are doing a SYNC_WAIT 795 + * reclaim as we can deadlock with inode cluster removal. 796 + * xfs_ifree_cluster() can lock the inode buffer before it locks the 797 + * ip->i_lock, and we are doing the exact opposite here. As a result, 798 + * doing a blocking xfs_itobp() to get the cluster buffer will result 799 + * in an ABBA deadlock with xfs_ifree_cluster(). 800 + * 801 + * As xfs_ifree_cluser() must gather all inodes that are active in the 802 + * cache to mark them stale, if we hit this case we don't actually want 803 + * to do IO here - we want the inode marked stale so we can simply 804 + * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, 805 + * just unlock the inode, back off and try again. Hopefully the next 806 + * pass through will see the stale flag set on the inode. 807 + */ 808 + error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); 809 if (sync_mode & SYNC_WAIT) { 810 + if (error == EAGAIN) { 811 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 812 + /* backoff longer than in xfs_ifree_cluster */ 813 + delay(2); 814 + goto restart; 815 + } 816 xfs_iflock(ip); 817 goto reclaim; 818 }
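The long comment added above describes an ABBA lock-ordering deadlock: reclaim takes ip->i_lock and then the inode cluster buffer, while xfs_ifree_cluster() takes the cluster buffer and then ip->i_lock. The tiny userspace demonstration below is purely illustrative, with pthread mutexes standing in for the XFS locks, and shows why the fix is a trylock plus back-off rather than a blocking lock.

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;  /* ~ ip->i_lock */
    static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;  /* ~ cluster buffer */

    static void *reclaim_like(void *arg)
    {
            (void)arg;
            for (;;) {
                    pthread_mutex_lock(&lock_a);
                    /* blocking here is the ABBA bug; trylock + backoff is the fix */
                    if (pthread_mutex_trylock(&lock_b) == 0)
                            break;
                    pthread_mutex_unlock(&lock_a);
                    usleep(2000);               /* back off, then retry from the top */
            }
            puts("reclaim: A then B");
            pthread_mutex_unlock(&lock_b);
            pthread_mutex_unlock(&lock_a);
            return NULL;
    }

    static void *ifree_cluster_like(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock_b);        /* opposite order: B then A */
            pthread_mutex_lock(&lock_a);
            puts("ifree_cluster: B then A");
            pthread_mutex_unlock(&lock_a);
            pthread_mutex_unlock(&lock_b);
            return NULL;
    }

    int main(void)
    {
            pthread_t t1, t2;

            pthread_create(&t1, NULL, reclaim_like, NULL);
            pthread_create(&t2, NULL, ifree_cluster_like, NULL);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);
            return 0;
    }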
+1 -1
fs/xfs/xfs_inode.c
··· 2835 * Get the buffer containing the on-disk inode. 2836 */ 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2838 - (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2839 if (error || !bp) { 2840 xfs_ifunlock(ip); 2841 return error;
··· 2835 * Get the buffer containing the on-disk inode. 2836 */ 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2838 + (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); 2839 if (error || !bp) { 2840 xfs_ifunlock(ip); 2841 return error;
+3 -3
fs/xfs/xfs_inode_item.c
··· 760 * Push the inode to it's backing buffer. This will not remove the 761 * inode from the AIL - a further push will be required to trigger a 762 * buffer push. However, this allows all the dirty inodes to be pushed 763 - * to the buffer before it is pushed to disk. THe buffer IO completion 764 - * will pull th einode from the AIL, mark it clean and unlock the flush 765 * lock. 766 */ 767 - (void) xfs_iflush(ip, 0); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 769 } 770
··· 760 * Push the inode to it's backing buffer. This will not remove the 761 * inode from the AIL - a further push will be required to trigger a 762 * buffer push. However, this allows all the dirty inodes to be pushed 763 + * to the buffer before it is pushed to disk. The buffer IO completion 764 + * will pull the inode from the AIL, mark it clean and unlock the flush 765 * lock. 766 */ 767 + (void) xfs_iflush(ip, SYNC_TRYLOCK); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 769 } 770
+2 -1
fs/xfs/xfs_trans_buf.c
··· 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 384 if (bp == NULL) { 385 *bpp = NULL; 386 - return 0; 387 } 388 if (XFS_BUF_GETERROR(bp) != 0) { 389 XFS_BUF_SUPER_STALE(bp);
··· 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 384 if (bp == NULL) { 385 *bpp = NULL; 386 + return (flags & XBF_TRYLOCK) ? 387 + 0 : XFS_ERROR(ENOMEM); 388 } 389 if (XFS_BUF_GETERROR(bp) != 0) { 390 XFS_BUF_SUPER_STALE(bp);
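This xfs_trans_buf.c hunk is what the "xfs_trans_read_buf() should return an error on failure" commit refers to: callers that did not ask for a trylock now get a hard ENOMEM instead of a silent NULL buffer with a zero return. A hypothetical caller sketch follows; read_buf_caller_sketch and its arguments are invented, while xfs_trans_read_buf() is used with the signature this tree has.

    /* Hypothetical caller, for illustration of the new return convention. */
    STATIC int
    read_buf_caller_sketch(
            struct xfs_mount        *mp,
            struct xfs_trans        *tp,
            xfs_daddr_t             blkno,
            int                     len,
            uint                    flags,
            struct xfs_buf          **bpp)
    {
            int     error;

            error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno, len,
                                       flags, bpp);
            if (error)
                    return error;   /* now includes ENOMEM when no buffer was got */
            if (!*bpp) {
                    /* NULL with a zero return now only happens for trylock callers */
                    ASSERT(flags & XBF_TRYLOCK);
                    return 0;       /* readahead-style miss, nothing to do */
            }
            /* ... use *bpp, then xfs_trans_brelse(tp, *bpp) when done ... */
            return 0;
    }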
+2 -1
fs/xfs/xfs_vnodeops.c
··· 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2832 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2834 - xfs_trans_set_sync(tp); 2835 2836 error = xfs_trans_commit(tp, 0); 2837
··· 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2832 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2834 + if (attr_flags & XFS_ATTR_SYNC) 2835 + xfs_trans_set_sync(tp); 2836 2837 error = xfs_trans_commit(tp, 0); 2838
+1
fs/xfs/xfs_vnodeops.h
··· 18 #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19 #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20 #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21 22 int xfs_readlink(struct xfs_inode *ip, char *link); 23 int xfs_release(struct xfs_inode *ip);
··· 18 #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19 #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20 #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21 + #define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ 22 23 int xfs_readlink(struct xfs_inode *ip, char *link); 24 int xfs_release(struct xfs_inode *ip);