Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew)

Merge fourth patch-bomb from Andrew Morton:
"A lot more stuff than expected, sorry. A bunch of ocfs2 reviewing was
finished off.

- mhocko's oom-reaper out-of-memory-handler changes

- ocfs2 fixes and features

- KASAN feature work

- various fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (42 commits)
thp: fix typo in khugepaged_scan_pmd()
MAINTAINERS: fill entries for KASAN
mm/filemap: generic_file_read_iter(): check for zero reads unconditionally
kasan: test fix: warn if the UAF could not be detected in kmalloc_uaf2
mm, kasan: stackdepot implementation. Enable stackdepot for SLAB
arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections
mm, kasan: add GFP flags to KASAN API
mm, kasan: SLAB support
kasan: modify kmalloc_large_oob_right(), add kmalloc_pagealloc_oob_right()
include/linux/oom.h: remove undefined oom_kills_count()/note_oom_kill()
mm/page_alloc: prevent merging between isolated and other pageblocks
drivers/memstick/host/r592.c: avoid gcc-6 warning
ocfs2: extend enough credits for freeing one truncate record while replaying truncate records
ocfs2: extend transaction for ocfs2_remove_rightmost_path() and ocfs2_update_edge_lengths() before to avoid inconsistency between inode and et
ocfs2/dlm: move lock to the tail of grant queue while doing in-place convert
ocfs2: solve a problem of crossing the boundary in updating backups
ocfs2: fix occurring deadlock by changing ocfs2_wq from global to local
ocfs2/dlm: fix BUG in dlm_move_lockres_to_recovery_list
ocfs2/dlm: fix race between convert and recovery
ocfs2: fix a deadlock issue in ocfs2_dio_end_io_write()
...

+1769 -961
+2 -3
Documentation/kasan.txt
··· 12 12 therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is 13 13 required for detection of out-of-bounds accesses to stack or global variables. 14 14 15 - Currently KASAN is supported only for x86_64 architecture and requires the 16 - kernel to be built with the SLUB allocator. 15 + Currently KASAN is supported only for x86_64 architecture. 17 16 18 17 1. Usage 19 18 ======== ··· 26 27 the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC 27 28 version 5.0 or later. 28 29 29 - Currently KASAN works only with the SLUB memory allocator. 30 + KASAN works with both SLUB and SLAB memory allocators. 30 31 For better bug detection and nicer reporting, enable CONFIG_STACKTRACE. 31 32 32 33 To disable instrumentation for specific files or directories, add a line
+14
MAINTAINERS
··· 6165 6165 F: Documentation/hwmon/k8temp 6166 6166 F: drivers/hwmon/k8temp.c 6167 6167 6168 + KASAN 6169 + M: Andrey Ryabinin <aryabinin@virtuozzo.com> 6170 + R: Alexander Potapenko <glider@google.com> 6171 + R: Dmitry Vyukov <dvyukov@google.com> 6172 + L: kasan-dev@googlegroups.com 6173 + S: Maintained 6174 + F: arch/*/include/asm/kasan.h 6175 + F: arch/*/mm/kasan_init* 6176 + F: Documentation/kasan.txt 6177 + F: include/linux/kasan.h 6178 + F: lib/test_kasan.c 6179 + F: mm/kasan/ 6180 + F: scripts/Makefile.kasan 6181 + 6168 6182 KCONFIG 6169 6183 M: "Yann E. MORIN" <yann.morin.1998@free.fr> 6170 6184 L: linux-kbuild@vger.kernel.org
+1 -1
arch/arm/include/asm/exception.h
··· 7 7 #ifndef __ASM_ARM_EXCEPTION_H 8 8 #define __ASM_ARM_EXCEPTION_H 9 9 10 - #include <linux/ftrace.h> 10 + #include <linux/interrupt.h> 11 11 12 12 #define __exception __attribute__((section(".exception.text"))) 13 13 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+1
arch/arm/kernel/vmlinux.lds.S
··· 108 108 *(.exception.text) 109 109 __exception_text_end = .; 110 110 IRQENTRY_TEXT 111 + SOFTIRQENTRY_TEXT 111 112 TEXT_TEXT 112 113 SCHED_TEXT 113 114 LOCK_TEXT
+1 -1
arch/arm64/include/asm/exception.h
··· 18 18 #ifndef __ASM_EXCEPTION_H 19 19 #define __ASM_EXCEPTION_H 20 20 21 - #include <linux/ftrace.h> 21 + #include <linux/interrupt.h> 22 22 23 23 #define __exception __attribute__((section(".exception.text"))) 24 24 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+1
arch/arm64/kernel/vmlinux.lds.S
··· 103 103 *(.exception.text) 104 104 __exception_text_end = .; 105 105 IRQENTRY_TEXT 106 + SOFTIRQENTRY_TEXT 106 107 TEXT_TEXT 107 108 SCHED_TEXT 108 109 LOCK_TEXT
+1
arch/blackfin/kernel/vmlinux.lds.S
··· 35 35 #endif 36 36 LOCK_TEXT 37 37 IRQENTRY_TEXT 38 + SOFTIRQENTRY_TEXT 38 39 KPROBES_TEXT 39 40 #ifdef CONFIG_ROMKERNEL 40 41 __sinittext = .;
+1
arch/c6x/kernel/vmlinux.lds.S
··· 72 72 SCHED_TEXT 73 73 LOCK_TEXT 74 74 IRQENTRY_TEXT 75 + SOFTIRQENTRY_TEXT 75 76 KPROBES_TEXT 76 77 *(.fixup) 77 78 *(.gnu.warning)
+1
arch/metag/kernel/vmlinux.lds.S
··· 24 24 LOCK_TEXT 25 25 KPROBES_TEXT 26 26 IRQENTRY_TEXT 27 + SOFTIRQENTRY_TEXT 27 28 *(.text.*) 28 29 *(.gnu.warning) 29 30 }
+1
arch/microblaze/kernel/vmlinux.lds.S
··· 36 36 LOCK_TEXT 37 37 KPROBES_TEXT 38 38 IRQENTRY_TEXT 39 + SOFTIRQENTRY_TEXT 39 40 . = ALIGN (4) ; 40 41 _etext = . ; 41 42 }
+1
arch/mips/kernel/vmlinux.lds.S
··· 58 58 LOCK_TEXT 59 59 KPROBES_TEXT 60 60 IRQENTRY_TEXT 61 + SOFTIRQENTRY_TEXT 61 62 *(.text.*) 62 63 *(.fixup) 63 64 *(.gnu.warning)
+1
arch/nios2/kernel/vmlinux.lds.S
··· 39 39 SCHED_TEXT 40 40 LOCK_TEXT 41 41 IRQENTRY_TEXT 42 + SOFTIRQENTRY_TEXT 42 43 KPROBES_TEXT 43 44 } =0 44 45 _etext = .;
+1
arch/openrisc/kernel/vmlinux.lds.S
··· 50 50 LOCK_TEXT 51 51 KPROBES_TEXT 52 52 IRQENTRY_TEXT 53 + SOFTIRQENTRY_TEXT 53 54 *(.fixup) 54 55 *(.text.__*) 55 56 _etext = .;
+1
arch/parisc/kernel/vmlinux.lds.S
··· 72 72 LOCK_TEXT 73 73 KPROBES_TEXT 74 74 IRQENTRY_TEXT 75 + SOFTIRQENTRY_TEXT 75 76 *(.text.do_softirq) 76 77 *(.text.sys_exit) 77 78 *(.text.do_sigaltstack)
+1
arch/powerpc/kernel/vmlinux.lds.S
··· 55 55 LOCK_TEXT 56 56 KPROBES_TEXT 57 57 IRQENTRY_TEXT 58 + SOFTIRQENTRY_TEXT 58 59 59 60 #ifdef CONFIG_PPC32 60 61 *(.got1)
+1
arch/s390/kernel/vmlinux.lds.S
··· 28 28 LOCK_TEXT 29 29 KPROBES_TEXT 30 30 IRQENTRY_TEXT 31 + SOFTIRQENTRY_TEXT 31 32 *(.fixup) 32 33 *(.gnu.warning) 33 34 } :text = 0x0700
+1
arch/sh/kernel/vmlinux.lds.S
··· 39 39 LOCK_TEXT 40 40 KPROBES_TEXT 41 41 IRQENTRY_TEXT 42 + SOFTIRQENTRY_TEXT 42 43 *(.fixup) 43 44 *(.gnu.warning) 44 45 _etext = .; /* End of text section */
+1
arch/sparc/kernel/vmlinux.lds.S
··· 48 48 LOCK_TEXT 49 49 KPROBES_TEXT 50 50 IRQENTRY_TEXT 51 + SOFTIRQENTRY_TEXT 51 52 *(.gnu.warning) 52 53 } = 0 53 54 _etext = .;
+1
arch/tile/kernel/vmlinux.lds.S
··· 45 45 LOCK_TEXT 46 46 KPROBES_TEXT 47 47 IRQENTRY_TEXT 48 + SOFTIRQENTRY_TEXT 48 49 __fix_text_end = .; /* tile-cpack won't rearrange before this */ 49 50 ALIGN_FUNCTION(); 50 51 *(.hottext*)
+1
arch/x86/kernel/Makefile
··· 19 19 KASAN_SANITIZE_head$(BITS).o := n 20 20 KASAN_SANITIZE_dumpstack.o := n 21 21 KASAN_SANITIZE_dumpstack_$(BITS).o := n 22 + KASAN_SANITIZE_stacktrace.o := n 22 23 23 24 OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y 24 25 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
+1
arch/x86/kernel/vmlinux.lds.S
··· 101 101 KPROBES_TEXT 102 102 ENTRY_TEXT 103 103 IRQENTRY_TEXT 104 + SOFTIRQENTRY_TEXT 104 105 *(.fixup) 105 106 *(.gnu.warning) 106 107 /* End of text section */
+3 -3
drivers/input/input-compat.c
··· 17 17 int input_event_from_user(const char __user *buffer, 18 18 struct input_event *event) 19 19 { 20 - if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 20 + if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 21 21 struct input_event_compat compat_event; 22 22 23 23 if (copy_from_user(&compat_event, buffer, ··· 41 41 int input_event_to_user(char __user *buffer, 42 42 const struct input_event *event) 43 43 { 44 - if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) { 44 + if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 45 45 struct input_event_compat compat_event; 46 46 47 47 compat_event.time.tv_sec = event->time.tv_sec; ··· 65 65 int input_ff_effect_from_user(const char __user *buffer, size_t size, 66 66 struct ff_effect *effect) 67 67 { 68 - if (INPUT_COMPAT_TEST) { 68 + if (in_compat_syscall()) { 69 69 struct ff_effect_compat *compat_effect; 70 70 71 71 if (size != sizeof(struct ff_effect_compat))
+1 -3
drivers/input/input-compat.h
··· 17 17 18 18 #ifdef CONFIG_COMPAT 19 19 20 - #define INPUT_COMPAT_TEST in_compat_syscall() 21 - 22 20 struct input_event_compat { 23 21 struct compat_timeval time; 24 22 __u16 type; ··· 55 57 56 58 static inline size_t input_event_size(void) 57 59 { 58 - return (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) ? 60 + return (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) ? 59 61 sizeof(struct input_event_compat) : sizeof(struct input_event); 60 62 } 61 63
+1 -1
drivers/input/input.c
··· 1015 1015 { 1016 1016 int len = 0; 1017 1017 1018 - if (INPUT_COMPAT_TEST) { 1018 + if (in_compat_syscall()) { 1019 1019 u32 dword = bits >> 32; 1020 1020 if (dword || !skip_empty) 1021 1021 len += snprintf(buf, buf_size, "%x ", dword);
+2 -2
drivers/input/misc/uinput.c
··· 664 664 static int uinput_ff_upload_to_user(char __user *buffer, 665 665 const struct uinput_ff_upload *ff_up) 666 666 { 667 - if (INPUT_COMPAT_TEST) { 667 + if (in_compat_syscall()) { 668 668 struct uinput_ff_upload_compat ff_up_compat; 669 669 670 670 ff_up_compat.request_id = ff_up->request_id; ··· 695 695 static int uinput_ff_upload_from_user(const char __user *buffer, 696 696 struct uinput_ff_upload *ff_up) 697 697 { 698 - if (INPUT_COMPAT_TEST) { 698 + if (in_compat_syscall()) { 699 699 struct uinput_ff_upload_compat ff_up_compat; 700 700 701 701 if (copy_from_user(&ff_up_compat, buffer,
+1 -2
drivers/memstick/host/r592.c
··· 298 298 sg_count = dma_map_sg(&dev->pci_dev->dev, &dev->req->sg, 1, is_write ? 299 299 PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); 300 300 301 - if (sg_count != 1 || 302 - (sg_dma_len(&dev->req->sg) < dev->req->sg.length)) { 301 + if (sg_count != 1 || sg_dma_len(&dev->req->sg) < R592_LFIFO_SIZE) { 303 302 message("problem in dma_map_sg"); 304 303 return -EIO; 305 304 }
+64 -41
fs/ocfs2/alloc.c
··· 2516 2516 struct ocfs2_extent_block *eb; 2517 2517 u32 range; 2518 2518 2519 - /* 2520 - * In normal tree rotation process, we will never touch the 2521 - * tree branch above subtree_index and ocfs2_extend_rotate_transaction 2522 - * doesn't reserve the credits for them either. 2523 - * 2524 - * But we do have a special case here which will update the rightmost 2525 - * records for all the bh in the path. 2526 - * So we have to allocate extra credits and access them. 2527 - */ 2528 - ret = ocfs2_extend_trans(handle, subtree_index); 2529 - if (ret) { 2530 - mlog_errno(ret); 2531 - goto out; 2532 - } 2533 - 2534 2519 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2535 2520 if (ret) { 2536 2521 mlog_errno(ret); ··· 2941 2956 right_path->p_node[subtree_root].bh->b_blocknr, 2942 2957 right_path->p_tree_depth); 2943 2958 2944 - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2959 + ret = ocfs2_extend_rotate_transaction(handle, 0, 2945 2960 orig_credits, left_path); 2946 2961 if (ret) { 2947 2962 mlog_errno(ret); ··· 3014 3029 struct ocfs2_extent_block *eb; 3015 3030 struct ocfs2_extent_list *el; 3016 3031 3017 - 3018 3032 ret = ocfs2_et_sanity_check(et); 3019 3033 if (ret) 3020 3034 goto out; 3021 - /* 3022 - * There's two ways we handle this depending on 3023 - * whether path is the only existing one. 
3024 - */ 3025 - ret = ocfs2_extend_rotate_transaction(handle, 0, 3026 - handle->h_buffer_credits, 3027 - path); 3028 - if (ret) { 3029 - mlog_errno(ret); 3030 - goto out; 3031 - } 3032 3035 3033 3036 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 3034 3037 if (ret) { ··· 3614 3641 */ 3615 3642 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3616 3643 le16_to_cpu(el->l_next_free_rec) == 1) { 3644 + /* extend credit for ocfs2_remove_rightmost_path */ 3645 + ret = ocfs2_extend_rotate_transaction(handle, 0, 3646 + handle->h_buffer_credits, 3647 + right_path); 3648 + if (ret) { 3649 + mlog_errno(ret); 3650 + goto out; 3651 + } 3617 3652 3618 3653 ret = ocfs2_remove_rightmost_path(handle, et, 3619 3654 right_path, ··· 3660 3679 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3661 3680 3662 3681 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3682 + /* extend credit for ocfs2_remove_rightmost_path */ 3683 + ret = ocfs2_extend_rotate_transaction(handle, 0, 3684 + handle->h_buffer_credits, 3685 + path); 3686 + if (ret) { 3687 + mlog_errno(ret); 3688 + goto out; 3689 + } 3663 3690 /* 3664 3691 * The merge code will need to create an empty 3665 3692 * extent to take the place of the newly ··· 3716 3727 */ 3717 3728 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3718 3729 3730 + /* extend credit for ocfs2_remove_rightmost_path */ 3731 + ret = ocfs2_extend_rotate_transaction(handle, 0, 3732 + handle->h_buffer_credits, 3733 + path); 3734 + if (ret) { 3735 + mlog_errno(ret); 3736 + goto out; 3737 + } 3738 + 3719 3739 /* The merge left us with an empty extent, remove it. 
*/ 3720 3740 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3721 3741 if (ret) { ··· 3741 3743 ret = ocfs2_merge_rec_left(path, handle, et, rec, 3742 3744 dealloc, split_index); 3743 3745 3746 + if (ret) { 3747 + mlog_errno(ret); 3748 + goto out; 3749 + } 3750 + 3751 + /* extend credit for ocfs2_remove_rightmost_path */ 3752 + ret = ocfs2_extend_rotate_transaction(handle, 0, 3753 + handle->h_buffer_credits, 3754 + path); 3744 3755 if (ret) { 3745 3756 mlog_errno(ret); 3746 3757 goto out; ··· 3790 3783 } 3791 3784 3792 3785 if (ctxt->c_split_covers_rec) { 3786 + /* extend credit for ocfs2_remove_rightmost_path */ 3787 + ret = ocfs2_extend_rotate_transaction(handle, 0, 3788 + handle->h_buffer_credits, 3789 + path); 3790 + if (ret) { 3791 + mlog_errno(ret); 3792 + ret = 0; 3793 + goto out; 3794 + } 3795 + 3793 3796 /* 3794 3797 * The merge may have left an empty extent in 3795 3798 * our leaf. Try to rotate it away. ··· 5359 5342 struct ocfs2_extent_block *eb; 5360 5343 5361 5344 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5345 + /* extend credit for ocfs2_remove_rightmost_path */ 5346 + ret = ocfs2_extend_rotate_transaction(handle, 0, 5347 + handle->h_buffer_credits, 5348 + path); 5349 + if (ret) { 5350 + mlog_errno(ret); 5351 + goto out; 5352 + } 5353 + 5362 5354 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5363 5355 if (ret) { 5364 5356 mlog_errno(ret); ··· 5954 5928 5955 5929 ocfs2_journal_dirty(handle, tl_bh); 5956 5930 5957 - /* TODO: Perhaps we can calculate the bulk of the 5958 - * credits up front rather than extending like 5959 - * this. 
*/ 5960 - status = ocfs2_extend_trans(handle, 5961 - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5962 - if (status < 0) { 5963 - mlog_errno(status); 5964 - goto bail; 5965 - } 5966 - 5967 5931 rec = tl->tl_recs[i]; 5968 5932 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5969 5933 le32_to_cpu(rec.t_start)); ··· 5973 5957 mlog_errno(status); 5974 5958 goto bail; 5975 5959 } 5960 + } 5961 + 5962 + status = ocfs2_extend_trans(handle, 5963 + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5964 + if (status < 0) { 5965 + mlog_errno(status); 5966 + goto bail; 5976 5967 } 5977 5968 i--; 5978 5969 } ··· 6039 6016 goto out_mutex; 6040 6017 } 6041 6018 6042 - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6019 + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 6043 6020 if (IS_ERR(handle)) { 6044 6021 status = PTR_ERR(handle); 6045 6022 mlog_errno(status); ··· 6102 6079 if (cancel) 6103 6080 cancel_delayed_work(&osb->osb_truncate_log_wq); 6104 6081 6105 - queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6082 + queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq, 6106 6083 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6107 6084 } 6108 6085 } ··· 6276 6253 6277 6254 if (tl_inode) { 6278 6255 cancel_delayed_work(&osb->osb_truncate_log_wq); 6279 - flush_workqueue(ocfs2_wq); 6256 + flush_workqueue(osb->ocfs2_wq); 6280 6257 6281 6258 status = ocfs2_flush_truncate_log(osb); 6282 6259 if (status < 0)
+560 -584
fs/ocfs2/aops.c
··· 499 499 return status; 500 500 } 501 501 502 - /* 503 - * TODO: Make this into a generic get_blocks function. 504 - * 505 - * From do_direct_io in direct-io.c: 506 - * "So what we do is to permit the ->get_blocks function to populate 507 - * bh.b_size with the size of IO which is permitted at this offset and 508 - * this i_blkbits." 509 - * 510 - * This function is called directly from get_more_blocks in direct-io.c. 511 - * 512 - * called like this: dio->get_blocks(dio->inode, fs_startblk, 513 - * fs_count, map_bh, dio->rw == WRITE); 514 - */ 515 - static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 516 - struct buffer_head *bh_result, int create) 517 - { 518 - int ret; 519 - u32 cpos = 0; 520 - int alloc_locked = 0; 521 - u64 p_blkno, inode_blocks, contig_blocks; 522 - unsigned int ext_flags; 523 - unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 524 - unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 525 - unsigned long len = bh_result->b_size; 526 - unsigned int clusters_to_alloc = 0, contig_clusters = 0; 527 - 528 - cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); 529 - 530 - /* This function won't even be called if the request isn't all 531 - * nicely aligned and of the right size, so there's no need 532 - * for us to check any of that. */ 533 - 534 - inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 535 - 536 - down_read(&OCFS2_I(inode)->ip_alloc_sem); 537 - 538 - /* This figures out the size of the next contiguous block, and 539 - * our logical offset */ 540 - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 541 - &contig_blocks, &ext_flags); 542 - up_read(&OCFS2_I(inode)->ip_alloc_sem); 543 - 544 - if (ret) { 545 - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 546 - (unsigned long long)iblock); 547 - ret = -EIO; 548 - goto bail; 549 - } 550 - 551 - /* We should already CoW the refcounted extent in case of create. 
*/ 552 - BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); 553 - 554 - /* allocate blocks if no p_blkno is found, and create == 1 */ 555 - if (!p_blkno && create) { 556 - ret = ocfs2_inode_lock(inode, NULL, 1); 557 - if (ret < 0) { 558 - mlog_errno(ret); 559 - goto bail; 560 - } 561 - 562 - alloc_locked = 1; 563 - 564 - down_write(&OCFS2_I(inode)->ip_alloc_sem); 565 - 566 - /* fill hole, allocate blocks can't be larger than the size 567 - * of the hole */ 568 - clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); 569 - contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, 570 - contig_blocks); 571 - if (clusters_to_alloc > contig_clusters) 572 - clusters_to_alloc = contig_clusters; 573 - 574 - /* allocate extent and insert them into the extent tree */ 575 - ret = ocfs2_extend_allocation(inode, cpos, 576 - clusters_to_alloc, 0); 577 - if (ret < 0) { 578 - up_write(&OCFS2_I(inode)->ip_alloc_sem); 579 - mlog_errno(ret); 580 - goto bail; 581 - } 582 - 583 - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 584 - &contig_blocks, &ext_flags); 585 - if (ret < 0) { 586 - up_write(&OCFS2_I(inode)->ip_alloc_sem); 587 - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 588 - (unsigned long long)iblock); 589 - ret = -EIO; 590 - goto bail; 591 - } 592 - set_buffer_new(bh_result); 593 - up_write(&OCFS2_I(inode)->ip_alloc_sem); 594 - } 595 - 596 - /* 597 - * get_more_blocks() expects us to describe a hole by clearing 598 - * the mapped bit on bh_result(). 599 - * 600 - * Consider an unwritten extent as a hole. 601 - */ 602 - if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 603 - map_bh(bh_result, inode->i_sb, p_blkno); 604 - else 605 - clear_buffer_mapped(bh_result); 606 - 607 - /* make sure we don't map more than max_blocks blocks here as 608 - that's all the kernel will handle at this point. 
*/ 609 - if (max_blocks < contig_blocks) 610 - contig_blocks = max_blocks; 611 - bh_result->b_size = contig_blocks << blocksize_bits; 612 - bail: 613 - if (alloc_locked) 614 - ocfs2_inode_unlock(inode, 1); 615 - return ret; 616 - } 617 - 618 - /* 619 - * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 620 - * particularly interested in the aio/dio case. We use the rw_lock DLM lock 621 - * to protect io on one node from truncation on another. 622 - */ 623 - static int ocfs2_dio_end_io(struct kiocb *iocb, 624 - loff_t offset, 625 - ssize_t bytes, 626 - void *private) 627 - { 628 - struct inode *inode = file_inode(iocb->ki_filp); 629 - int level; 630 - 631 - if (bytes <= 0) 632 - return 0; 633 - 634 - /* this io's submitter should not have unlocked this before we could */ 635 - BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 636 - 637 - if (ocfs2_iocb_is_unaligned_aio(iocb)) { 638 - ocfs2_iocb_clear_unaligned_aio(iocb); 639 - 640 - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 641 - } 642 - 643 - /* Let rw unlock to be done later to protect append direct io write */ 644 - if (offset + bytes <= i_size_read(inode)) { 645 - ocfs2_iocb_clear_rw_locked(iocb); 646 - 647 - level = ocfs2_iocb_rw_locked_level(iocb); 648 - ocfs2_rw_unlock(inode, level); 649 - } 650 - 651 - return 0; 652 - } 653 - 654 502 static int ocfs2_releasepage(struct page *page, gfp_t wait) 655 503 { 656 504 if (!page_has_buffers(page)) 657 505 return 0; 658 506 return try_to_free_buffers(page); 659 - } 660 - 661 - static int ocfs2_is_overwrite(struct ocfs2_super *osb, 662 - struct inode *inode, loff_t offset) 663 - { 664 - int ret = 0; 665 - u32 v_cpos = 0; 666 - u32 p_cpos = 0; 667 - unsigned int num_clusters = 0; 668 - unsigned int ext_flags = 0; 669 - 670 - v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); 671 - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, 672 - &num_clusters, &ext_flags); 673 - if (ret < 0) { 674 - mlog_errno(ret); 675 - return ret; 676 - } 677 - 678 - if 
(p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 679 - return 1; 680 - 681 - return 0; 682 - } 683 - 684 - static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, 685 - struct inode *inode, loff_t offset, 686 - u64 zero_len, int cluster_align) 687 - { 688 - u32 p_cpos = 0; 689 - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); 690 - unsigned int num_clusters = 0; 691 - unsigned int ext_flags = 0; 692 - int ret = 0; 693 - 694 - if (offset <= i_size_read(inode) || cluster_align) 695 - return 0; 696 - 697 - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, 698 - &ext_flags); 699 - if (ret < 0) { 700 - mlog_errno(ret); 701 - return ret; 702 - } 703 - 704 - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 705 - u64 s = i_size_read(inode); 706 - sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + 707 - (do_div(s, osb->s_clustersize) >> 9); 708 - 709 - ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, 710 - zero_len >> 9, GFP_NOFS, false); 711 - if (ret < 0) 712 - mlog_errno(ret); 713 - } 714 - 715 - return ret; 716 - } 717 - 718 - static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, 719 - struct inode *inode, loff_t offset) 720 - { 721 - u64 zero_start, zero_len, total_zero_len; 722 - u32 p_cpos = 0, clusters_to_add; 723 - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); 724 - unsigned int num_clusters = 0; 725 - unsigned int ext_flags = 0; 726 - u32 size_div, offset_div; 727 - int ret = 0; 728 - 729 - { 730 - u64 o = offset; 731 - u64 s = i_size_read(inode); 732 - 733 - offset_div = do_div(o, osb->s_clustersize); 734 - size_div = do_div(s, osb->s_clustersize); 735 - } 736 - 737 - if (offset <= i_size_read(inode)) 738 - return 0; 739 - 740 - clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - 741 - ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); 742 - total_zero_len = offset - i_size_read(inode); 743 - if (clusters_to_add) 744 - total_zero_len -= offset_div; 
745 - 746 - /* Allocate clusters to fill out holes, and this is only needed 747 - * when we add more than one clusters. Otherwise the cluster will 748 - * be allocated during direct IO */ 749 - if (clusters_to_add > 1) { 750 - ret = ocfs2_extend_allocation(inode, 751 - OCFS2_I(inode)->ip_clusters, 752 - clusters_to_add - 1, 0); 753 - if (ret) { 754 - mlog_errno(ret); 755 - goto out; 756 - } 757 - } 758 - 759 - while (total_zero_len) { 760 - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, 761 - &ext_flags); 762 - if (ret < 0) { 763 - mlog_errno(ret); 764 - goto out; 765 - } 766 - 767 - zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + 768 - size_div; 769 - zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - 770 - size_div; 771 - zero_len = min(total_zero_len, zero_len); 772 - 773 - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { 774 - ret = blkdev_issue_zeroout(osb->sb->s_bdev, 775 - zero_start >> 9, zero_len >> 9, 776 - GFP_NOFS, false); 777 - if (ret < 0) { 778 - mlog_errno(ret); 779 - goto out; 780 - } 781 - } 782 - 783 - total_zero_len -= zero_len; 784 - v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); 785 - 786 - /* Only at first iteration can be cluster not aligned. 
787 - * So set size_div to 0 for the rest */ 788 - size_div = 0; 789 - } 790 - 791 - out: 792 - return ret; 793 - } 794 - 795 - static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, 796 - struct iov_iter *iter, 797 - loff_t offset) 798 - { 799 - ssize_t ret = 0; 800 - ssize_t written = 0; 801 - bool orphaned = false; 802 - int is_overwrite = 0; 803 - struct file *file = iocb->ki_filp; 804 - struct inode *inode = file_inode(file)->i_mapping->host; 805 - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 806 - struct buffer_head *di_bh = NULL; 807 - size_t count = iter->count; 808 - journal_t *journal = osb->journal->j_journal; 809 - u64 zero_len_head, zero_len_tail; 810 - int cluster_align_head, cluster_align_tail; 811 - loff_t final_size = offset + count; 812 - int append_write = offset >= i_size_read(inode) ? 1 : 0; 813 - unsigned int num_clusters = 0; 814 - unsigned int ext_flags = 0; 815 - 816 - { 817 - u64 o = offset; 818 - u64 s = i_size_read(inode); 819 - 820 - zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); 821 - cluster_align_head = !zero_len_head; 822 - 823 - zero_len_tail = osb->s_clustersize - 824 - do_div(s, osb->s_clustersize); 825 - if ((offset - i_size_read(inode)) < zero_len_tail) 826 - zero_len_tail = offset - i_size_read(inode); 827 - cluster_align_tail = !zero_len_tail; 828 - } 829 - 830 - /* 831 - * when final_size > inode->i_size, inode->i_size will be 832 - * updated after direct write, so add the inode to orphan 833 - * dir first. 
834 - */ 835 - if (final_size > i_size_read(inode)) { 836 - ret = ocfs2_add_inode_to_orphan(osb, inode); 837 - if (ret < 0) { 838 - mlog_errno(ret); 839 - goto out; 840 - } 841 - orphaned = true; 842 - } 843 - 844 - if (append_write) { 845 - ret = ocfs2_inode_lock(inode, NULL, 1); 846 - if (ret < 0) { 847 - mlog_errno(ret); 848 - goto clean_orphan; 849 - } 850 - 851 - /* zeroing out the previously allocated cluster tail 852 - * that but not zeroed */ 853 - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 854 - down_read(&OCFS2_I(inode)->ip_alloc_sem); 855 - ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, 856 - zero_len_tail, cluster_align_tail); 857 - up_read(&OCFS2_I(inode)->ip_alloc_sem); 858 - } else { 859 - down_write(&OCFS2_I(inode)->ip_alloc_sem); 860 - ret = ocfs2_direct_IO_extend_no_holes(osb, inode, 861 - offset); 862 - up_write(&OCFS2_I(inode)->ip_alloc_sem); 863 - } 864 - if (ret < 0) { 865 - mlog_errno(ret); 866 - ocfs2_inode_unlock(inode, 1); 867 - goto clean_orphan; 868 - } 869 - 870 - is_overwrite = ocfs2_is_overwrite(osb, inode, offset); 871 - if (is_overwrite < 0) { 872 - mlog_errno(is_overwrite); 873 - ret = is_overwrite; 874 - ocfs2_inode_unlock(inode, 1); 875 - goto clean_orphan; 876 - } 877 - 878 - ocfs2_inode_unlock(inode, 1); 879 - } 880 - 881 - written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, 882 - offset, ocfs2_direct_IO_get_blocks, 883 - ocfs2_dio_end_io, NULL, 0); 884 - /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ 885 - if ((written < 0) && (written != -EIOCBQUEUED)) { 886 - loff_t i_size = i_size_read(inode); 887 - 888 - if (offset + count > i_size) { 889 - ret = ocfs2_inode_lock(inode, &di_bh, 1); 890 - if (ret < 0) { 891 - mlog_errno(ret); 892 - goto clean_orphan; 893 - } 894 - 895 - if (i_size == i_size_read(inode)) { 896 - ret = ocfs2_truncate_file(inode, di_bh, 897 - i_size); 898 - if (ret < 0) { 899 - if (ret != -ENOSPC) 900 - mlog_errno(ret); 901 - 902 - ocfs2_inode_unlock(inode, 
1); 903 - brelse(di_bh); 904 - di_bh = NULL; 905 - goto clean_orphan; 906 - } 907 - } 908 - 909 - ocfs2_inode_unlock(inode, 1); 910 - brelse(di_bh); 911 - di_bh = NULL; 912 - 913 - ret = jbd2_journal_force_commit(journal); 914 - if (ret < 0) 915 - mlog_errno(ret); 916 - } 917 - } else if (written > 0 && append_write && !is_overwrite && 918 - !cluster_align_head) { 919 - /* zeroing out the allocated cluster head */ 920 - u32 p_cpos = 0; 921 - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); 922 - 923 - ret = ocfs2_inode_lock(inode, NULL, 0); 924 - if (ret < 0) { 925 - mlog_errno(ret); 926 - goto clean_orphan; 927 - } 928 - 929 - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, 930 - &num_clusters, &ext_flags); 931 - if (ret < 0) { 932 - mlog_errno(ret); 933 - ocfs2_inode_unlock(inode, 0); 934 - goto clean_orphan; 935 - } 936 - 937 - BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); 938 - 939 - ret = blkdev_issue_zeroout(osb->sb->s_bdev, 940 - (u64)p_cpos << (osb->s_clustersize_bits - 9), 941 - zero_len_head >> 9, GFP_NOFS, false); 942 - if (ret < 0) 943 - mlog_errno(ret); 944 - 945 - ocfs2_inode_unlock(inode, 0); 946 - } 947 - 948 - clean_orphan: 949 - if (orphaned) { 950 - int tmp_ret; 951 - int update_isize = written > 0 ? 1 : 0; 952 - loff_t end = update_isize ? 
offset + written : 0; 953 - 954 - tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); 955 - if (tmp_ret < 0) { 956 - ret = tmp_ret; 957 - mlog_errno(ret); 958 - goto out; 959 - } 960 - 961 - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 962 - update_isize, end); 963 - if (tmp_ret < 0) { 964 - ocfs2_inode_unlock(inode, 1); 965 - ret = tmp_ret; 966 - mlog_errno(ret); 967 - brelse(di_bh); 968 - goto out; 969 - } 970 - 971 - ocfs2_inode_unlock(inode, 1); 972 - brelse(di_bh); 973 - 974 - tmp_ret = jbd2_journal_force_commit(journal); 975 - if (tmp_ret < 0) { 976 - ret = tmp_ret; 977 - mlog_errno(tmp_ret); 978 - } 979 - } 980 - 981 - out: 982 - if (ret >= 0) 983 - ret = written; 984 - return ret; 985 - } 986 - 987 - static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 988 - loff_t offset) 989 - { 990 - struct file *file = iocb->ki_filp; 991 - struct inode *inode = file_inode(file)->i_mapping->host; 992 - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 993 - int full_coherency = !(osb->s_mount_opt & 994 - OCFS2_MOUNT_COHERENCY_BUFFERED); 995 - 996 - /* 997 - * Fallback to buffered I/O if we see an inode without 998 - * extents. 999 - */ 1000 - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1001 - return 0; 1002 - 1003 - /* Fallback to buffered I/O if we are appending and 1004 - * concurrent O_DIRECT writes are allowed. 
1005 - */ 1006 - if (i_size_read(inode) <= offset && !full_coherency) 1007 - return 0; 1008 - 1009 - if (iov_iter_rw(iter) == READ) 1010 - return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, 1011 - iter, offset, 1012 - ocfs2_direct_IO_get_blocks, 1013 - ocfs2_dio_end_io, NULL, 0); 1014 - else 1015 - return ocfs2_direct_IO_write(iocb, iter, offset); 1016 507 } 1017 508 1018 509 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, ··· 692 1201 693 1202 #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) 694 1203 1204 + struct ocfs2_unwritten_extent { 1205 + struct list_head ue_node; 1206 + struct list_head ue_ip_node; 1207 + u32 ue_cpos; 1208 + u32 ue_phys; 1209 + }; 1210 + 695 1211 /* 696 1212 * Describe the state of a single cluster to be written to. 697 1213 */ ··· 710 1212 * filled. 711 1213 */ 712 1214 unsigned c_new; 713 - unsigned c_unwritten; 1215 + unsigned c_clear_unwritten; 714 1216 unsigned c_needs_zero; 715 1217 }; 716 1218 ··· 721 1223 722 1224 /* First cluster allocated in a nonsparse extend */ 723 1225 u32 w_first_new_cpos; 1226 + 1227 + /* Type of caller. Must be one of buffer, mmap, direct. 
*/ 1228 + ocfs2_write_type_t w_type; 724 1229 725 1230 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; 726 1231 ··· 773 1272 struct buffer_head *w_di_bh; 774 1273 775 1274 struct ocfs2_cached_dealloc_ctxt w_dealloc; 1275 + 1276 + struct list_head w_unwritten_list; 776 1277 }; 777 1278 778 1279 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) ··· 813 1310 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 814 1311 } 815 1312 816 - static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 1313 + static void ocfs2_free_unwritten_list(struct inode *inode, 1314 + struct list_head *head) 817 1315 { 1316 + struct ocfs2_inode_info *oi = OCFS2_I(inode); 1317 + struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL; 1318 + 1319 + list_for_each_entry_safe(ue, tmp, head, ue_node) { 1320 + list_del(&ue->ue_node); 1321 + spin_lock(&oi->ip_lock); 1322 + list_del(&ue->ue_ip_node); 1323 + spin_unlock(&oi->ip_lock); 1324 + kfree(ue); 1325 + } 1326 + } 1327 + 1328 + static void ocfs2_free_write_ctxt(struct inode *inode, 1329 + struct ocfs2_write_ctxt *wc) 1330 + { 1331 + ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); 818 1332 ocfs2_unlock_pages(wc); 819 1333 brelse(wc->w_di_bh); 820 1334 kfree(wc); ··· 839 1319 840 1320 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, 841 1321 struct ocfs2_super *osb, loff_t pos, 842 - unsigned len, struct buffer_head *di_bh) 1322 + unsigned len, ocfs2_write_type_t type, 1323 + struct buffer_head *di_bh) 843 1324 { 844 1325 u32 cend; 845 1326 struct ocfs2_write_ctxt *wc; ··· 855 1334 wc->w_clen = cend - wc->w_cpos + 1; 856 1335 get_bh(di_bh); 857 1336 wc->w_di_bh = di_bh; 1337 + wc->w_type = type; 858 1338 859 1339 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 860 1340 wc->w_large_pages = 1; ··· 863 1341 wc->w_large_pages = 0; 864 1342 865 1343 ocfs2_init_dealloc_ctxt(&wc->w_dealloc); 1344 + INIT_LIST_HEAD(&wc->w_unwritten_list); 866 1345 867 1346 *wcp = wc; 
868 1347 ··· 924 1401 to = user_pos + user_len; 925 1402 struct page *tmppage; 926 1403 927 - ocfs2_zero_new_buffers(wc->w_target_page, from, to); 1404 + if (wc->w_target_page) 1405 + ocfs2_zero_new_buffers(wc->w_target_page, from, to); 928 1406 929 1407 for(i = 0; i < wc->w_num_pages; i++) { 930 1408 tmppage = wc->w_pages[i]; 931 1409 932 - if (page_has_buffers(tmppage)) { 1410 + if (tmppage && page_has_buffers(tmppage)) { 933 1411 if (ocfs2_should_order_data(inode)) 934 1412 ocfs2_jbd2_file_inode(wc->w_handle, inode); 935 1413 ··· 1060 1536 wc->w_num_pages = 1; 1061 1537 start = target_index; 1062 1538 } 1539 + end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT; 1063 1540 1064 1541 for(i = 0; i < wc->w_num_pages; i++) { 1065 1542 index = start + i; 1066 1543 1067 - if (index == target_index && mmap_page) { 1544 + if (index >= target_index && index <= end_index && 1545 + wc->w_type == OCFS2_WRITE_MMAP) { 1068 1546 /* 1069 1547 * ocfs2_pagemkwrite() is a little different 1070 1548 * and wants us to directly use the page ··· 1085 1559 page_cache_get(mmap_page); 1086 1560 wc->w_pages[i] = mmap_page; 1087 1561 wc->w_target_locked = true; 1562 + } else if (index >= target_index && index <= end_index && 1563 + wc->w_type == OCFS2_WRITE_DIRECT) { 1564 + /* Direct write has no mapping page. */ 1565 + wc->w_pages[i] = NULL; 1566 + continue; 1088 1567 } else { 1089 1568 wc->w_pages[i] = find_or_create_page(mapping, index, 1090 1569 GFP_NOFS); ··· 1114 1583 * Prepare a single cluster for write one cluster into the file. 
1115 1584 */ 1116 1585 static int ocfs2_write_cluster(struct address_space *mapping, 1117 - u32 phys, unsigned int unwritten, 1586 + u32 *phys, unsigned int new, 1587 + unsigned int clear_unwritten, 1118 1588 unsigned int should_zero, 1119 1589 struct ocfs2_alloc_context *data_ac, 1120 1590 struct ocfs2_alloc_context *meta_ac, 1121 1591 struct ocfs2_write_ctxt *wc, u32 cpos, 1122 1592 loff_t user_pos, unsigned user_len) 1123 1593 { 1124 - int ret, i, new; 1125 - u64 v_blkno, p_blkno; 1594 + int ret, i; 1595 + u64 p_blkno; 1126 1596 struct inode *inode = mapping->host; 1127 1597 struct ocfs2_extent_tree et; 1598 + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 1128 1599 1129 - new = phys == 0 ? 1 : 0; 1130 1600 if (new) { 1131 1601 u32 tmp_pos; 1132 1602 ··· 1137 1605 */ 1138 1606 tmp_pos = cpos; 1139 1607 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, 1140 - &tmp_pos, 1, 0, wc->w_di_bh, 1141 - wc->w_handle, data_ac, 1142 - meta_ac, NULL); 1608 + &tmp_pos, 1, !clear_unwritten, 1609 + wc->w_di_bh, wc->w_handle, 1610 + data_ac, meta_ac, NULL); 1143 1611 /* 1144 1612 * This shouldn't happen because we must have already 1145 1613 * calculated the correct meta data allocation required. The ··· 1156 1624 mlog_errno(ret); 1157 1625 goto out; 1158 1626 } 1159 - } else if (unwritten) { 1627 + } else if (clear_unwritten) { 1160 1628 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1161 1629 wc->w_di_bh); 1162 1630 ret = ocfs2_mark_extent_written(inode, &et, 1163 - wc->w_handle, cpos, 1, phys, 1631 + wc->w_handle, cpos, 1, *phys, 1164 1632 meta_ac, &wc->w_dealloc); 1165 1633 if (ret < 0) { 1166 1634 mlog_errno(ret); ··· 1168 1636 } 1169 1637 } 1170 1638 1171 - if (should_zero) 1172 - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); 1173 - else 1174 - v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; 1175 - 1176 1639 /* 1177 1640 * The only reason this should fail is due to an inability to 1178 1641 * find the extent added. 
1179 1642 */ 1180 - ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1181 - NULL); 1643 + ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); 1182 1644 if (ret < 0) { 1183 1645 mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " 1184 - "at logical block %llu", 1185 - (unsigned long long)OCFS2_I(inode)->ip_blkno, 1186 - (unsigned long long)v_blkno); 1646 + "at logical cluster %u", 1647 + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 1187 1648 goto out; 1188 1649 } 1189 1650 1190 - BUG_ON(p_blkno == 0); 1651 + BUG_ON(*phys == 0); 1652 + 1653 + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); 1654 + if (!should_zero) 1655 + p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); 1191 1656 1192 1657 for(i = 0; i < wc->w_num_pages; i++) { 1193 1658 int tmpret; 1659 + 1660 + /* This is the direct io target page. */ 1661 + if (wc->w_pages[i] == NULL) { 1662 + p_blkno++; 1663 + continue; 1664 + } 1194 1665 1195 1666 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, 1196 1667 wc->w_pages[i], cpos, ··· 1241 1706 if ((cluster_off + local_len) > osb->s_clustersize) 1242 1707 local_len = osb->s_clustersize - cluster_off; 1243 1708 1244 - ret = ocfs2_write_cluster(mapping, desc->c_phys, 1245 - desc->c_unwritten, 1709 + ret = ocfs2_write_cluster(mapping, &desc->c_phys, 1710 + desc->c_new, 1711 + desc->c_clear_unwritten, 1246 1712 desc->c_needs_zero, 1247 1713 data_ac, meta_ac, 1248 1714 wc, desc->c_cpos, pos, local_len); ··· 1311 1775 wc->w_target_from = 0; 1312 1776 wc->w_target_to = PAGE_CACHE_SIZE; 1313 1777 } 1778 + } 1779 + 1780 + /* 1781 + * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to 1782 + * do the zero work. And should not to clear UNWRITTEN since it will be cleared 1783 + * by the direct io procedure. 1784 + * If this is a new extent that allocated by direct io, we should mark it in 1785 + * the ip_unwritten_list. 
1786 + */ 1787 + static int ocfs2_unwritten_check(struct inode *inode, 1788 + struct ocfs2_write_ctxt *wc, 1789 + struct ocfs2_write_cluster_desc *desc) 1790 + { 1791 + struct ocfs2_inode_info *oi = OCFS2_I(inode); 1792 + struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; 1793 + int ret = 0; 1794 + 1795 + if (!desc->c_needs_zero) 1796 + return 0; 1797 + 1798 + retry: 1799 + spin_lock(&oi->ip_lock); 1800 + /* Needs not to zero no metter buffer or direct. The one who is zero 1801 + * the cluster is doing zero. And he will clear unwritten after all 1802 + * cluster io finished. */ 1803 + list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { 1804 + if (desc->c_cpos == ue->ue_cpos) { 1805 + BUG_ON(desc->c_new); 1806 + desc->c_needs_zero = 0; 1807 + desc->c_clear_unwritten = 0; 1808 + goto unlock; 1809 + } 1810 + } 1811 + 1812 + if (wc->w_type != OCFS2_WRITE_DIRECT) 1813 + goto unlock; 1814 + 1815 + if (new == NULL) { 1816 + spin_unlock(&oi->ip_lock); 1817 + new = kmalloc(sizeof(struct ocfs2_unwritten_extent), 1818 + GFP_NOFS); 1819 + if (new == NULL) { 1820 + ret = -ENOMEM; 1821 + goto out; 1822 + } 1823 + goto retry; 1824 + } 1825 + /* This direct write will doing zero. 
*/ 1826 + new->ue_cpos = desc->c_cpos; 1827 + new->ue_phys = desc->c_phys; 1828 + desc->c_clear_unwritten = 0; 1829 + list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); 1830 + list_add_tail(&new->ue_node, &wc->w_unwritten_list); 1831 + new = NULL; 1832 + unlock: 1833 + spin_unlock(&oi->ip_lock); 1834 + out: 1835 + if (new) 1836 + kfree(new); 1837 + return ret; 1314 1838 } 1315 1839 1316 1840 /* ··· 1448 1852 if (phys == 0) { 1449 1853 desc->c_new = 1; 1450 1854 desc->c_needs_zero = 1; 1855 + desc->c_clear_unwritten = 1; 1451 1856 *clusters_to_alloc = *clusters_to_alloc + 1; 1452 1857 } 1453 1858 1454 1859 if (ext_flags & OCFS2_EXT_UNWRITTEN) { 1455 - desc->c_unwritten = 1; 1860 + desc->c_clear_unwritten = 1; 1456 1861 desc->c_needs_zero = 1; 1862 + } 1863 + 1864 + ret = ocfs2_unwritten_check(inode, wc, desc); 1865 + if (ret) { 1866 + mlog_errno(ret); 1867 + goto out; 1457 1868 } 1458 1869 1459 1870 num_clusters--; ··· 1625 2022 if (ret) 1626 2023 mlog_errno(ret); 1627 2024 1628 - wc->w_first_new_cpos = 1629 - ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 2025 + /* There is no wc if this is call from direct. 
*/ 2026 + if (wc) 2027 + wc->w_first_new_cpos = 2028 + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); 1630 2029 1631 2030 return ret; 1632 2031 } ··· 1682 2077 return ret; 1683 2078 } 1684 2079 1685 - int ocfs2_write_begin_nolock(struct file *filp, 1686 - struct address_space *mapping, 1687 - loff_t pos, unsigned len, unsigned flags, 2080 + int ocfs2_write_begin_nolock(struct address_space *mapping, 2081 + loff_t pos, unsigned len, ocfs2_write_type_t type, 1688 2082 struct page **pagep, void **fsdata, 1689 2083 struct buffer_head *di_bh, struct page *mmap_page) 1690 2084 { ··· 1700 2096 int try_free = 1, ret1; 1701 2097 1702 2098 try_again: 1703 - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 2099 + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); 1704 2100 if (ret) { 1705 2101 mlog_errno(ret); 1706 2102 return ret; ··· 1719 2115 } 1720 2116 } 1721 2117 1722 - if (ocfs2_sparse_alloc(osb)) 1723 - ret = ocfs2_zero_tail(inode, di_bh, pos); 1724 - else 1725 - ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, 1726 - wc); 1727 - if (ret) { 1728 - mlog_errno(ret); 1729 - goto out; 2118 + /* Direct io change i_size late, should not zero tail here. 
*/ 2119 + if (type != OCFS2_WRITE_DIRECT) { 2120 + if (ocfs2_sparse_alloc(osb)) 2121 + ret = ocfs2_zero_tail(inode, di_bh, pos); 2122 + else 2123 + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, 2124 + len, wc); 2125 + if (ret) { 2126 + mlog_errno(ret); 2127 + goto out; 2128 + } 1730 2129 } 1731 2130 1732 2131 ret = ocfs2_check_range_for_refcount(inode, pos, len); ··· 1760 2153 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1761 2154 (long long)i_size_read(inode), 1762 2155 le32_to_cpu(di->i_clusters), 1763 - pos, len, flags, mmap_page, 2156 + pos, len, type, mmap_page, 1764 2157 clusters_to_alloc, extents_to_split); 1765 2158 1766 2159 /* ··· 1790 2183 1791 2184 credits = ocfs2_calc_extend_credits(inode->i_sb, 1792 2185 &di->id2.i_list); 1793 - 1794 - } 2186 + } else if (type == OCFS2_WRITE_DIRECT) 2187 + /* direct write needs not to start trans if no extents alloc. */ 2188 + goto success; 1795 2189 1796 2190 /* 1797 2191 * We have to zero sparse allocated clusters, unwritten extent clusters, 1798 2192 * and non-sparse clusters we just extended. For non-sparse writes, 1799 2193 * we know zeros will only be needed in the first and/or last cluster. 
1800 2194 */ 1801 - if (clusters_to_alloc || extents_to_split || 1802 - (wc->w_clen && (wc->w_desc[0].c_needs_zero || 1803 - wc->w_desc[wc->w_clen - 1].c_needs_zero))) 2195 + if (wc->w_clen && (wc->w_desc[0].c_needs_zero || 2196 + wc->w_desc[wc->w_clen - 1].c_needs_zero)) 1804 2197 cluster_of_pages = 1; 1805 2198 else 1806 2199 cluster_of_pages = 0; ··· 1867 2260 ocfs2_free_alloc_context(meta_ac); 1868 2261 1869 2262 success: 1870 - *pagep = wc->w_target_page; 2263 + if (pagep) 2264 + *pagep = wc->w_target_page; 1871 2265 *fsdata = wc; 1872 2266 return 0; 1873 2267 out_quota: ··· 1879 2271 ocfs2_commit_trans(osb, handle); 1880 2272 1881 2273 out: 1882 - ocfs2_free_write_ctxt(wc); 2274 + ocfs2_free_write_ctxt(inode, wc); 1883 2275 1884 2276 if (data_ac) { 1885 2277 ocfs2_free_alloc_context(data_ac); ··· 1931 2323 */ 1932 2324 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1933 2325 1934 - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, 1935 - fsdata, di_bh, NULL); 2326 + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, 2327 + pagep, fsdata, di_bh, NULL); 1936 2328 if (ret) { 1937 2329 mlog_errno(ret); 1938 2330 goto out_fail; ··· 1989 2381 handle_t *handle = wc->w_handle; 1990 2382 struct page *tmppage; 1991 2383 1992 - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1993 - OCFS2_JOURNAL_ACCESS_WRITE); 1994 - if (ret) { 1995 - copied = ret; 1996 - mlog_errno(ret); 1997 - goto out; 2384 + BUG_ON(!list_empty(&wc->w_unwritten_list)); 2385 + 2386 + if (handle) { 2387 + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 2388 + wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2389 + if (ret) { 2390 + copied = ret; 2391 + mlog_errno(ret); 2392 + goto out; 2393 + } 1998 2394 } 1999 2395 2000 2396 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ··· 2006 2394 goto out_write_size; 2007 2395 } 2008 2396 2009 - if (unlikely(copied < len)) { 2397 + if (unlikely(copied < len) && wc->w_target_page) { 2010 
2398 if (!PageUptodate(wc->w_target_page)) 2011 2399 copied = 0; 2012 2400 2013 2401 ocfs2_zero_new_buffers(wc->w_target_page, start+copied, 2014 2402 start+len); 2015 2403 } 2016 - flush_dcache_page(wc->w_target_page); 2404 + if (wc->w_target_page) 2405 + flush_dcache_page(wc->w_target_page); 2017 2406 2018 2407 for(i = 0; i < wc->w_num_pages; i++) { 2019 2408 tmppage = wc->w_pages[i]; 2409 + 2410 + /* This is the direct io target page. */ 2411 + if (tmppage == NULL) 2412 + continue; 2020 2413 2021 2414 if (tmppage == wc->w_target_page) { 2022 2415 from = wc->w_target_from; ··· 2041 2424 } 2042 2425 2043 2426 if (page_has_buffers(tmppage)) { 2044 - if (ocfs2_should_order_data(inode)) 2045 - ocfs2_jbd2_file_inode(wc->w_handle, inode); 2427 + if (handle && ocfs2_should_order_data(inode)) 2428 + ocfs2_jbd2_file_inode(handle, inode); 2046 2429 block_commit_write(tmppage, from, to); 2047 2430 } 2048 2431 } 2049 2432 2050 2433 out_write_size: 2051 - pos += copied; 2052 - if (pos > i_size_read(inode)) { 2053 - i_size_write(inode, pos); 2054 - mark_inode_dirty(inode); 2434 + /* Direct io do not update i_size here. 
*/ 2435 + if (wc->w_type != OCFS2_WRITE_DIRECT) { 2436 + pos += copied; 2437 + if (pos > i_size_read(inode)) { 2438 + i_size_write(inode, pos); 2439 + mark_inode_dirty(inode); 2440 + } 2441 + inode->i_blocks = ocfs2_inode_sector_count(inode); 2442 + di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2443 + inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2444 + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2445 + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2446 + ocfs2_update_inode_fsync_trans(handle, inode, 1); 2055 2447 } 2056 - inode->i_blocks = ocfs2_inode_sector_count(inode); 2057 - di->i_size = cpu_to_le64((u64)i_size_read(inode)); 2058 - inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2059 - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 2060 - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 2061 - ocfs2_update_inode_fsync_trans(handle, inode, 1); 2062 - ocfs2_journal_dirty(handle, wc->w_di_bh); 2448 + if (handle) 2449 + ocfs2_journal_dirty(handle, wc->w_di_bh); 2063 2450 2064 2451 out: 2065 2452 /* unlock pages before dealloc since it needs acquiring j_trans_barrier ··· 2073 2452 */ 2074 2453 ocfs2_unlock_pages(wc); 2075 2454 2076 - ocfs2_commit_trans(osb, handle); 2455 + if (handle) 2456 + ocfs2_commit_trans(osb, handle); 2077 2457 2078 2458 ocfs2_run_deallocs(osb, &wc->w_dealloc); 2079 2459 ··· 2097 2475 ocfs2_inode_unlock(inode, 1); 2098 2476 2099 2477 return ret; 2478 + } 2479 + 2480 + struct ocfs2_dio_write_ctxt { 2481 + struct list_head dw_zero_list; 2482 + unsigned dw_zero_count; 2483 + int dw_orphaned; 2484 + pid_t dw_writer_pid; 2485 + }; 2486 + 2487 + static struct ocfs2_dio_write_ctxt * 2488 + ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) 2489 + { 2490 + struct ocfs2_dio_write_ctxt *dwc = NULL; 2491 + 2492 + if (bh->b_private) 2493 + return bh->b_private; 2494 + 2495 + dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); 2496 + if 
(dwc == NULL) 2497 + return NULL; 2498 + INIT_LIST_HEAD(&dwc->dw_zero_list); 2499 + dwc->dw_zero_count = 0; 2500 + dwc->dw_orphaned = 0; 2501 + dwc->dw_writer_pid = task_pid_nr(current); 2502 + bh->b_private = dwc; 2503 + *alloc = 1; 2504 + 2505 + return dwc; 2506 + } 2507 + 2508 + static void ocfs2_dio_free_write_ctx(struct inode *inode, 2509 + struct ocfs2_dio_write_ctxt *dwc) 2510 + { 2511 + ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); 2512 + kfree(dwc); 2513 + } 2514 + 2515 + /* 2516 + * TODO: Make this into a generic get_blocks function. 2517 + * 2518 + * From do_direct_io in direct-io.c: 2519 + * "So what we do is to permit the ->get_blocks function to populate 2520 + * bh.b_size with the size of IO which is permitted at this offset and 2521 + * this i_blkbits." 2522 + * 2523 + * This function is called directly from get_more_blocks in direct-io.c. 2524 + * 2525 + * called like this: dio->get_blocks(dio->inode, fs_startblk, 2526 + * fs_count, map_bh, dio->rw == WRITE); 2527 + */ 2528 + static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, 2529 + struct buffer_head *bh_result, int create) 2530 + { 2531 + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2532 + struct ocfs2_inode_info *oi = OCFS2_I(inode); 2533 + struct ocfs2_write_ctxt *wc; 2534 + struct ocfs2_write_cluster_desc *desc = NULL; 2535 + struct ocfs2_dio_write_ctxt *dwc = NULL; 2536 + struct buffer_head *di_bh = NULL; 2537 + u64 p_blkno; 2538 + loff_t pos = iblock << inode->i_sb->s_blocksize_bits; 2539 + unsigned len, total_len = bh_result->b_size; 2540 + int ret = 0, first_get_block = 0; 2541 + 2542 + len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); 2543 + len = min(total_len, len); 2544 + 2545 + mlog(0, "get block of %lu at %llu:%u req %u\n", 2546 + inode->i_ino, pos, len, total_len); 2547 + 2548 + /* 2549 + * Because we need to change file size in ocfs2_dio_end_io_write(), or 2550 + * we may need to add it to orphan dir. 
So can not fall to fast path 2551 + * while file size will be changed. 2552 + */ 2553 + if (pos + total_len <= i_size_read(inode)) { 2554 + down_read(&oi->ip_alloc_sem); 2555 + /* This is the fast path for re-write. */ 2556 + ret = ocfs2_get_block(inode, iblock, bh_result, create); 2557 + 2558 + up_read(&oi->ip_alloc_sem); 2559 + 2560 + if (buffer_mapped(bh_result) && 2561 + !buffer_new(bh_result) && 2562 + ret == 0) 2563 + goto out; 2564 + 2565 + /* Clear state set by ocfs2_get_block. */ 2566 + bh_result->b_state = 0; 2567 + } 2568 + 2569 + dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); 2570 + if (unlikely(dwc == NULL)) { 2571 + ret = -ENOMEM; 2572 + mlog_errno(ret); 2573 + goto out; 2574 + } 2575 + 2576 + if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > 2577 + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && 2578 + !dwc->dw_orphaned) { 2579 + /* 2580 + * when we are going to alloc extents beyond file size, add the 2581 + * inode to orphan dir, so we can recall those spaces when 2582 + * system crashed during write. 
2583 + */ 2584 + ret = ocfs2_add_inode_to_orphan(osb, inode); 2585 + if (ret < 0) { 2586 + mlog_errno(ret); 2587 + goto out; 2588 + } 2589 + dwc->dw_orphaned = 1; 2590 + } 2591 + 2592 + ret = ocfs2_inode_lock(inode, &di_bh, 1); 2593 + if (ret) { 2594 + mlog_errno(ret); 2595 + goto out; 2596 + } 2597 + 2598 + down_write(&oi->ip_alloc_sem); 2599 + 2600 + if (first_get_block) { 2601 + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 2602 + ret = ocfs2_zero_tail(inode, di_bh, pos); 2603 + else 2604 + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, 2605 + total_len, NULL); 2606 + if (ret < 0) { 2607 + mlog_errno(ret); 2608 + goto unlock; 2609 + } 2610 + } 2611 + 2612 + ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, 2613 + OCFS2_WRITE_DIRECT, NULL, 2614 + (void **)&wc, di_bh, NULL); 2615 + if (ret) { 2616 + mlog_errno(ret); 2617 + goto unlock; 2618 + } 2619 + 2620 + desc = &wc->w_desc[0]; 2621 + 2622 + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); 2623 + BUG_ON(p_blkno == 0); 2624 + p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); 2625 + 2626 + map_bh(bh_result, inode->i_sb, p_blkno); 2627 + bh_result->b_size = len; 2628 + if (desc->c_needs_zero) 2629 + set_buffer_new(bh_result); 2630 + 2631 + /* May sleep in end_io. It should not happen in a irq context. So defer 2632 + * it to dio work queue. */ 2633 + set_buffer_defer_completion(bh_result); 2634 + 2635 + if (!list_empty(&wc->w_unwritten_list)) { 2636 + struct ocfs2_unwritten_extent *ue = NULL; 2637 + 2638 + ue = list_first_entry(&wc->w_unwritten_list, 2639 + struct ocfs2_unwritten_extent, 2640 + ue_node); 2641 + BUG_ON(ue->ue_cpos != desc->c_cpos); 2642 + /* The physical address may be 0, fill it. 
*/ 2643 + ue->ue_phys = desc->c_phys; 2644 + 2645 + list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); 2646 + dwc->dw_zero_count++; 2647 + } 2648 + 2649 + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); 2650 + BUG_ON(ret != len); 2651 + ret = 0; 2652 + unlock: 2653 + up_write(&oi->ip_alloc_sem); 2654 + ocfs2_inode_unlock(inode, 1); 2655 + brelse(di_bh); 2656 + out: 2657 + if (ret < 0) 2658 + ret = -EIO; 2659 + return ret; 2660 + } 2661 + 2662 + static void ocfs2_dio_end_io_write(struct inode *inode, 2663 + struct ocfs2_dio_write_ctxt *dwc, 2664 + loff_t offset, 2665 + ssize_t bytes) 2666 + { 2667 + struct ocfs2_cached_dealloc_ctxt dealloc; 2668 + struct ocfs2_extent_tree et; 2669 + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2670 + struct ocfs2_inode_info *oi = OCFS2_I(inode); 2671 + struct ocfs2_unwritten_extent *ue = NULL; 2672 + struct buffer_head *di_bh = NULL; 2673 + struct ocfs2_dinode *di; 2674 + struct ocfs2_alloc_context *data_ac = NULL; 2675 + struct ocfs2_alloc_context *meta_ac = NULL; 2676 + handle_t *handle = NULL; 2677 + loff_t end = offset + bytes; 2678 + int ret = 0, credits = 0, locked = 0; 2679 + 2680 + ocfs2_init_dealloc_ctxt(&dealloc); 2681 + 2682 + /* We do clear unwritten, delete orphan, change i_size here. If neither 2683 + * of these happen, we can skip all this. */ 2684 + if (list_empty(&dwc->dw_zero_list) && 2685 + end <= i_size_read(inode) && 2686 + !dwc->dw_orphaned) 2687 + goto out; 2688 + 2689 + /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we 2690 + * are in that context. */ 2691 + if (dwc->dw_writer_pid != task_pid_nr(current)) { 2692 + mutex_lock(&inode->i_mutex); 2693 + locked = 1; 2694 + } 2695 + 2696 + ret = ocfs2_inode_lock(inode, &di_bh, 1); 2697 + if (ret < 0) { 2698 + mlog_errno(ret); 2699 + goto out; 2700 + } 2701 + 2702 + down_write(&oi->ip_alloc_sem); 2703 + 2704 + /* Delete orphan before acquire i_mutex. 
*/ 2705 + if (dwc->dw_orphaned) { 2706 + BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); 2707 + 2708 + end = end > i_size_read(inode) ? end : 0; 2709 + 2710 + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 2711 + !!end, end); 2712 + if (ret < 0) 2713 + mlog_errno(ret); 2714 + } 2715 + 2716 + di = (struct ocfs2_dinode *)di_bh; 2717 + 2718 + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 2719 + 2720 + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, 2721 + &data_ac, &meta_ac); 2722 + if (ret) { 2723 + mlog_errno(ret); 2724 + goto unlock; 2725 + } 2726 + 2727 + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); 2728 + 2729 + handle = ocfs2_start_trans(osb, credits); 2730 + if (IS_ERR(handle)) { 2731 + ret = PTR_ERR(handle); 2732 + mlog_errno(ret); 2733 + goto unlock; 2734 + } 2735 + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 2736 + OCFS2_JOURNAL_ACCESS_WRITE); 2737 + if (ret) { 2738 + mlog_errno(ret); 2739 + goto commit; 2740 + } 2741 + 2742 + list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { 2743 + ret = ocfs2_mark_extent_written(inode, &et, handle, 2744 + ue->ue_cpos, 1, 2745 + ue->ue_phys, 2746 + meta_ac, &dealloc); 2747 + if (ret < 0) { 2748 + mlog_errno(ret); 2749 + break; 2750 + } 2751 + } 2752 + 2753 + if (end > i_size_read(inode)) { 2754 + ret = ocfs2_set_inode_size(handle, inode, di_bh, end); 2755 + if (ret < 0) 2756 + mlog_errno(ret); 2757 + } 2758 + commit: 2759 + ocfs2_commit_trans(osb, handle); 2760 + unlock: 2761 + up_write(&oi->ip_alloc_sem); 2762 + ocfs2_inode_unlock(inode, 1); 2763 + brelse(di_bh); 2764 + out: 2765 + if (data_ac) 2766 + ocfs2_free_alloc_context(data_ac); 2767 + if (meta_ac) 2768 + ocfs2_free_alloc_context(meta_ac); 2769 + ocfs2_run_deallocs(osb, &dealloc); 2770 + if (locked) 2771 + mutex_unlock(&inode->i_mutex); 2772 + ocfs2_dio_free_write_ctx(inode, dwc); 2773 + } 2774 + 2775 + /* 2776 + * ocfs2_dio_end_io is called by the dio core when a dio is 
finished. We're 2777 + * particularly interested in the aio/dio case. We use the rw_lock DLM lock 2778 + * to protect io on one node from truncation on another. 2779 + */ 2780 + static int ocfs2_dio_end_io(struct kiocb *iocb, 2781 + loff_t offset, 2782 + ssize_t bytes, 2783 + void *private) 2784 + { 2785 + struct inode *inode = file_inode(iocb->ki_filp); 2786 + int level; 2787 + 2788 + if (bytes <= 0) 2789 + return 0; 2790 + 2791 + /* this io's submitter should not have unlocked this before we could */ 2792 + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 2793 + 2794 + if (private) 2795 + ocfs2_dio_end_io_write(inode, private, offset, bytes); 2796 + 2797 + ocfs2_iocb_clear_rw_locked(iocb); 2798 + 2799 + level = ocfs2_iocb_rw_locked_level(iocb); 2800 + ocfs2_rw_unlock(inode, level); 2801 + return 0; 2802 + } 2803 + 2804 + static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 2805 + loff_t offset) 2806 + { 2807 + struct file *file = iocb->ki_filp; 2808 + struct inode *inode = file_inode(file)->i_mapping->host; 2809 + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2810 + loff_t end = offset + iter->count; 2811 + get_block_t *get_block; 2812 + 2813 + /* 2814 + * Fallback to buffered I/O if we see an inode without 2815 + * extents. 2816 + */ 2817 + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 2818 + return 0; 2819 + 2820 + /* Fallback to buffered I/O if we do not support append dio. */ 2821 + if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) 2822 + return 0; 2823 + 2824 + if (iov_iter_rw(iter) == READ) 2825 + get_block = ocfs2_get_block; 2826 + else 2827 + get_block = ocfs2_dio_get_block; 2828 + 2829 + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, 2830 + iter, offset, get_block, 2831 + ocfs2_dio_end_io, NULL, 0); 2100 2832 } 2101 2833 2102 2834 const struct address_space_operations ocfs2_aops = {
+8 -11
fs/ocfs2/aops.h
··· 47 47 loff_t pos, unsigned len, unsigned copied, 48 48 struct page *page, void *fsdata); 49 49 50 - int ocfs2_write_begin_nolock(struct file *filp, 51 - struct address_space *mapping, 52 - loff_t pos, unsigned len, unsigned flags, 50 + typedef enum { 51 + OCFS2_WRITE_BUFFER = 0, 52 + OCFS2_WRITE_DIRECT, 53 + OCFS2_WRITE_MMAP, 54 + } ocfs2_write_type_t; 55 + 56 + int ocfs2_write_begin_nolock(struct address_space *mapping, 57 + loff_t pos, unsigned len, ocfs2_write_type_t type, 53 58 struct page **pagep, void **fsdata, 54 59 struct buffer_head *di_bh, struct page *mmap_page); 55 60 ··· 84 79 enum ocfs2_iocb_lock_bits { 85 80 OCFS2_IOCB_RW_LOCK = 0, 86 81 OCFS2_IOCB_RW_LOCK_LEVEL, 87 - OCFS2_IOCB_UNALIGNED_IO, 88 82 OCFS2_IOCB_NUM_LOCKS 89 83 }; 90 84 ··· 91 87 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) 92 88 #define ocfs2_iocb_rw_locked_level(iocb) \ 93 89 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) 94 - 95 - #define ocfs2_iocb_set_unaligned_aio(iocb) \ 96 - set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 97 - #define ocfs2_iocb_clear_unaligned_aio(iocb) \ 98 - clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 99 - #define ocfs2_iocb_is_unaligned_aio(iocb) \ 100 - test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) 101 90 102 91 #endif /* OCFS2_FILE_H */
+2 -2
fs/ocfs2/cluster/heartbeat.c
··· 1444 1444 debugfs_remove(reg->hr_debug_dir); 1445 1445 kfree(reg->hr_db_livenodes); 1446 1446 kfree(reg->hr_db_regnum); 1447 - kfree(reg->hr_debug_elapsed_time); 1448 - kfree(reg->hr_debug_pinned); 1447 + kfree(reg->hr_db_elapsed_time); 1448 + kfree(reg->hr_db_pinned); 1449 1449 1450 1450 spin_lock(&o2hb_live_lock); 1451 1451 list_del(&reg->hr_all_item);
+29 -1
fs/ocfs2/dlm/dlmconvert.c
··· 212 212 if (lock->lksb->flags & DLM_LKSB_PUT_LVB) 213 213 memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); 214 214 215 + /* 216 + * Move the lock to the tail because it may be the only lock which has 217 + * an invalid lvb. 218 + */ 219 + list_move_tail(&lock->list, &res->granted); 220 + 215 221 status = DLM_NORMAL; 216 222 *call_ast = 1; 217 223 goto unlock_exit; ··· 268 262 struct dlm_lock *lock, int flags, int type) 269 263 { 270 264 enum dlm_status status; 265 + u8 old_owner = res->owner; 271 266 272 267 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 273 268 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); ··· 294 287 status = DLM_DENIED; 295 288 goto bail; 296 289 } 290 + 291 + if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { 292 + mlog(0, "last convert request returned DLM_RECOVERING, but " 293 + "owner has already queued and sent ast to me. res %.*s, " 294 + "(cookie=%u:%llu, type=%d, conv=%d)\n", 295 + res->lockname.len, res->lockname.name, 296 + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 297 + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), 298 + lock->ml.type, lock->ml.convert_type); 299 + status = DLM_NORMAL; 300 + goto bail; 301 + } 302 + 297 303 res->state |= DLM_LOCK_RES_IN_PROGRESS; 298 304 /* move lock to local convert queue */ 299 305 /* do not alter lock refcount. switching lists. */ ··· 336 316 spin_lock(&res->spinlock); 337 317 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 338 318 lock->convert_pending = 0; 339 - /* if it failed, move it back to granted queue */ 319 + /* if it failed, move it back to granted queue. 
320 + * if master returns DLM_NORMAL and then down before sending ast, 321 + * it may have already been moved to granted queue, reset to 322 + * DLM_RECOVERING and retry convert */ 340 323 if (status != DLM_NORMAL) { 341 324 if (status != DLM_NOTQUEUED) 342 325 dlm_error(status); 343 326 dlm_revert_pending_convert(res, lock); 327 + } else if ((res->state & DLM_LOCK_RES_RECOVERING) || 328 + (old_owner != res->owner)) { 329 + mlog(0, "res %.*s is in recovering or has been recovered.\n", 330 + res->lockname.len, res->lockname.name); 331 + status = DLM_RECOVERING; 344 332 } 345 333 bail: 346 334 spin_unlock(&res->spinlock);
-1
fs/ocfs2/dlm/dlmrecovery.c
··· 2083 2083 dlm_lock_get(lock); 2084 2084 if (lock->convert_pending) { 2085 2085 /* move converting lock back to granted */ 2086 - BUG_ON(i != DLM_CONVERTING_LIST); 2087 2086 mlog(0, "node died with convert pending " 2088 2087 "on %.*s. move back to granted list.\n", 2089 2088 res->lockname.len, res->lockname.name);
+15 -150
fs/ocfs2/file.c
··· 1381 1381 return ret; 1382 1382 } 1383 1383 1384 - /* 1385 - * Will look for holes and unwritten extents in the range starting at 1386 - * pos for count bytes (inclusive). 1387 - */ 1388 - static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1389 - size_t count) 1390 - { 1391 - int ret = 0; 1392 - unsigned int extent_flags; 1393 - u32 cpos, clusters, extent_len, phys_cpos; 1394 - struct super_block *sb = inode->i_sb; 1395 - 1396 - cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1397 - clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1398 - 1399 - while (clusters) { 1400 - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1401 - &extent_flags); 1402 - if (ret < 0) { 1403 - mlog_errno(ret); 1404 - goto out; 1405 - } 1406 - 1407 - if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1408 - ret = 1; 1409 - break; 1410 - } 1411 - 1412 - if (extent_len > clusters) 1413 - extent_len = clusters; 1414 - 1415 - clusters -= extent_len; 1416 - cpos += extent_len; 1417 - } 1418 - out: 1419 - return ret; 1420 - } 1421 - 1422 1384 static int ocfs2_write_remove_suid(struct inode *inode) 1423 1385 { 1424 1386 int ret; ··· 2091 2129 2092 2130 static int ocfs2_prepare_inode_for_write(struct file *file, 2093 2131 loff_t pos, 2094 - size_t count, 2095 - int appending, 2096 - int *direct_io, 2097 - int *has_refcount) 2132 + size_t count) 2098 2133 { 2099 2134 int ret = 0, meta_level = 0; 2100 2135 struct dentry *dentry = file->f_path.dentry; 2101 2136 struct inode *inode = d_inode(dentry); 2102 2137 loff_t end; 2103 - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2104 - int full_coherency = !(osb->s_mount_opt & 2105 - OCFS2_MOUNT_COHERENCY_BUFFERED); 2106 2138 2107 2139 /* 2108 2140 * We start with a read level meta lock and only jump to an ex ··· 2145 2189 pos, 2146 2190 count, 2147 2191 &meta_level); 2148 - if (has_refcount) 2149 - *has_refcount = 1; 2150 - if (direct_io) 2151 - *direct_io = 0; 2152 2192 } 2153 2193 2154 2194 
if (ret < 0) { ··· 2152 2200 goto out_unlock; 2153 2201 } 2154 2202 2155 - /* 2156 - * Skip the O_DIRECT checks if we don't need 2157 - * them. 2158 - */ 2159 - if (!direct_io || !(*direct_io)) 2160 - break; 2161 - 2162 - /* 2163 - * There's no sane way to do direct writes to an inode 2164 - * with inline data. 2165 - */ 2166 - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 2167 - *direct_io = 0; 2168 - break; 2169 - } 2170 - 2171 - /* 2172 - * Allowing concurrent direct writes means 2173 - * i_size changes wouldn't be synchronized, so 2174 - * one node could wind up truncating another 2175 - * nodes writes. 2176 - */ 2177 - if (end > i_size_read(inode) && !full_coherency) { 2178 - *direct_io = 0; 2179 - break; 2180 - } 2181 - 2182 - /* 2183 - * Fallback to old way if the feature bit is not set. 2184 - */ 2185 - if (end > i_size_read(inode) && 2186 - !ocfs2_supports_append_dio(osb)) { 2187 - *direct_io = 0; 2188 - break; 2189 - } 2190 - 2191 - /* 2192 - * We don't fill holes during direct io, so 2193 - * check for them here. If any are found, the 2194 - * caller will have to retake some cluster 2195 - * locks and initiate the io as buffered. 2196 - */ 2197 - ret = ocfs2_check_range_for_holes(inode, pos, count); 2198 - if (ret == 1) { 2199 - /* 2200 - * Fallback to old way if the feature bit is not set. 2201 - * Otherwise try dio first and then complete the rest 2202 - * request through buffer io. 
2203 - */ 2204 - if (!ocfs2_supports_append_dio(osb)) 2205 - *direct_io = 0; 2206 - ret = 0; 2207 - } else if (ret < 0) 2208 - mlog_errno(ret); 2209 2203 break; 2210 2204 } 2211 2205 2212 2206 out_unlock: 2213 2207 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2214 - pos, appending, count, 2215 - direct_io, has_refcount); 2208 + pos, count); 2216 2209 2217 2210 if (meta_level >= 0) 2218 2211 ocfs2_inode_unlock(inode, meta_level); ··· 2169 2272 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, 2170 2273 struct iov_iter *from) 2171 2274 { 2172 - int direct_io, appending, rw_level; 2173 - int can_do_direct, has_refcount = 0; 2275 + int direct_io, rw_level; 2174 2276 ssize_t written = 0; 2175 2277 ssize_t ret; 2176 - size_t count = iov_iter_count(from), orig_count; 2278 + size_t count = iov_iter_count(from); 2177 2279 struct file *file = iocb->ki_filp; 2178 2280 struct inode *inode = file_inode(file); 2179 2281 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2180 2282 int full_coherency = !(osb->s_mount_opt & 2181 2283 OCFS2_MOUNT_COHERENCY_BUFFERED); 2182 - int unaligned_dio = 0; 2183 - int dropped_dio = 0; 2284 + void *saved_ki_complete = NULL; 2184 2285 int append_write = ((iocb->ki_pos + count) >= 2185 2286 i_size_read(inode) ? 1 : 0); 2186 2287 ··· 2191 2296 if (count == 0) 2192 2297 return 0; 2193 2298 2194 - appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; 2195 2299 direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2196 2300 2197 2301 inode_lock(inode); 2198 2302 2199 - relock: 2200 2303 /* 2201 2304 * Concurrent O_DIRECT writes are allowed with 2202 2305 * mount_option "coherency=buffered". 
··· 2227 2334 ocfs2_inode_unlock(inode, 1); 2228 2335 } 2229 2336 2230 - orig_count = iov_iter_count(from); 2231 2337 ret = generic_write_checks(iocb, from); 2232 2338 if (ret <= 0) { 2233 2339 if (ret) ··· 2235 2343 } 2236 2344 count = ret; 2237 2345 2238 - can_do_direct = direct_io; 2239 - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, 2240 - &can_do_direct, &has_refcount); 2346 + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); 2241 2347 if (ret < 0) { 2242 2348 mlog_errno(ret); 2243 2349 goto out; 2244 2350 } 2245 2351 2246 - if (direct_io && !is_sync_kiocb(iocb)) 2247 - unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); 2248 - 2249 - /* 2250 - * We can't complete the direct I/O as requested, fall back to 2251 - * buffered I/O. 2252 - */ 2253 - if (direct_io && !can_do_direct) { 2254 - ocfs2_rw_unlock(inode, rw_level); 2255 - 2256 - rw_level = -1; 2257 - 2258 - direct_io = 0; 2259 - iocb->ki_flags &= ~IOCB_DIRECT; 2260 - iov_iter_reexpand(from, orig_count); 2261 - dropped_dio = 1; 2262 - goto relock; 2263 - } 2264 - 2265 - if (unaligned_dio) { 2352 + if (direct_io && !is_sync_kiocb(iocb) && 2353 + ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { 2266 2354 /* 2267 - * Wait on previous unaligned aio to complete before 2268 - * proceeding. 2355 + * Make it a sync io if it's an unaligned aio. 
2269 2356 */ 2270 - mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); 2271 - /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ 2272 - ocfs2_iocb_set_unaligned_aio(iocb); 2357 + saved_ki_complete = xchg(&iocb->ki_complete, NULL); 2273 2358 } 2274 2359 2275 2360 /* communicate with ocfs2_dio_end_io */ ··· 2267 2398 */ 2268 2399 if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2269 2400 rw_level = -1; 2270 - unaligned_dio = 0; 2271 2401 } 2272 2402 2273 2403 if (unlikely(written <= 0)) 2274 - goto no_sync; 2404 + goto out; 2275 2405 2276 2406 if (((file->f_flags & O_DSYNC) && !direct_io) || 2277 - IS_SYNC(inode) || dropped_dio) { 2407 + IS_SYNC(inode)) { 2278 2408 ret = filemap_fdatawrite_range(file->f_mapping, 2279 2409 iocb->ki_pos - written, 2280 2410 iocb->ki_pos - 1); ··· 2292 2424 iocb->ki_pos - 1); 2293 2425 } 2294 2426 2295 - no_sync: 2296 - if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { 2297 - ocfs2_iocb_clear_unaligned_aio(iocb); 2298 - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); 2299 - } 2300 - 2301 2427 out: 2428 + if (saved_ki_complete) 2429 + xchg(&iocb->ki_complete, saved_ki_complete); 2430 + 2302 2431 if (rw_level != -1) 2303 2432 ocfs2_rw_unlock(inode, rw_level); 2304 2433
+3
fs/ocfs2/inode.c
··· 1170 1170 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1171 1171 "Clear inode of %llu, inode has io markers\n", 1172 1172 (unsigned long long)oi->ip_blkno); 1173 + mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), 1174 + "Clear inode of %llu, inode has unwritten extents\n", 1175 + (unsigned long long)oi->ip_blkno); 1173 1176 1174 1177 ocfs2_extent_map_trunc(inode, 0); 1175 1178
+3 -3
fs/ocfs2/inode.h
··· 43 43 /* protects extended attribute changes on this inode */ 44 44 struct rw_semaphore ip_xattr_sem; 45 45 46 - /* Number of outstanding AIO's which are not page aligned */ 47 - struct mutex ip_unaligned_aio; 48 - 49 46 /* These fields are protected by ip_lock */ 50 47 spinlock_t ip_lock; 51 48 u32 ip_open_count; ··· 53 56 struct mutex ip_io_mutex; 54 57 u32 ip_flags; /* see below */ 55 58 u32 ip_attr; /* inode attributes */ 59 + 60 + /* Record unwritten extents during direct io. */ 61 + struct list_head ip_unwritten_list; 56 62 57 63 /* protected by recovery_lock. */ 58 64 struct inode *ip_next_orphan;
+4 -4
fs/ocfs2/journal.c
··· 231 231 /* At this point, we know that no more recovery threads can be 232 232 * launched, so wait for any recovery completion work to 233 233 * complete. */ 234 - flush_workqueue(ocfs2_wq); 234 + flush_workqueue(osb->ocfs2_wq); 235 235 236 236 /* 237 237 * Now that recovery is shut down, and the osb is about to be ··· 1326 1326 1327 1327 spin_lock(&journal->j_lock); 1328 1328 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1329 - queue_work(ocfs2_wq, &journal->j_recovery_work); 1329 + queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); 1330 1330 spin_unlock(&journal->j_lock); 1331 1331 } 1332 1332 ··· 1968 1968 mutex_lock(&os->os_lock); 1969 1969 ocfs2_queue_orphan_scan(osb); 1970 1970 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1971 - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 1971 + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, 1972 1972 ocfs2_orphan_scan_timeout()); 1973 1973 mutex_unlock(&os->os_lock); 1974 1974 } ··· 2008 2008 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2009 2009 else { 2010 2010 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 2011 - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, 2011 + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, 2012 2012 ocfs2_orphan_scan_timeout()); 2013 2013 } 2014 2014 }
+2 -2
fs/ocfs2/localalloc.c
··· 386 386 struct ocfs2_dinode *alloc = NULL; 387 387 388 388 cancel_delayed_work(&osb->la_enable_wq); 389 - flush_workqueue(ocfs2_wq); 389 + flush_workqueue(osb->ocfs2_wq); 390 390 391 391 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 392 392 goto out; ··· 1085 1085 } else { 1086 1086 osb->local_alloc_state = OCFS2_LA_DISABLED; 1087 1087 } 1088 - queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, 1088 + queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, 1089 1089 OCFS2_LA_ENABLE_INTERVAL); 1090 1090 goto out_unlock; 1091 1091 }
+2 -2
fs/ocfs2/mmap.c
··· 104 104 if (page->index == last_index) 105 105 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; 106 106 107 - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, 108 - &fsdata, di_bh, page); 107 + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, 108 + &locked_page, &fsdata, di_bh, page); 109 109 if (ret) { 110 110 if (ret != -ENOSPC) 111 111 mlog_errno(ret);
+8
fs/ocfs2/ocfs2.h
··· 464 464 struct ocfs2_refcount_tree *osb_ref_tree_lru; 465 465 466 466 struct mutex system_file_mutex; 467 + 468 + /* 469 + * OCFS2 needs to schedule several different types of work which 470 + * require cluster locking, disk I/O, recovery waits, etc. Since these 471 + * types of work tend to be heavy we avoid using the kernel events 472 + * workqueue and schedule on our own. 473 + */ 474 + struct workqueue_struct *ocfs2_wq; 467 475 }; 468 476 469 477 #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
+4 -12
fs/ocfs2/ocfs2_trace.h
··· 1450 1450 1451 1451 TRACE_EVENT(ocfs2_prepare_inode_for_write, 1452 1452 TP_PROTO(unsigned long long ino, unsigned long long saved_pos, 1453 - int appending, unsigned long count, 1454 - int *direct_io, int *has_refcount), 1455 - TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), 1453 + unsigned long count), 1454 + TP_ARGS(ino, saved_pos, count), 1456 1455 TP_STRUCT__entry( 1457 1456 __field(unsigned long long, ino) 1458 1457 __field(unsigned long long, saved_pos) 1459 - __field(int, appending) 1460 1458 __field(unsigned long, count) 1461 - __field(int, direct_io) 1462 - __field(int, has_refcount) 1463 1459 ), 1464 1460 TP_fast_assign( 1465 1461 __entry->ino = ino; 1466 1462 __entry->saved_pos = saved_pos; 1467 - __entry->appending = appending; 1468 1463 __entry->count = count; 1469 - __entry->direct_io = direct_io ? *direct_io : -1; 1470 - __entry->has_refcount = has_refcount ? *has_refcount : -1; 1471 1464 ), 1472 - TP_printk("%llu %llu %d %lu %d %d", __entry->ino, 1473 - __entry->saved_pos, __entry->appending, __entry->count, 1474 - __entry->direct_io, __entry->has_refcount) 1465 + TP_printk("%llu %llu %lu", __entry->ino, 1466 + __entry->saved_pos, __entry->count) 1475 1467 ); 1476 1468 1477 1469 DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+1 -1
fs/ocfs2/quota_global.c
··· 726 726 dqgrab(dquot); 727 727 /* First entry on list -> queue work */ 728 728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) 729 - queue_work(ocfs2_wq, &osb->dquot_drop_work); 729 + queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); 730 730 goto out; 731 731 } 732 732 status = ocfs2_lock_global_qf(oinfo, 1);
+1 -1
fs/ocfs2/resize.c
··· 196 196 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { 197 197 blkno = ocfs2_backup_super_blkno(inode->i_sb, i); 198 198 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 199 - if (cluster > clusters) 199 + if (cluster >= clusters) 200 200 break; 201 201 202 202 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
+16 -23
fs/ocfs2/super.c
··· 80 80 struct kmem_cache *ocfs2_dquot_cachep; 81 81 struct kmem_cache *ocfs2_qf_chunk_cachep; 82 82 83 - /* OCFS2 needs to schedule several different types of work which 84 - * require cluster locking, disk I/O, recovery waits, etc. Since these 85 - * types of work tend to be heavy we avoid using the kernel events 86 - * workqueue and schedule on our own. */ 87 - struct workqueue_struct *ocfs2_wq = NULL; 88 - 89 83 static struct dentry *ocfs2_debugfs_root; 90 84 91 85 MODULE_AUTHOR("Oracle"); ··· 1607 1613 if (status < 0) 1608 1614 goto out2; 1609 1615 1610 - ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); 1611 - if (!ocfs2_wq) { 1612 - status = -ENOMEM; 1613 - goto out3; 1614 - } 1615 - 1616 1616 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1617 1617 if (!ocfs2_debugfs_root) { 1618 1618 status = -ENOMEM; 1619 1619 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1620 - goto out4; 1620 + goto out3; 1621 1621 } 1622 1622 1623 1623 ocfs2_set_locking_protocol(); 1624 1624 1625 1625 status = register_quota_format(&ocfs2_quota_format); 1626 1626 if (status < 0) 1627 - goto out4; 1627 + goto out3; 1628 1628 status = register_filesystem(&ocfs2_fs_type); 1629 1629 if (!status) 1630 1630 return 0; 1631 1631 1632 1632 unregister_quota_format(&ocfs2_quota_format); 1633 - out4: 1634 - destroy_workqueue(ocfs2_wq); 1635 - debugfs_remove(ocfs2_debugfs_root); 1636 1633 out3: 1634 + debugfs_remove(ocfs2_debugfs_root); 1637 1635 ocfs2_free_mem_caches(); 1638 1636 out2: 1639 1637 exit_ocfs2_uptodate_cache(); ··· 1636 1650 1637 1651 static void __exit ocfs2_exit(void) 1638 1652 { 1639 - if (ocfs2_wq) { 1640 - flush_workqueue(ocfs2_wq); 1641 - destroy_workqueue(ocfs2_wq); 1642 - } 1643 - 1644 1653 unregister_quota_format(&ocfs2_quota_format); 1645 1654 1646 1655 debugfs_remove(ocfs2_debugfs_root); ··· 1726 1745 spin_lock_init(&oi->ip_lock); 1727 1746 ocfs2_extent_map_init(&oi->vfs_inode); 1728 1747 INIT_LIST_HEAD(&oi->ip_io_markers); 1748 + 
INIT_LIST_HEAD(&oi->ip_unwritten_list);
1729 1749 	oi->ip_dir_start_lookup = 0;
1730 - 	mutex_init(&oi->ip_unaligned_aio);
1731 1750 	init_rwsem(&oi->ip_alloc_sem);
1732 1751 	init_rwsem(&oi->ip_xattr_sem);
1733 1752 	mutex_init(&oi->ip_io_mutex);
··· 2330 2349 	}
2331 2350 	cleancache_init_shared_fs(sb);
2332 2351 
2352 + 	osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
2353 + 	if (!osb->ocfs2_wq) {
2354 + 		status = -ENOMEM;
2355 + 		mlog_errno(status);
2356 + 	}
2357 + 
2333 2358 bail:
2334 2359 	return status;
2335 2360 }
··· 2522 2535 static void ocfs2_delete_osb(struct ocfs2_super *osb)
2523 2536 {
2524 2537 	/* This function assumes that the caller has the main osb resource */
2538 + 
2539 + 	/* ocfs2_initialize_super() has already created this workqueue */
2540 + 	if (osb->ocfs2_wq) {
2541 + 		flush_workqueue(osb->ocfs2_wq);
2542 + 		destroy_workqueue(osb->ocfs2_wq);
2543 + 	}
2525 2544 
2526 2545 	ocfs2_free_slot_info(osb);
2527 2546 
-2
fs/ocfs2/super.h
··· 26 26 #ifndef OCFS2_SUPER_H 27 27 #define OCFS2_SUPER_H 28 28 29 - extern struct workqueue_struct *ocfs2_wq; 30 - 31 29 int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, 32 30 int node_num); 33 31
+11 -1
include/asm-generic/vmlinux.lds.h
··· 456 456 *(.entry.text) \ 457 457 VMLINUX_SYMBOL(__entry_text_end) = .; 458 458 459 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 459 + #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) 460 460 #define IRQENTRY_TEXT \ 461 461 ALIGN_FUNCTION(); \ 462 462 VMLINUX_SYMBOL(__irqentry_text_start) = .; \ ··· 464 464 VMLINUX_SYMBOL(__irqentry_text_end) = .; 465 465 #else 466 466 #define IRQENTRY_TEXT 467 + #endif 468 + 469 + #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) 470 + #define SOFTIRQENTRY_TEXT \ 471 + ALIGN_FUNCTION(); \ 472 + VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ 473 + *(.softirqentry.text) \ 474 + VMLINUX_SYMBOL(__softirqentry_text_end) = .; 475 + #else 476 + #define SOFTIRQENTRY_TEXT 467 477 #endif 468 478 469 479 /* Section used for early init (in .S files) */
-11
include/linux/ftrace.h
··· 811 811 */ 812 812 #define __notrace_funcgraph notrace 813 813 814 - /* 815 - * We want to which function is an entrypoint of a hardirq. 816 - * That will help us to put a signal on output. 817 - */ 818 - #define __irq_entry __attribute__((__section__(".irqentry.text"))) 819 - 820 - /* Limits of hardirq entrypoints */ 821 - extern char __irqentry_text_start[]; 822 - extern char __irqentry_text_end[]; 823 - 824 814 #define FTRACE_NOTRACE_DEPTH 65536 825 815 #define FTRACE_RETFUNC_DEPTH 50 826 816 #define FTRACE_RETSTACK_ALLOC_SIZE 32 ··· 847 857 #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ 848 858 849 859 #define __notrace_funcgraph 850 - #define __irq_entry 851 860 #define INIT_FTRACE_GRAPH 852 861 853 862 static inline void ftrace_graph_init_task(struct task_struct *t) { }
+20
include/linux/interrupt.h
··· 683 683 extern int arch_probe_nr_irqs(void); 684 684 extern int arch_early_irq_init(void); 685 685 686 + #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) 687 + /* 688 + * We want to know which function is an entrypoint of a hardirq or a softirq. 689 + */ 690 + #define __irq_entry __attribute__((__section__(".irqentry.text"))) 691 + #define __softirq_entry \ 692 + __attribute__((__section__(".softirqentry.text"))) 693 + 694 + /* Limits of hardirq entrypoints */ 695 + extern char __irqentry_text_start[]; 696 + extern char __irqentry_text_end[]; 697 + /* Limits of softirq entrypoints */ 698 + extern char __softirqentry_text_start[]; 699 + extern char __softirqentry_text_end[]; 700 + 701 + #else 702 + #define __irq_entry 703 + #define __softirq_entry 704 + #endif 705 + 686 706 #endif
+23 -8
include/linux/kasan.h
··· 48 48 void kasan_alloc_pages(struct page *page, unsigned int order); 49 49 void kasan_free_pages(struct page *page, unsigned int order); 50 50 51 + void kasan_cache_create(struct kmem_cache *cache, size_t *size, 52 + unsigned long *flags); 53 + 51 54 void kasan_poison_slab(struct page *page); 52 55 void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); 53 56 void kasan_poison_object_data(struct kmem_cache *cache, void *object); 54 57 55 - void kasan_kmalloc_large(const void *ptr, size_t size); 58 + void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); 56 59 void kasan_kfree_large(const void *ptr); 57 60 void kasan_kfree(void *ptr); 58 - void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); 59 - void kasan_krealloc(const void *object, size_t new_size); 61 + void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, 62 + gfp_t flags); 63 + void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); 60 64 61 - void kasan_slab_alloc(struct kmem_cache *s, void *object); 65 + void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); 62 66 void kasan_slab_free(struct kmem_cache *s, void *object); 67 + 68 + struct kasan_cache { 69 + int alloc_meta_offset; 70 + int free_meta_offset; 71 + }; 63 72 64 73 int kasan_module_alloc(void *addr, size_t size); 65 74 void kasan_free_shadow(const struct vm_struct *vm); ··· 85 76 static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} 86 77 static inline void kasan_free_pages(struct page *page, unsigned int order) {} 87 78 79 + static inline void kasan_cache_create(struct kmem_cache *cache, 80 + size_t *size, 81 + unsigned long *flags) {} 82 + 88 83 static inline void kasan_poison_slab(struct page *page) {} 89 84 static inline void kasan_unpoison_object_data(struct kmem_cache *cache, 90 85 void *object) {} 91 86 static inline void kasan_poison_object_data(struct kmem_cache *cache, 92 87 void *object) {} 93 88 94 - 
static inline void kasan_kmalloc_large(void *ptr, size_t size) {} 89 + static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {} 95 90 static inline void kasan_kfree_large(const void *ptr) {} 96 91 static inline void kasan_kfree(void *ptr) {} 97 92 static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, 98 - size_t size) {} 99 - static inline void kasan_krealloc(const void *object, size_t new_size) {} 93 + size_t size, gfp_t flags) {} 94 + static inline void kasan_krealloc(const void *object, size_t new_size, 95 + gfp_t flags) {} 100 96 101 - static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} 97 + static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, 98 + gfp_t flags) {} 102 99 static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} 103 100 104 101 static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+2
include/linux/mm.h
··· 1132 1132 struct address_space *check_mapping; /* Check page->mapping if set */ 1133 1133 pgoff_t first_index; /* Lowest page->index to unmap */ 1134 1134 pgoff_t last_index; /* Highest page->index to unmap */ 1135 + bool ignore_dirty; /* Ignore dirty pages */ 1136 + bool check_swap_entries; /* Check also swap entries */ 1135 1137 }; 1136 1138 1137 1139 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+1 -3
include/linux/oom.h
··· 76 76 struct mem_cgroup *memcg, const nodemask_t *nodemask, 77 77 unsigned long totalpages); 78 78 79 - extern int oom_kills_count(void); 80 - extern void note_oom_kill(void); 81 79 extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, 82 80 unsigned int points, unsigned long totalpages, 83 81 struct mem_cgroup *memcg, const char *message); ··· 89 91 90 92 extern bool out_of_memory(struct oom_control *oc); 91 93 92 - extern void exit_oom_victim(void); 94 + extern void exit_oom_victim(struct task_struct *tsk); 93 95 94 96 extern int register_oom_notifier(struct notifier_block *nb); 95 97 extern int unregister_oom_notifier(struct notifier_block *nb);
+4
include/linux/sched.h
··· 426 426 extern signed long schedule_timeout_interruptible(signed long timeout); 427 427 extern signed long schedule_timeout_killable(signed long timeout); 428 428 extern signed long schedule_timeout_uninterruptible(signed long timeout); 429 + extern signed long schedule_timeout_idle(signed long timeout); 429 430 asmlinkage void schedule(void); 430 431 extern void schedule_preempt_disabled(void); 431 432 ··· 1849 1848 unsigned long task_state_change; 1850 1849 #endif 1851 1850 int pagefault_disabled; 1851 + #ifdef CONFIG_MMU 1852 + struct task_struct *oom_reaper_list; 1853 + #endif 1852 1854 /* CPU-specific state of this task */ 1853 1855 struct thread_struct thread; 1854 1856 /*
+8 -2
include/linux/slab.h
··· 92 92 # define SLAB_ACCOUNT 0x00000000UL 93 93 #endif 94 94 95 + #ifdef CONFIG_KASAN 96 + #define SLAB_KASAN 0x08000000UL 97 + #else 98 + #define SLAB_KASAN 0x00000000UL 99 + #endif 100 + 95 101 /* The following flags affect the page allocator grouping pages by mobility */ 96 102 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 97 103 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ ··· 376 370 { 377 371 void *ret = kmem_cache_alloc(s, flags); 378 372 379 - kasan_kmalloc(s, ret, size); 373 + kasan_kmalloc(s, ret, size, flags); 380 374 return ret; 381 375 } 382 376 ··· 387 381 { 388 382 void *ret = kmem_cache_alloc_node(s, gfpflags, node); 389 383 390 - kasan_kmalloc(s, ret, size); 384 + kasan_kmalloc(s, ret, size, gfpflags); 391 385 return ret; 392 386 } 393 387 #endif /* CONFIG_TRACING */
+14
include/linux/slab_def.h
··· 76 76 #ifdef CONFIG_MEMCG 77 77 struct memcg_cache_params memcg_params; 78 78 #endif 79 + #ifdef CONFIG_KASAN 80 + struct kasan_cache kasan_info; 81 + #endif 79 82 80 83 struct kmem_cache_node *node[MAX_NUMNODES]; 81 84 }; 85 + 86 + static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, 87 + void *x) { 88 + void *object = x - (x - page->s_mem) % cache->size; 89 + void *last_object = page->s_mem + (cache->num - 1) * cache->size; 90 + 91 + if (unlikely(object > last_object)) 92 + return last_object; 93 + else 94 + return object; 95 + } 82 96 83 97 #endif /* _LINUX_SLAB_DEF_H */
+11
include/linux/slub_def.h
··· 130 130 void object_err(struct kmem_cache *s, struct page *page, 131 131 u8 *object, char *reason); 132 132 133 + static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, 134 + void *x) { 135 + void *object = x - (x - page_address(page)) % cache->size; 136 + void *last_object = page_address(page) + 137 + (page->objects - 1) * cache->size; 138 + if (unlikely(object > last_object)) 139 + return last_object; 140 + else 141 + return object; 142 + } 143 + 133 144 #endif /* _LINUX_SLUB_DEF_H */
+32
include/linux/stackdepot.h
··· 1 + /* 2 + * A generic stack depot implementation 3 + * 4 + * Author: Alexander Potapenko <glider@google.com> 5 + * Copyright (C) 2016 Google, Inc. 6 + * 7 + * Based on code by Dmitry Chernenkov. 8 + * 9 + * This program is free software; you can redistribute it and/or modify 10 + * it under the terms of the GNU General Public License as published by 11 + * the Free Software Foundation; either version 2 of the License, or 12 + * (at your option) any later version. 13 + * 14 + * This program is distributed in the hope that it will be useful, 15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 + * GNU General Public License for more details. 18 + * 19 + */ 20 + 21 + #ifndef _LINUX_STACKDEPOT_H 22 + #define _LINUX_STACKDEPOT_H 23 + 24 + typedef u32 depot_stack_handle_t; 25 + 26 + struct stack_trace; 27 + 28 + depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); 29 + 30 + void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); 31 + 32 + #endif
+1 -1
kernel/exit.c
··· 435 435 mm_update_next_owner(mm); 436 436 mmput(mm); 437 437 if (test_thread_flag(TIF_MEMDIE)) 438 - exit_oom_victim(); 438 + exit_oom_victim(tsk); 439 439 } 440 440 441 441 static struct task_struct *find_alive_thread(struct task_struct *p)
+1 -1
kernel/softirq.c
··· 227 227 static inline void lockdep_softirq_end(bool in_hardirq) { } 228 228 #endif 229 229 230 - asmlinkage __visible void __do_softirq(void) 230 + asmlinkage __visible void __softirq_entry __do_softirq(void) 231 231 { 232 232 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 233 233 unsigned long old_flags = current->flags;
+11
kernel/time/timer.c
··· 1566 1566 } 1567 1567 EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1568 1568 1569 + /* 1570 + * Like schedule_timeout_uninterruptible(), except this task will not contribute 1571 + * to load average. 1572 + */ 1573 + signed long __sched schedule_timeout_idle(signed long timeout) 1574 + { 1575 + __set_current_state(TASK_IDLE); 1576 + return schedule_timeout(timeout); 1577 + } 1578 + EXPORT_SYMBOL(schedule_timeout_idle); 1579 + 1569 1580 #ifdef CONFIG_HOTPLUG_CPU 1570 1581 static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) 1571 1582 {
+1
kernel/trace/trace_functions_graph.c
··· 8 8 */ 9 9 #include <linux/uaccess.h> 10 10 #include <linux/ftrace.h> 11 + #include <linux/interrupt.h> 11 12 #include <linux/slab.h> 12 13 #include <linux/fs.h> 13 14
+4
lib/Kconfig
··· 536 536 config ARCH_HAS_MMIO_FLUSH 537 537 bool 538 538 539 + config STACKDEPOT 540 + bool 541 + select STACKTRACE 542 + 539 543 endmenu
+4 -1
lib/Kconfig.kasan
··· 5 5 6 6 config KASAN 7 7 bool "KASan: runtime memory debugger" 8 - depends on SLUB_DEBUG 8 + depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) 9 9 select CONSTRUCTORS 10 + select STACKDEPOT if SLAB 10 11 help 11 12 Enables kernel address sanitizer - runtime memory debugger, 12 13 designed to find out-of-bounds accesses and use-after-free bugs. ··· 17 16 This feature consumes about 1/8 of available memory and brings about 18 17 ~x3 performance slowdown. 19 18 For better error detection enable CONFIG_STACKTRACE. 19 + Currently CONFIG_KASAN doesn't work with CONFIG_DEBUG_SLAB 20 + (the resulting kernel does not boot). 20 21 21 22 choice 22 23 prompt "Instrumentation type"
+3
lib/Makefile
··· 181 181 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o 182 182 obj-$(CONFIG_IRQ_POLL) += irq_poll.o 183 183 184 + obj-$(CONFIG_STACKDEPOT) += stackdepot.o 185 + KASAN_SANITIZE_stackdepot.o := n 186 + 184 187 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ 185 188 fdt_empty_tree.o 186 189 $(foreach file, $(libfdt_files), \
+284
lib/stackdepot.c
··· 1 + /*
2 + * Generic stack depot for storing stack traces.
3 + *
4 + * Some debugging tools need to save stack traces of certain events which can
5 + * be later presented to the user. For example, KASAN needs to save alloc and
6 + * free stacks for each object, but storing two stack traces per object
7 + * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for
8 + * that).
9 + *
10 + * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc
11 + * and free stacks repeat a lot, we save about 100x space.
12 + * Stacks are never removed from depot, so we store them contiguously one after
13 + * another in a contiguous memory allocation.
14 + *
15 + * Author: Alexander Potapenko <glider@google.com>
16 + * Copyright (C) 2016 Google, Inc.
17 + *
18 + * Based on code by Dmitry Chernenkov.
19 + *
20 + * This program is free software; you can redistribute it and/or
21 + * modify it under the terms of the GNU General Public License
22 + * version 2 as published by the Free Software Foundation.
23 + *
24 + * This program is distributed in the hope that it will be useful, but
25 + * WITHOUT ANY WARRANTY; without even the implied warranty of
26 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 + * General Public License for more details.
28 + *
29 + */
30 + 
31 + #include <linux/gfp.h>
32 + #include <linux/jhash.h>
33 + #include <linux/kernel.h>
34 + #include <linux/mm.h>
35 + #include <linux/percpu.h>
36 + #include <linux/printk.h>
37 + #include <linux/slab.h>
38 + #include <linux/stacktrace.h>
39 + #include <linux/stackdepot.h>
40 + #include <linux/string.h>
41 + #include <linux/types.h>
42 + 
43 + #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
44 + 
45 + #define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */
46 + #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER))
47 + #define STACK_ALLOC_ALIGN 4
48 + #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \
49 + STACK_ALLOC_ALIGN)
50 + #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS)
51 + #define STACK_ALLOC_SLABS_CAP 1024
52 + #define STACK_ALLOC_MAX_SLABS \
53 + (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
54 + (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
55 + 
56 + /* The compact structure to store the reference to stacks. */
57 + union handle_parts {
58 + 	depot_stack_handle_t handle;
59 + 	struct {
60 + 		u32 slabindex : STACK_ALLOC_INDEX_BITS;
61 + 		u32 offset : STACK_ALLOC_OFFSET_BITS;
62 + 	};
63 + };
64 + 
65 + struct stack_record {
66 + 	struct stack_record *next;	/* Link in the hashtable */
67 + 	u32 hash;	/* Hash in the hashtable */
68 + 	u32 size;	/* Number of frames in the stack */
69 + 	union handle_parts handle;
70 + 	unsigned long entries[1];	/* Variable-sized array of entries. */
71 + };
72 + 
73 + static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
74 + 
75 + static int depot_index;
76 + static int next_slab_inited;
77 + static size_t depot_offset;
78 + static DEFINE_SPINLOCK(depot_lock);
79 + 
80 + static bool init_stack_slab(void **prealloc)
81 + {
82 + 	if (!*prealloc)
83 + 		return false;
84 + 	/*
85 + 	 * This smp_load_acquire() pairs with smp_store_release() to
86 + 	 * |next_slab_inited| below and in depot_alloc_stack().
87 + */ 88 + if (smp_load_acquire(&next_slab_inited)) 89 + return true; 90 + if (stack_slabs[depot_index] == NULL) { 91 + stack_slabs[depot_index] = *prealloc; 92 + } else { 93 + stack_slabs[depot_index + 1] = *prealloc; 94 + /* 95 + * This smp_store_release pairs with smp_load_acquire() from 96 + * |next_slab_inited| above and in depot_save_stack(). 97 + */ 98 + smp_store_release(&next_slab_inited, 1); 99 + } 100 + *prealloc = NULL; 101 + return true; 102 + } 103 + 104 + /* Allocation of a new stack in raw storage */ 105 + static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, 106 + u32 hash, void **prealloc, gfp_t alloc_flags) 107 + { 108 + int required_size = offsetof(struct stack_record, entries) + 109 + sizeof(unsigned long) * size; 110 + struct stack_record *stack; 111 + 112 + required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); 113 + 114 + if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { 115 + if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { 116 + WARN_ONCE(1, "Stack depot reached limit capacity"); 117 + return NULL; 118 + } 119 + depot_index++; 120 + depot_offset = 0; 121 + /* 122 + * smp_store_release() here pairs with smp_load_acquire() from 123 + * |next_slab_inited| in depot_save_stack() and 124 + * init_stack_slab(). 
125 + */ 126 + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) 127 + smp_store_release(&next_slab_inited, 0); 128 + } 129 + init_stack_slab(prealloc); 130 + if (stack_slabs[depot_index] == NULL) 131 + return NULL; 132 + 133 + stack = stack_slabs[depot_index] + depot_offset; 134 + 135 + stack->hash = hash; 136 + stack->size = size; 137 + stack->handle.slabindex = depot_index; 138 + stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; 139 + memcpy(stack->entries, entries, size * sizeof(unsigned long)); 140 + depot_offset += required_size; 141 + 142 + return stack; 143 + } 144 + 145 + #define STACK_HASH_ORDER 20 146 + #define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) 147 + #define STACK_HASH_MASK (STACK_HASH_SIZE - 1) 148 + #define STACK_HASH_SEED 0x9747b28c 149 + 150 + static struct stack_record *stack_table[STACK_HASH_SIZE] = { 151 + [0 ... STACK_HASH_SIZE - 1] = NULL 152 + }; 153 + 154 + /* Calculate hash for a stack */ 155 + static inline u32 hash_stack(unsigned long *entries, unsigned int size) 156 + { 157 + return jhash2((u32 *)entries, 158 + size * sizeof(unsigned long) / sizeof(u32), 159 + STACK_HASH_SEED); 160 + } 161 + 162 + /* Find a stack that is equal to the one stored in entries in the hash */ 163 + static inline struct stack_record *find_stack(struct stack_record *bucket, 164 + unsigned long *entries, int size, 165 + u32 hash) 166 + { 167 + struct stack_record *found; 168 + 169 + for (found = bucket; found; found = found->next) { 170 + if (found->hash == hash && 171 + found->size == size && 172 + !memcmp(entries, found->entries, 173 + size * sizeof(unsigned long))) { 174 + return found; 175 + } 176 + } 177 + return NULL; 178 + } 179 + 180 + void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) 181 + { 182 + union handle_parts parts = { .handle = handle }; 183 + void *slab = stack_slabs[parts.slabindex]; 184 + size_t offset = parts.offset << STACK_ALLOC_ALIGN; 185 + struct stack_record *stack = slab + offset; 186 + 187 + 
trace->nr_entries = trace->max_entries = stack->size; 188 + trace->entries = stack->entries; 189 + trace->skip = 0; 190 + } 191 + 192 + /** 193 + * depot_save_stack - save stack in a stack depot. 194 + * @trace - the stacktrace to save. 195 + * @alloc_flags - flags for allocating additional memory if required. 196 + * 197 + * Returns the handle of the stack struct stored in depot. 198 + */ 199 + depot_stack_handle_t depot_save_stack(struct stack_trace *trace, 200 + gfp_t alloc_flags) 201 + { 202 + u32 hash; 203 + depot_stack_handle_t retval = 0; 204 + struct stack_record *found = NULL, **bucket; 205 + unsigned long flags; 206 + struct page *page = NULL; 207 + void *prealloc = NULL; 208 + 209 + if (unlikely(trace->nr_entries == 0)) 210 + goto fast_exit; 211 + 212 + hash = hash_stack(trace->entries, trace->nr_entries); 213 + /* Bad luck, we won't store this stack. */ 214 + if (hash == 0) 215 + goto exit; 216 + 217 + bucket = &stack_table[hash & STACK_HASH_MASK]; 218 + 219 + /* 220 + * Fast path: look the stack trace up without locking. 221 + * The smp_load_acquire() here pairs with smp_store_release() to 222 + * |bucket| below. 223 + */ 224 + found = find_stack(smp_load_acquire(bucket), trace->entries, 225 + trace->nr_entries, hash); 226 + if (found) 227 + goto exit; 228 + 229 + /* 230 + * Check if the current or the next stack slab need to be initialized. 231 + * If so, allocate the memory - we won't be able to do that under the 232 + * lock. 233 + * 234 + * The smp_load_acquire() here pairs with smp_store_release() to 235 + * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). 236 + */ 237 + if (unlikely(!smp_load_acquire(&next_slab_inited))) { 238 + /* 239 + * Zero out zone modifiers, as we don't have specific zone 240 + * requirements. Keep the flags related to allocation in atomic 241 + * contexts and I/O. 
242 + */ 243 + alloc_flags &= ~GFP_ZONEMASK; 244 + alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); 245 + page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); 246 + if (page) 247 + prealloc = page_address(page); 248 + } 249 + 250 + spin_lock_irqsave(&depot_lock, flags); 251 + 252 + found = find_stack(*bucket, trace->entries, trace->nr_entries, hash); 253 + if (!found) { 254 + struct stack_record *new = 255 + depot_alloc_stack(trace->entries, trace->nr_entries, 256 + hash, &prealloc, alloc_flags); 257 + if (new) { 258 + new->next = *bucket; 259 + /* 260 + * This smp_store_release() pairs with 261 + * smp_load_acquire() from |bucket| above. 262 + */ 263 + smp_store_release(bucket, new); 264 + found = new; 265 + } 266 + } else if (prealloc) { 267 + /* 268 + * We didn't need to store this stack trace, but let's keep 269 + * the preallocated memory for the future. 270 + */ 271 + WARN_ON(!init_stack_slab(&prealloc)); 272 + } 273 + 274 + spin_unlock_irqrestore(&depot_lock, flags); 275 + exit: 276 + if (prealloc) { 277 + /* Nobody used this memory, ok to free it. */ 278 + free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); 279 + } 280 + if (found) 281 + retval = found->handle.handle; 282 + fast_exit: 283 + return retval; 284 + }
+29 -1
lib/test_kasan.c
··· 65 65 kfree(ptr); 66 66 } 67 67 68 - static noinline void __init kmalloc_large_oob_right(void) 68 + #ifdef CONFIG_SLUB 69 + static noinline void __init kmalloc_pagealloc_oob_right(void) 69 70 { 70 71 char *ptr; 71 72 size_t size = KMALLOC_MAX_CACHE_SIZE + 10; 72 73 74 + /* Allocate a chunk that does not fit into a SLUB cache to trigger 75 + * the page allocator fallback. 76 + */ 77 + pr_info("kmalloc pagealloc allocation: out-of-bounds to right\n"); 78 + ptr = kmalloc(size, GFP_KERNEL); 79 + if (!ptr) { 80 + pr_err("Allocation failed\n"); 81 + return; 82 + } 83 + 84 + ptr[size] = 0; 85 + kfree(ptr); 86 + } 87 + #endif 88 + 89 + static noinline void __init kmalloc_large_oob_right(void) 90 + { 91 + char *ptr; 92 + size_t size = KMALLOC_MAX_CACHE_SIZE - 256; 93 + /* Allocate a chunk that is large enough, but still fits into a slab 94 + * and does not trigger the page allocator fallback in SLUB. 95 + */ 73 96 pr_info("kmalloc large allocation: out-of-bounds to right\n"); 74 97 ptr = kmalloc(size, GFP_KERNEL); 75 98 if (!ptr) { ··· 294 271 } 295 272 296 273 ptr1[40] = 'x'; 274 + if (ptr1 == ptr2) 275 + pr_err("Could not detect use-after-free: ptr1 == ptr2\n"); 297 276 kfree(ptr2); 298 277 } 299 278 ··· 349 324 kmalloc_oob_right(); 350 325 kmalloc_oob_left(); 351 326 kmalloc_node_oob_right(); 327 + #ifdef CONFIG_SLUB 328 + kmalloc_pagealloc_oob_right(); 329 + #endif 352 330 kmalloc_large_oob_right(); 353 331 kmalloc_oob_krealloc_more(); 354 332 kmalloc_oob_krealloc_less();
+1
mm/Makefile
··· 3 3 # 4 4 5 5 KASAN_SANITIZE_slab_common.o := n 6 + KASAN_SANITIZE_slab.o := n 6 7 KASAN_SANITIZE_slub.o := n 7 8 8 9 # These files are disabled because they produce non-interesting and/or
+4 -3
mm/filemap.c
··· 1840 1840 ssize_t retval = 0; 1841 1841 loff_t *ppos = &iocb->ki_pos; 1842 1842 loff_t pos = *ppos; 1843 + size_t count = iov_iter_count(iter); 1844 + 1845 + if (!count) 1846 + goto out; /* skip atime */ 1843 1847 1844 1848 if (iocb->ki_flags & IOCB_DIRECT) { 1845 1849 struct address_space *mapping = file->f_mapping; 1846 1850 struct inode *inode = mapping->host; 1847 - size_t count = iov_iter_count(iter); 1848 1851 loff_t size; 1849 1852 1850 - if (!count) 1851 - goto out; /* skip atime */ 1852 1853 size = i_size_read(inode); 1853 1854 retval = filemap_write_and_wait_range(mapping, pos, 1854 1855 pos + count - 1);
+1 -1
mm/huge_memory.c
··· 2578 2578 } 2579 2579 khugepaged_node_load[node]++; 2580 2580 if (!PageLRU(page)) { 2581 - result = SCAN_SCAN_ABORT; 2581 + result = SCAN_PAGE_LRU; 2582 2582 goto out_unmap; 2583 2583 } 2584 2584 if (PageLocked(page)) {
+5
mm/internal.h
··· 38 38 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 39 39 unsigned long floor, unsigned long ceiling); 40 40 41 + void unmap_page_range(struct mmu_gather *tlb, 42 + struct vm_area_struct *vma, 43 + unsigned long addr, unsigned long end, 44 + struct zap_details *details); 45 + 41 46 extern int __do_page_cache_readahead(struct address_space *mapping, 42 47 struct file *filp, pgoff_t offset, unsigned long nr_to_read, 43 48 unsigned long lookahead_size);
+154 -8
mm/kasan/kasan.c
··· 17 17 #define DISABLE_BRANCH_PROFILING 18 18 19 19 #include <linux/export.h> 20 + #include <linux/interrupt.h> 20 21 #include <linux/init.h> 22 + #include <linux/kasan.h> 21 23 #include <linux/kernel.h> 22 24 #include <linux/kmemleak.h> 23 25 #include <linux/linkage.h> ··· 34 32 #include <linux/string.h> 35 33 #include <linux/types.h> 36 34 #include <linux/vmalloc.h> 37 - #include <linux/kasan.h> 38 35 39 36 #include "kasan.h" 40 37 #include "../slab.h" ··· 335 334 KASAN_FREE_PAGE); 336 335 } 337 336 337 + #ifdef CONFIG_SLAB 338 + /* 339 + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. 340 + * For larger allocations larger redzones are used. 341 + */ 342 + static size_t optimal_redzone(size_t object_size) 343 + { 344 + int rz = 345 + object_size <= 64 - 16 ? 16 : 346 + object_size <= 128 - 32 ? 32 : 347 + object_size <= 512 - 64 ? 64 : 348 + object_size <= 4096 - 128 ? 128 : 349 + object_size <= (1 << 14) - 256 ? 256 : 350 + object_size <= (1 << 15) - 512 ? 512 : 351 + object_size <= (1 << 16) - 1024 ? 1024 : 2048; 352 + return rz; 353 + } 354 + 355 + void kasan_cache_create(struct kmem_cache *cache, size_t *size, 356 + unsigned long *flags) 357 + { 358 + int redzone_adjust; 359 + /* Make sure the adjusted size is still less than 360 + * KMALLOC_MAX_CACHE_SIZE. 361 + * TODO: this check is only useful for SLAB, but not SLUB. We'll need 362 + * to skip it for SLUB when it starts using kasan_cache_create(). 363 + */ 364 + if (*size > KMALLOC_MAX_CACHE_SIZE - 365 + sizeof(struct kasan_alloc_meta) - 366 + sizeof(struct kasan_free_meta)) 367 + return; 368 + *flags |= SLAB_KASAN; 369 + /* Add alloc meta. */ 370 + cache->kasan_info.alloc_meta_offset = *size; 371 + *size += sizeof(struct kasan_alloc_meta); 372 + 373 + /* Add free meta. 
*/ 374 + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || 375 + cache->object_size < sizeof(struct kasan_free_meta)) { 376 + cache->kasan_info.free_meta_offset = *size; 377 + *size += sizeof(struct kasan_free_meta); 378 + } 379 + redzone_adjust = optimal_redzone(cache->object_size) - 380 + (*size - cache->object_size); 381 + if (redzone_adjust > 0) 382 + *size += redzone_adjust; 383 + *size = min(KMALLOC_MAX_CACHE_SIZE, 384 + max(*size, 385 + cache->object_size + 386 + optimal_redzone(cache->object_size))); 387 + } 388 + #endif 389 + 338 390 void kasan_poison_slab(struct page *page) 339 391 { 340 392 kasan_poison_shadow(page_address(page), ··· 405 351 kasan_poison_shadow(object, 406 352 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), 407 353 KASAN_KMALLOC_REDZONE); 354 + #ifdef CONFIG_SLAB 355 + if (cache->flags & SLAB_KASAN) { 356 + struct kasan_alloc_meta *alloc_info = 357 + get_alloc_info(cache, object); 358 + alloc_info->state = KASAN_STATE_INIT; 359 + } 360 + #endif 408 361 } 409 362 410 - void kasan_slab_alloc(struct kmem_cache *cache, void *object) 363 + #ifdef CONFIG_SLAB 364 + static inline int in_irqentry_text(unsigned long ptr) 411 365 { 412 - kasan_kmalloc(cache, object, cache->object_size); 366 + return (ptr >= (unsigned long)&__irqentry_text_start && 367 + ptr < (unsigned long)&__irqentry_text_end) || 368 + (ptr >= (unsigned long)&__softirqentry_text_start && 369 + ptr < (unsigned long)&__softirqentry_text_end); 370 + } 371 + 372 + static inline void filter_irq_stacks(struct stack_trace *trace) 373 + { 374 + int i; 375 + 376 + if (!trace->nr_entries) 377 + return; 378 + for (i = 0; i < trace->nr_entries; i++) 379 + if (in_irqentry_text(trace->entries[i])) { 380 + /* Include the irqentry function into the stack. 
*/ 381 + trace->nr_entries = i + 1; 382 + break; 383 + } 384 + } 385 + 386 + static inline depot_stack_handle_t save_stack(gfp_t flags) 387 + { 388 + unsigned long entries[KASAN_STACK_DEPTH]; 389 + struct stack_trace trace = { 390 + .nr_entries = 0, 391 + .entries = entries, 392 + .max_entries = KASAN_STACK_DEPTH, 393 + .skip = 0 394 + }; 395 + 396 + save_stack_trace(&trace); 397 + filter_irq_stacks(&trace); 398 + if (trace.nr_entries != 0 && 399 + trace.entries[trace.nr_entries-1] == ULONG_MAX) 400 + trace.nr_entries--; 401 + 402 + return depot_save_stack(&trace, flags); 403 + } 404 + 405 + static inline void set_track(struct kasan_track *track, gfp_t flags) 406 + { 407 + track->pid = current->pid; 408 + track->stack = save_stack(flags); 409 + } 410 + 411 + struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 412 + const void *object) 413 + { 414 + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); 415 + return (void *)object + cache->kasan_info.alloc_meta_offset; 416 + } 417 + 418 + struct kasan_free_meta *get_free_info(struct kmem_cache *cache, 419 + const void *object) 420 + { 421 + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); 422 + return (void *)object + cache->kasan_info.free_meta_offset; 423 + } 424 + #endif 425 + 426 + void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) 427 + { 428 + kasan_kmalloc(cache, object, cache->object_size, flags); 413 429 } 414 430 415 431 void kasan_slab_free(struct kmem_cache *cache, void *object) ··· 491 367 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 492 368 return; 493 369 370 + #ifdef CONFIG_SLAB 371 + if (cache->flags & SLAB_KASAN) { 372 + struct kasan_free_meta *free_info = 373 + get_free_info(cache, object); 374 + struct kasan_alloc_meta *alloc_info = 375 + get_alloc_info(cache, object); 376 + alloc_info->state = KASAN_STATE_FREE; 377 + set_track(&free_info->track); 378 + } 379 + #endif 380 + 494 381 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 495 382 
} 496 383 497 - void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) 384 + void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, 385 + gfp_t flags) 498 386 { 499 387 unsigned long redzone_start; 500 388 unsigned long redzone_end; ··· 522 386 kasan_unpoison_shadow(object, size); 523 387 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 524 388 KASAN_KMALLOC_REDZONE); 389 + #ifdef CONFIG_SLAB 390 + if (cache->flags & SLAB_KASAN) { 391 + struct kasan_alloc_meta *alloc_info = 392 + get_alloc_info(cache, object); 393 + 394 + alloc_info->state = KASAN_STATE_ALLOC; 395 + alloc_info->alloc_size = size; 396 + set_track(&alloc_info->track, flags); 397 + } 398 + #endif 525 399 } 526 400 EXPORT_SYMBOL(kasan_kmalloc); 527 401 528 - void kasan_kmalloc_large(const void *ptr, size_t size) 402 + void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) 529 403 { 530 404 struct page *page; 531 405 unsigned long redzone_start; ··· 554 408 KASAN_PAGE_REDZONE); 555 409 } 556 410 557 - void kasan_krealloc(const void *object, size_t size) 411 + void kasan_krealloc(const void *object, size_t size, gfp_t flags) 558 412 { 559 413 struct page *page; 560 414 ··· 564 418 page = virt_to_head_page(object); 565 419 566 420 if (unlikely(!PageSlab(page))) 567 - kasan_kmalloc_large(object, size); 421 + kasan_kmalloc_large(object, size, flags); 568 422 else 569 - kasan_kmalloc(page->slab_cache, object, size); 423 + kasan_kmalloc(page->slab_cache, object, size, flags); 570 424 } 571 425 572 426 void kasan_kfree(void *ptr)
+37
mm/kasan/kasan.h
··· 2 2 #define __MM_KASAN_KASAN_H 3 3 4 4 #include <linux/kasan.h> 5 + #include <linux/stackdepot.h> 5 6 6 7 #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) 7 8 #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) ··· 54 53 struct kasan_source_location *location; 55 54 #endif 56 55 }; 56 + 57 + /** 58 + * Structures to keep alloc and free tracks * 59 + */ 60 + 61 + enum kasan_state { 62 + KASAN_STATE_INIT, 63 + KASAN_STATE_ALLOC, 64 + KASAN_STATE_FREE 65 + }; 66 + 67 + #define KASAN_STACK_DEPTH 64 68 + 69 + struct kasan_track { 70 + u32 pid; 71 + depot_stack_handle_t stack; 72 + }; 73 + 74 + struct kasan_alloc_meta { 75 + struct kasan_track track; 76 + u32 state : 2; /* enum kasan_state */ 77 + u32 alloc_size : 30; 78 + u32 reserved; 79 + }; 80 + 81 + struct kasan_free_meta { 82 + /* Allocator freelist pointer, unused by KASAN. */ 83 + void **freelist; 84 + struct kasan_track track; 85 + }; 86 + 87 + struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 88 + const void *object); 89 + struct kasan_free_meta *get_free_info(struct kmem_cache *cache, 90 + const void *object); 91 + 57 92 58 93 static inline const void *kasan_shadow_to_mem(const void *shadow_addr) 59 94 {
+51 -11
mm/kasan/report.c
··· 18 18 #include <linux/printk.h> 19 19 #include <linux/sched.h> 20 20 #include <linux/slab.h> 21 + #include <linux/stackdepot.h> 21 22 #include <linux/stacktrace.h> 22 23 #include <linux/string.h> 23 24 #include <linux/types.h> ··· 116 115 sizeof(init_thread_union.stack)); 117 116 } 118 117 118 + #ifdef CONFIG_SLAB 119 + static void print_track(struct kasan_track *track) 120 + { 121 + pr_err("PID = %u\n", track->pid); 122 + if (track->stack) { 123 + struct stack_trace trace; 124 + 125 + depot_fetch_stack(track->stack, &trace); 126 + print_stack_trace(&trace, 0); 127 + } else { 128 + pr_err("(stack is not available)\n"); 129 + } 130 + } 131 + 132 + static void object_err(struct kmem_cache *cache, struct page *page, 133 + void *object, char *unused_reason) 134 + { 135 + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); 136 + struct kasan_free_meta *free_info; 137 + 138 + dump_stack(); 139 + pr_err("Object at %p, in cache %s\n", object, cache->name); 140 + if (!(cache->flags & SLAB_KASAN)) 141 + return; 142 + switch (alloc_info->state) { 143 + case KASAN_STATE_INIT: 144 + pr_err("Object not allocated yet\n"); 145 + break; 146 + case KASAN_STATE_ALLOC: 147 + pr_err("Object allocated with size %u bytes.\n", 148 + alloc_info->alloc_size); 149 + pr_err("Allocation:\n"); 150 + print_track(&alloc_info->track); 151 + break; 152 + case KASAN_STATE_FREE: 153 + pr_err("Object freed, allocated with size %u bytes\n", 154 + alloc_info->alloc_size); 155 + free_info = get_free_info(cache, object); 156 + pr_err("Allocation:\n"); 157 + print_track(&alloc_info->track); 158 + pr_err("Deallocation:\n"); 159 + print_track(&free_info->track); 160 + break; 161 + } 162 + } 163 + #endif 164 + 119 165 static void print_address_description(struct kasan_access_info *info) 120 166 { 121 167 const void *addr = info->access_addr; ··· 174 126 if (PageSlab(page)) { 175 127 void *object; 176 128 struct kmem_cache *cache = page->slab_cache; 177 - void *last_object; 178 - 179 - 
object = virt_to_obj(cache, page_address(page), addr); 180 - last_object = page_address(page) + 181 - page->objects * cache->size; 182 - 183 - if (unlikely(object > last_object)) 184 - object = last_object; /* we hit into padding */ 185 - 129 + object = nearest_obj(cache, page, 130 + (void *)info->access_addr); 186 131 object_err(cache, page, object, 187 - "kasan: bad access detected"); 132 + "kasan: bad access detected"); 188 133 return; 189 134 } 190 135 dump_page(page, "kasan: bad access detected"); ··· 187 146 if (!init_task_stack_addr(addr)) 188 147 pr_err("Address belongs to variable %pS\n", addr); 189 148 } 190 - 191 149 dump_stack(); 192 150 } 193 151
+10 -7
mm/memory.c
··· 1102 1102 1103 1103 if (!PageAnon(page)) { 1104 1104 if (pte_dirty(ptent)) { 1105 + /* 1106 + * oom_reaper cannot tear down dirty 1107 + * pages 1108 + */ 1109 + if (unlikely(details && details->ignore_dirty)) 1110 + continue; 1105 1111 force_flush = 1; 1106 1112 set_page_dirty(page); 1107 1113 } ··· 1126 1120 } 1127 1121 continue; 1128 1122 } 1129 - /* If details->check_mapping, we leave swap entries. */ 1130 - if (unlikely(details)) 1123 + /* only check swap_entries if explicitly asked for in details */ 1124 + if (unlikely(details && !details->check_swap_entries)) 1131 1125 continue; 1132 1126 1133 1127 entry = pte_to_swp_entry(ptent); ··· 1232 1226 return addr; 1233 1227 } 1234 1228 1235 - static void unmap_page_range(struct mmu_gather *tlb, 1229 + void unmap_page_range(struct mmu_gather *tlb, 1236 1230 struct vm_area_struct *vma, 1237 1231 unsigned long addr, unsigned long end, 1238 1232 struct zap_details *details) 1239 1233 { 1240 1234 pgd_t *pgd; 1241 1235 unsigned long next; 1242 - 1243 - if (details && !details->check_mapping) 1244 - details = NULL; 1245 1236 1246 1237 BUG_ON(addr >= end); 1247 1238 tlb_start_vma(tlb, vma); ··· 2435 2432 void unmap_mapping_range(struct address_space *mapping, 2436 2433 loff_t const holebegin, loff_t const holelen, int even_cows) 2437 2434 { 2438 - struct zap_details details; 2435 + struct zap_details details = { }; 2439 2436 pgoff_t hba = holebegin >> PAGE_SHIFT; 2440 2437 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2441 2438
+8 -8
mm/mempool.c
··· 112 112 kasan_free_pages(element, (unsigned long)pool->pool_data); 113 113 } 114 114 115 - static void kasan_unpoison_element(mempool_t *pool, void *element) 115 + static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) 116 116 { 117 117 if (pool->alloc == mempool_alloc_slab) 118 - kasan_slab_alloc(pool->pool_data, element); 118 + kasan_slab_alloc(pool->pool_data, element, flags); 119 119 if (pool->alloc == mempool_kmalloc) 120 - kasan_krealloc(element, (size_t)pool->pool_data); 120 + kasan_krealloc(element, (size_t)pool->pool_data, flags); 121 121 if (pool->alloc == mempool_alloc_pages) 122 122 kasan_alloc_pages(element, (unsigned long)pool->pool_data); 123 123 } ··· 130 130 pool->elements[pool->curr_nr++] = element; 131 131 } 132 132 133 - static void *remove_element(mempool_t *pool) 133 + static void *remove_element(mempool_t *pool, gfp_t flags) 134 134 { 135 135 void *element = pool->elements[--pool->curr_nr]; 136 136 137 137 BUG_ON(pool->curr_nr < 0); 138 - kasan_unpoison_element(pool, element); 138 + kasan_unpoison_element(pool, element, flags); 139 139 check_element(pool, element); 140 140 return element; 141 141 } ··· 154 154 return; 155 155 156 156 while (pool->curr_nr) { 157 - void *element = remove_element(pool); 157 + void *element = remove_element(pool, GFP_KERNEL); 158 158 pool->free(element, pool->pool_data); 159 159 } 160 160 kfree(pool->elements); ··· 250 250 spin_lock_irqsave(&pool->lock, flags); 251 251 if (new_min_nr <= pool->min_nr) { 252 252 while (new_min_nr < pool->curr_nr) { 253 - element = remove_element(pool); 253 + element = remove_element(pool, GFP_KERNEL); 254 254 spin_unlock_irqrestore(&pool->lock, flags); 255 255 pool->free(element, pool->pool_data); 256 256 spin_lock_irqsave(&pool->lock, flags); ··· 347 347 348 348 spin_lock_irqsave(&pool->lock, flags); 349 349 if (likely(pool->curr_nr)) { 350 - element = remove_element(pool); 350 + element = remove_element(pool, gfp_temp); 351 351 
spin_unlock_irqrestore(&pool->lock, flags); 352 352 /* paired with rmb in mempool_free(), read comment there */ 353 353 smp_wmb();
+187 -9
mm/oom_kill.c
··· 35 35 #include <linux/freezer.h> 36 36 #include <linux/ftrace.h> 37 37 #include <linux/ratelimit.h> 38 + #include <linux/kthread.h> 39 + #include <linux/init.h> 40 + 41 + #include <asm/tlb.h> 42 + #include "internal.h" 38 43 39 44 #define CREATE_TRACE_POINTS 40 45 #include <trace/events/oom.h> ··· 410 405 411 406 bool oom_killer_disabled __read_mostly; 412 407 408 + #define K(x) ((x) << (PAGE_SHIFT-10)) 409 + 410 + #ifdef CONFIG_MMU 411 + /* 412 + * OOM Reaper kernel thread which tries to reap the memory used by the OOM 413 + * victim (if that is possible) to help the OOM killer to move on. 414 + */ 415 + static struct task_struct *oom_reaper_th; 416 + static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); 417 + static struct task_struct *oom_reaper_list; 418 + static DEFINE_SPINLOCK(oom_reaper_lock); 419 + 420 + 421 + static bool __oom_reap_task(struct task_struct *tsk) 422 + { 423 + struct mmu_gather tlb; 424 + struct vm_area_struct *vma; 425 + struct mm_struct *mm; 426 + struct task_struct *p; 427 + struct zap_details details = {.check_swap_entries = true, 428 + .ignore_dirty = true}; 429 + bool ret = true; 430 + 431 + /* 432 + * Make sure we find the associated mm_struct even when the particular 433 + * thread has already terminated and cleared its mm. 434 + * We might have race with exit path so consider our work done if there 435 + * is no mm. 436 + */ 437 + p = find_lock_task_mm(tsk); 438 + if (!p) 439 + return true; 440 + 441 + mm = p->mm; 442 + if (!atomic_inc_not_zero(&mm->mm_users)) { 443 + task_unlock(p); 444 + return true; 445 + } 446 + 447 + task_unlock(p); 448 + 449 + if (!down_read_trylock(&mm->mmap_sem)) { 450 + ret = false; 451 + goto out; 452 + } 453 + 454 + tlb_gather_mmu(&tlb, mm, 0, -1); 455 + for (vma = mm->mmap ; vma; vma = vma->vm_next) { 456 + if (is_vm_hugetlb_page(vma)) 457 + continue; 458 + 459 + /* 460 + * mlocked VMAs require explicit munlocking before unmap. 461 + * Let's keep it simple here and skip such VMAs. 
462 + */ 463 + if (vma->vm_flags & VM_LOCKED) 464 + continue; 465 + 466 + /* 467 + * Only anonymous pages have a good chance to be dropped 468 + * without additional steps which we cannot afford as we 469 + * are OOM already. 470 + * 471 + * We do not even care about fs backed pages because all 472 + * which are reclaimable have already been reclaimed and 473 + * we do not want to block exit_mmap by keeping mm ref 474 + * count elevated without a good reason. 475 + */ 476 + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) 477 + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, 478 + &details); 479 + } 480 + tlb_finish_mmu(&tlb, 0, -1); 481 + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 482 + task_pid_nr(tsk), tsk->comm, 483 + K(get_mm_counter(mm, MM_ANONPAGES)), 484 + K(get_mm_counter(mm, MM_FILEPAGES)), 485 + K(get_mm_counter(mm, MM_SHMEMPAGES))); 486 + up_read(&mm->mmap_sem); 487 + 488 + /* 489 + * Clear TIF_MEMDIE because the task shouldn't be sitting on a 490 + * reasonably reclaimable memory anymore. OOM killer can continue 491 + * by selecting other victim if unmapping hasn't led to any 492 + * improvements. This also means that selecting this task doesn't 493 + * make any sense. 
494 + */ 495 + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; 496 + exit_oom_victim(tsk); 497 + out: 498 + mmput(mm); 499 + return ret; 500 + } 501 + 502 + #define MAX_OOM_REAP_RETRIES 10 503 + static void oom_reap_task(struct task_struct *tsk) 504 + { 505 + int attempts = 0; 506 + 507 + /* Retry the down_read_trylock(mmap_sem) a few times */ 508 + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) 509 + schedule_timeout_idle(HZ/10); 510 + 511 + if (attempts > MAX_OOM_REAP_RETRIES) { 512 + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", 513 + task_pid_nr(tsk), tsk->comm); 514 + debug_show_all_locks(); 515 + } 516 + 517 + /* Drop a reference taken by wake_oom_reaper */ 518 + put_task_struct(tsk); 519 + } 520 + 521 + static int oom_reaper(void *unused) 522 + { 523 + set_freezable(); 524 + 525 + while (true) { 526 + struct task_struct *tsk = NULL; 527 + 528 + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); 529 + spin_lock(&oom_reaper_lock); 530 + if (oom_reaper_list != NULL) { 531 + tsk = oom_reaper_list; 532 + oom_reaper_list = tsk->oom_reaper_list; 533 + } 534 + spin_unlock(&oom_reaper_lock); 535 + 536 + if (tsk) 537 + oom_reap_task(tsk); 538 + } 539 + 540 + return 0; 541 + } 542 + 543 + static void wake_oom_reaper(struct task_struct *tsk) 544 + { 545 + if (!oom_reaper_th || tsk->oom_reaper_list) 546 + return; 547 + 548 + get_task_struct(tsk); 549 + 550 + spin_lock(&oom_reaper_lock); 551 + tsk->oom_reaper_list = oom_reaper_list; 552 + oom_reaper_list = tsk; 553 + spin_unlock(&oom_reaper_lock); 554 + wake_up(&oom_reaper_wait); 555 + } 556 + 557 + static int __init oom_init(void) 558 + { 559 + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 560 + if (IS_ERR(oom_reaper_th)) { 561 + pr_err("Unable to start OOM reaper %ld. 
Continuing regardless\n", 562 + PTR_ERR(oom_reaper_th)); 563 + oom_reaper_th = NULL; 564 + } 565 + return 0; 566 + } 567 + subsys_initcall(oom_init) 568 + #else 569 + static void wake_oom_reaper(struct task_struct *tsk) 570 + { 571 + } 572 + #endif 573 + 413 574 /** 414 575 * mark_oom_victim - mark the given task as OOM victim 415 576 * @tsk: task to mark ··· 602 431 /** 603 432 * exit_oom_victim - note the exit of an OOM victim 604 433 */ 605 - void exit_oom_victim(void) 434 + void exit_oom_victim(struct task_struct *tsk) 606 435 { 607 - clear_thread_flag(TIF_MEMDIE); 436 + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) 437 + return; 608 438 609 439 if (!atomic_dec_return(&oom_victims)) 610 440 wake_up_all(&oom_victims_wait); ··· 666 494 return false; 667 495 } 668 496 669 - #define K(x) ((x) << (PAGE_SHIFT-10)) 670 497 /* 671 498 * Must be called while holding a reference to p, which will be released upon 672 499 * returning. ··· 681 510 unsigned int victim_points = 0; 682 511 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 683 512 DEFAULT_RATELIMIT_BURST); 513 + bool can_oom_reap = true; 684 514 685 515 /* 686 516 * If the task is already exiting, don't alarm the sysadmin or kill ··· 772 600 continue; 773 601 if (same_thread_group(p, victim)) 774 602 continue; 775 - if (unlikely(p->flags & PF_KTHREAD)) 603 + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || 604 + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 605 + /* 606 + * We cannot use oom_reaper for the mm shared by this 607 + * process because it wouldn't get killed and so the 608 + * memory might be still used. 
609 + */ 610 + can_oom_reap = false; 776 611 continue; 777 - if (is_global_init(p)) 778 - continue; 779 - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) 780 - continue; 781 - 612 + } 782 613 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 783 614 } 784 615 rcu_read_unlock(); 616 + 617 + if (can_oom_reap) 618 + wake_oom_reaper(victim); 785 619 786 620 mmdrop(mm); 787 621 put_task_struct(victim);
+33 -13
mm/page_alloc.c
··· 692 692 unsigned long combined_idx; 693 693 unsigned long uninitialized_var(buddy_idx); 694 694 struct page *buddy; 695 - unsigned int max_order = MAX_ORDER; 695 + unsigned int max_order; 696 + 697 + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 696 698 697 699 VM_BUG_ON(!zone_is_initialized(zone)); 698 700 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 699 701 700 702 VM_BUG_ON(migratetype == -1); 701 - if (is_migrate_isolate(migratetype)) { 702 - /* 703 - * We restrict max order of merging to prevent merge 704 - * between freepages on isolate pageblock and normal 705 - * pageblock. Without this, pageblock isolation 706 - * could cause incorrect freepage accounting. 707 - */ 708 - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 709 - } else { 703 + if (likely(!is_migrate_isolate(migratetype))) 710 704 __mod_zone_freepage_state(zone, 1 << order, migratetype); 711 - } 712 705 713 - page_idx = pfn & ((1 << max_order) - 1); 706 + page_idx = pfn & ((1 << MAX_ORDER) - 1); 714 707 715 708 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 716 709 VM_BUG_ON_PAGE(bad_range(zone, page), page); 717 710 711 + continue_merging: 718 712 while (order < max_order - 1) { 719 713 buddy_idx = __find_buddy_index(page_idx, order); 720 714 buddy = page + (buddy_idx - page_idx); 721 715 if (!page_is_buddy(page, buddy, order)) 722 - break; 716 + goto done_merging; 723 717 /* 724 718 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 725 719 * merge with it and move up one order. ··· 730 736 page_idx = combined_idx; 731 737 order++; 732 738 } 739 + if (max_order < MAX_ORDER) { 740 + /* If we are here, it means order is >= pageblock_order. 741 + * We want to prevent merge between freepages on isolate 742 + * pageblock and normal pageblock. Without this, pageblock 743 + * isolation could cause incorrect freepage or CMA accounting. 
744 + * 745 + * We don't want to hit this code for the more frequent 746 + * low-order merging. 747 + */ 748 + if (unlikely(has_isolate_pageblock(zone))) { 749 + int buddy_mt; 750 + 751 + buddy_idx = __find_buddy_index(page_idx, order); 752 + buddy = page + (buddy_idx - page_idx); 753 + buddy_mt = get_pageblock_migratetype(buddy); 754 + 755 + if (migratetype != buddy_mt 756 + && (is_migrate_isolate(migratetype) || 757 + is_migrate_isolate(buddy_mt))) 758 + goto done_merging; 759 + } 760 + max_order++; 761 + goto continue_merging; 762 + } 763 + 764 + done_merging: 733 765 set_page_order(page, order); 734 766 735 767 /*
+37 -5
mm/slab.c
··· 2086 2086 } 2087 2087 #endif 2088 2088 2089 + kasan_cache_create(cachep, &size, &flags); 2090 + 2089 2091 size = ALIGN(size, cachep->align); 2090 2092 /* 2091 2093 * We should restrict the number of objects in a slab to implement ··· 2389 2387 * cache which they are a constructor for. Otherwise, deadlock. 2390 2388 * They must also be threaded. 2391 2389 */ 2392 - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2390 + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { 2391 + kasan_unpoison_object_data(cachep, 2392 + objp + obj_offset(cachep)); 2393 2393 cachep->ctor(objp + obj_offset(cachep)); 2394 + kasan_poison_object_data( 2395 + cachep, objp + obj_offset(cachep)); 2396 + } 2394 2397 2395 2398 if (cachep->flags & SLAB_RED_ZONE) { 2396 2399 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) ··· 2416 2409 struct page *page) 2417 2410 { 2418 2411 int i; 2412 + void *objp; 2419 2413 2420 2414 cache_init_objs_debug(cachep, page); 2421 2415 ··· 2427 2419 2428 2420 for (i = 0; i < cachep->num; i++) { 2429 2421 /* constructor could break poison info */ 2430 - if (DEBUG == 0 && cachep->ctor) 2431 - cachep->ctor(index_to_obj(cachep, page, i)); 2422 + if (DEBUG == 0 && cachep->ctor) { 2423 + objp = index_to_obj(cachep, page, i); 2424 + kasan_unpoison_object_data(cachep, objp); 2425 + cachep->ctor(objp); 2426 + kasan_poison_object_data(cachep, objp); 2427 + } 2432 2428 2433 2429 set_free_obj(page, i, i); 2434 2430 } ··· 2562 2550 2563 2551 slab_map_pages(cachep, page, freelist); 2564 2552 2553 + kasan_poison_slab(page); 2565 2554 cache_init_objs(cachep, page); 2566 2555 2567 2556 if (gfpflags_allow_blocking(local_flags)) ··· 3329 3316 { 3330 3317 struct array_cache *ac = cpu_cache_get(cachep); 3331 3318 3319 + kasan_slab_free(cachep, objp); 3320 + 3332 3321 check_irq_off(); 3333 3322 kmemleak_free_recursive(objp, cachep->flags); 3334 3323 objp = cache_free_debugcheck(cachep, objp, caller); ··· 3378 3363 { 3379 3364 void *ret = slab_alloc(cachep, flags, _RET_IP_); 
3380 3365 3366 + kasan_slab_alloc(cachep, ret, flags); 3381 3367 trace_kmem_cache_alloc(_RET_IP_, ret, 3382 3368 cachep->object_size, cachep->size, flags); 3383 3369 ··· 3444 3428 3445 3429 ret = slab_alloc(cachep, flags, _RET_IP_); 3446 3430 3431 + kasan_kmalloc(cachep, ret, size, flags); 3447 3432 trace_kmalloc(_RET_IP_, ret, 3448 3433 size, cachep->size, flags); 3449 3434 return ret; ··· 3468 3451 { 3469 3452 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3470 3453 3454 + kasan_slab_alloc(cachep, ret, flags); 3471 3455 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3472 3456 cachep->object_size, cachep->size, 3473 3457 flags, nodeid); ··· 3487 3469 3488 3470 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3489 3471 3472 + kasan_kmalloc(cachep, ret, size, flags); 3490 3473 trace_kmalloc_node(_RET_IP_, ret, 3491 3474 size, cachep->size, 3492 3475 flags, nodeid); ··· 3500 3481 __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 3501 3482 { 3502 3483 struct kmem_cache *cachep; 3484 + void *ret; 3503 3485 3504 3486 cachep = kmalloc_slab(size, flags); 3505 3487 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3506 3488 return cachep; 3507 - return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3489 + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); 3490 + kasan_kmalloc(cachep, ret, size, flags); 3491 + 3492 + return ret; 3508 3493 } 3509 3494 3510 3495 void *__kmalloc_node(size_t size, gfp_t flags, int node) ··· 3542 3519 return cachep; 3543 3520 ret = slab_alloc(cachep, flags, caller); 3544 3521 3522 + kasan_kmalloc(cachep, ret, size, flags); 3545 3523 trace_kmalloc(caller, ret, 3546 3524 size, cachep->size, flags); 3547 3525 ··· 4314 4290 */ 4315 4291 size_t ksize(const void *objp) 4316 4292 { 4293 + size_t size; 4294 + 4317 4295 BUG_ON(!objp); 4318 4296 if (unlikely(objp == ZERO_SIZE_PTR)) 4319 4297 return 0; 4320 4298 4321 - return virt_to_cache(objp)->object_size; 4299 + size = 
virt_to_cache(objp)->object_size; 4300 + /* We assume that ksize callers could use the whole allocated area, 4301 + * so we need to unpoison this area. 4302 + */ 4303 + kasan_krealloc(objp, size, GFP_NOWAIT); 4304 + 4305 + return size; 4322 4306 } 4323 4307 EXPORT_SYMBOL(ksize);
+1 -1
mm/slab.h
··· 405 405 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 406 406 kmemleak_alloc_recursive(object, s->object_size, 1, 407 407 s->flags, flags); 408 - kasan_slab_alloc(s, object); 408 + kasan_slab_alloc(s, object, flags); 409 409 } 410 410 memcg_kmem_put_cache(s); 411 411 }
+3 -3
mm/slab_common.c
··· 35 35 */ 36 36 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 37 37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 38 - SLAB_FAILSLAB) 38 + SLAB_FAILSLAB | SLAB_KASAN) 39 39 40 40 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ 41 41 SLAB_NOTRACK | SLAB_ACCOUNT) ··· 1013 1013 page = alloc_kmem_pages(flags, order); 1014 1014 ret = page ? page_address(page) : NULL; 1015 1015 kmemleak_alloc(ret, size, 1, flags); 1016 - kasan_kmalloc_large(ret, size); 1016 + kasan_kmalloc_large(ret, size, flags); 1017 1017 return ret; 1018 1018 } 1019 1019 EXPORT_SYMBOL(kmalloc_order); ··· 1192 1192 ks = ksize(p); 1193 1193 1194 1194 if (ks >= new_size) { 1195 - kasan_krealloc((void *)p, new_size); 1195 + kasan_krealloc((void *)p, new_size, flags); 1196 1196 return (void *)p; 1197 1197 } 1198 1198
+8 -7
mm/slub.c
··· 1313 1313 static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1314 1314 { 1315 1315 kmemleak_alloc(ptr, size, 1, flags); 1316 - kasan_kmalloc_large(ptr, size); 1316 + kasan_kmalloc_large(ptr, size, flags); 1317 1317 } 1318 1318 1319 1319 static inline void kfree_hook(const void *x) ··· 2596 2596 { 2597 2597 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2598 2598 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2599 - kasan_kmalloc(s, ret, size); 2599 + kasan_kmalloc(s, ret, size, gfpflags); 2600 2600 return ret; 2601 2601 } 2602 2602 EXPORT_SYMBOL(kmem_cache_alloc_trace); ··· 2624 2624 trace_kmalloc_node(_RET_IP_, ret, 2625 2625 size, s->size, gfpflags, node); 2626 2626 2627 - kasan_kmalloc(s, ret, size); 2627 + kasan_kmalloc(s, ret, size, gfpflags); 2628 2628 return ret; 2629 2629 } 2630 2630 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); ··· 3182 3182 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 3183 3183 init_tracking(kmem_cache_node, n); 3184 3184 #endif 3185 - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); 3185 + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), 3186 + GFP_KERNEL); 3186 3187 init_kmem_cache_node(n); 3187 3188 inc_slabs_node(kmem_cache_node, node, page->objects); 3188 3189 ··· 3562 3561 3563 3562 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3564 3563 3565 - kasan_kmalloc(s, ret, size); 3564 + kasan_kmalloc(s, ret, size, flags); 3566 3565 3567 3566 return ret; 3568 3567 } ··· 3607 3606 3608 3607 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3609 3608 3610 - kasan_kmalloc(s, ret, size); 3609 + kasan_kmalloc(s, ret, size, flags); 3611 3610 3612 3611 return ret; 3613 3612 } ··· 3636 3635 size_t size = __ksize(object); 3637 3636 /* We assume that ksize callers could use whole allocated area, 3638 3637 so we need unpoison this area. 
*/ 3639 - kasan_krealloc(object, size); 3638 + kasan_krealloc(object, size, GFP_NOWAIT); 3640 3639 return size; 3641 3640 } 3642 3641 EXPORT_SYMBOL(ksize);