Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-4.0-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue fix from Tejun Heo:
"One fix patch for a subtle livelock condition which can happen on
PREEMPT_NONE kernels involving two racing cancel_work calls. Whoever
comes in second has to wait for the previous one to finish. This
was implemented by making the later one block for the same condition
that the former would be (work item completion) and then loop and
retest; unfortunately, depending on the wake-up order, the later one
could lock the former one out — preventing it from ever finishing — by
busy-looping on the CPU.

This is fixed by implementing explicit wait mechanism. Work item
might not belong anywhere at this point and there's a remote possibility
of a thundering-herd problem. I originally tried to use bit_waitqueue
but it didn't work for static work items on modules. It's currently
using a single wait queue with a filtering wake-up function and exclusive
wakeup. If this ever becomes a problem, which is not very likely, we
can try to figure out a way to piggy back on bit_waitqueue"

* 'for-4.0-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE

+54 -5
+2 -1
include/linux/workqueue.h
@@ -70,7 +70,8 @@
 	/* data contains off-queue information when !WORK_STRUCT_PWQ */
 	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
 
-	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
+	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
+	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
 
 	/*
 	 * When a work item is off queue, its high bits point to the last
+52 -4
kernel/workqueue.c
@@ -2728,19 +2728,57 @@
 }
 EXPORT_SYMBOL_GPL(flush_work);
 
+struct cwt_wait {
+	wait_queue_t		wait;
+	struct work_struct	*work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+	if (cwait->work != key)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+	static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish.  flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Let's wait for completion using a waitqueue.  As this
+		 * may lead to the thundering herd problem, use a custom
+		 * wake function which matches @work along with exclusive
+		 * wait and wakeup.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			struct cwt_wait cwait;
+
+			init_wait(&cwait.wait);
+			cwait.wait.func = cwt_wakefn;
+			cwait.work = work;
+
+			prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+						  TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(&cancel_waitq, &cwait.wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2749,5 +2787,15 @@
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * waitqueue_active() is visible here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	if (waitqueue_active(&cancel_waitq))
+		__wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
 	return ret;
 }