Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm vdo: add specialized request queueing functionality

This patch adds funnel_queue, a mostly lock-free multi-producer,
single-consumer queue. It also adds the request queue used by the dm-vdo
deduplication index, and the work_queue used by the dm-vdo data store. Both
of these are built on top of funnel queue and are intended to support the
dispatching of many short-running tasks. The work_queue also supports
priorities. Finally, this patch adds vdo_completion, the structure which is
enqueued on work_queues.

Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net>
Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net>
Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev>
Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev>
Co-developed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Co-developed-by: Ken Raeburn <raeburn@redhat.com>
Signed-off-by: Ken Raeburn <raeburn@redhat.com>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

Authored by Matthew Sakai and committed by Mike Snitzer.
d9e894d9 89f9b701

+1635
+140
drivers/md/dm-vdo/completion.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "completion.h"

#include <linux/kernel.h>

#include "logger.h"
#include "permassert.h"

#include "status-codes.h"
#include "types.h"
#include "vio.h"
#include "vdo.h"

/**
 * DOC: vdo completions.
 *
 * Most of vdo's data structures are lock free, each either belonging to a single "zone," or
 * divided into a number of zones whose accesses to the structure do not overlap. During normal
 * operation, at most one thread will be operating in any given zone. Each zone has a
 * vdo_work_queue which holds vdo_completions that are to be run in that zone. A completion may
 * only be enqueued on one queue or operating in a single zone at a time.
 *
 * At each step of a multi-threaded operation, the completion performing the operation is given a
 * callback, error handler, and thread id for the next step. A completion is "run" when it is
 * operating on the correct thread (as specified by its callback_thread_id). If the value of its
 * "result" field is an error (i.e. not VDO_SUCCESS), the function in its "error_handler" will be
 * invoked. If the error_handler is NULL, or there is no error, the function set as its "callback"
 * will be invoked. Generally, a completion will not be run directly, but rather will be
 * "launched." In this case, it will check whether it is operating on the correct thread. If it is,
 * it will run immediately. Otherwise, it will be enqueued on the vdo_work_queue associated with
 * the completion's "callback_thread_id". When it is dequeued, it will be on the correct thread,
 * and will get run. In some cases, the completion should get queued instead of running
 * immediately, even if it is being launched from the correct thread. This is usually in cases
 * where there is a long chain of callbacks, all on the same thread, which could overflow the
 * stack. In such cases, the completion's "requeue" field should be set to true. Doing so will skip
 * the current thread check and simply enqueue the completion.
 *
 * A completion may be "finished," in which case its "complete" field will be set to true before it
 * is next run. It is a bug to attempt to set the result or re-finish a finished completion.
 * Because a completion's fields are not safe to examine from any thread other than the one on
 * which the completion is currently operating, this field is used only to aid in detecting
 * programming errors. It can not be used for cross-thread checking on the status of an operation.
 * A completion must be "reset" before it can be reused after it has been finished. Resetting will
 * also clear any error from the result field.
 **/

/**
 * vdo_initialize_completion() - Zero a completion, record its owning vdo and type, and reset it
 *                               to a clean, runnable state.
 */
void vdo_initialize_completion(struct vdo_completion *completion,
			       struct vdo *vdo,
			       enum vdo_completion_type type)
{
	memset(completion, 0, sizeof(*completion));
	completion->vdo = vdo;
	completion->type = type;
	vdo_reset_completion(completion);
}

/* Log (once) if the completion has already been marked complete; detects misuse. */
static inline void assert_incomplete(struct vdo_completion *completion)
{
	ASSERT_LOG_ONLY(!completion->complete, "completion is not complete");
}

/**
 * vdo_set_completion_result() - Set the result of a completion.
 *
 * Older errors will not be masked.
 */
void vdo_set_completion_result(struct vdo_completion *completion, int result)
{
	assert_incomplete(completion);
	/* Keep the first error recorded; later errors do not overwrite it. */
	if (completion->result == VDO_SUCCESS)
		completion->result = result;
}

/**
 * vdo_launch_completion_with_priority() - Run or enqueue a completion.
 * @priority: The priority at which to enqueue the completion.
 *
 * If called on the correct thread (i.e. the one specified in the completion's callback_thread_id
 * field) and not marked for requeue, the completion will be run immediately. Otherwise, the
 * completion will be enqueued on the specified thread.
 */
void vdo_launch_completion_with_priority(struct vdo_completion *completion,
					 enum vdo_completion_priority priority)
{
	thread_id_t callback_thread = completion->callback_thread_id;

	if (completion->requeue || (callback_thread != vdo_get_callback_thread_id())) {
		vdo_enqueue_completion(completion, priority);
		return;
	}

	vdo_run_completion(completion);
}

/** vdo_finish_completion() - Mark a completion as complete and then launch it. */
void vdo_finish_completion(struct vdo_completion *completion)
{
	assert_incomplete(completion);
	completion->complete = true;
	/* A completion with no callback (e.g. one used only for status) is simply finished. */
	if (completion->callback != NULL)
		vdo_launch_completion(completion);
}

/**
 * vdo_enqueue_completion() - Enqueue a completion on the work queue of its callback thread.
 * @priority: The priority at which the completion should be enqueued.
 *
 * An out-of-range callback_thread_id indicates a serious programming error, so it is treated as
 * fatal (BUG()). Clears the requeue flag so the completion will run when dequeued.
 */
void vdo_enqueue_completion(struct vdo_completion *completion,
			    enum vdo_completion_priority priority)
{
	struct vdo *vdo = completion->vdo;
	thread_id_t thread_id = completion->callback_thread_id;

	if (ASSERT(thread_id < vdo->thread_config.thread_count,
		   "thread_id %u (completion type %d) is less than thread count %u",
		   thread_id, completion->type,
		   vdo->thread_config.thread_count) != UDS_SUCCESS)
		BUG();

	completion->requeue = false;
	completion->priority = priority;
	completion->my_queue = NULL;
	vdo_enqueue_work_queue(vdo->threads[thread_id].queue, completion);
}

/**
 * vdo_requeue_completion_if_needed() - Requeue a completion if not called on the specified thread.
 *
 * Return: True if the completion was requeued; callers may not access the completion in this case.
 */
bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
				      thread_id_t callback_thread_id)
{
	if (vdo_get_callback_thread_id() == callback_thread_id)
		return false;

	completion->callback_thread_id = callback_thread_id;
	vdo_enqueue_completion(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
	return true;
}
+152
drivers/md/dm-vdo/completion.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_COMPLETION_H
#define VDO_COMPLETION_H

#include "permassert.h"

#include "status-codes.h"
#include "types.h"

/**
 * vdo_run_completion() - Run a completion's callback or error handler on the current thread.
 *
 * If the completion carries an error and has an error handler, the error handler is invoked;
 * otherwise the callback is invoked.
 *
 * Context: This function must be called from the correct callback thread.
 */
static inline void vdo_run_completion(struct vdo_completion *completion)
{
	if ((completion->result != VDO_SUCCESS) && (completion->error_handler != NULL)) {
		completion->error_handler(completion);
		return;
	}

	completion->callback(completion);
}

void vdo_set_completion_result(struct vdo_completion *completion, int result);

void vdo_initialize_completion(struct vdo_completion *completion, struct vdo *vdo,
			       enum vdo_completion_type type);

/**
 * vdo_reset_completion() - Reset a completion to a clean state, while keeping the type, vdo and
 *                          parent information.
 */
static inline void vdo_reset_completion(struct vdo_completion *completion)
{
	completion->result = VDO_SUCCESS;
	completion->complete = false;
}

void vdo_launch_completion_with_priority(struct vdo_completion *completion,
					 enum vdo_completion_priority priority);

/**
 * vdo_launch_completion() - Launch a completion with default priority.
 */
static inline void vdo_launch_completion(struct vdo_completion *completion)
{
	vdo_launch_completion_with_priority(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
}

/**
 * vdo_continue_completion() - Continue processing a completion.
 * @result: The current result (will not mask older errors).
 *
 * Continue processing a completion by setting the current result and calling
 * vdo_launch_completion().
 */
static inline void vdo_continue_completion(struct vdo_completion *completion, int result)
{
	vdo_set_completion_result(completion, result);
	vdo_launch_completion(completion);
}

void vdo_finish_completion(struct vdo_completion *completion);

/**
 * vdo_fail_completion() - Set the result of a completion if it does not already have an error,
 *                         then finish it.
 */
static inline void vdo_fail_completion(struct vdo_completion *completion, int result)
{
	vdo_set_completion_result(completion, result);
	vdo_finish_completion(completion);
}

/**
 * vdo_assert_completion_type() - Assert that a completion is of the correct type.
 *
 * Return: VDO_SUCCESS or an error
 */
static inline int vdo_assert_completion_type(struct vdo_completion *completion,
					     enum vdo_completion_type expected)
{
	return ASSERT(expected == completion->type,
		      "completion type should be %u, not %u", expected,
		      completion->type);
}

/**
 * vdo_set_completion_callback() - Set the callback and callback thread for a completion's next
 *                                 step, without launching it.
 */
static inline void vdo_set_completion_callback(struct vdo_completion *completion,
					       vdo_action_fn callback,
					       thread_id_t callback_thread_id)
{
	completion->callback = callback;
	completion->callback_thread_id = callback_thread_id;
}

/**
 * vdo_launch_completion_callback() - Set the callback for a completion and launch it immediately.
 */
static inline void vdo_launch_completion_callback(struct vdo_completion *completion,
						  vdo_action_fn callback,
						  thread_id_t callback_thread_id)
{
	vdo_set_completion_callback(completion, callback, callback_thread_id);
	vdo_launch_completion(completion);
}

/**
 * vdo_prepare_completion() - Prepare a completion for launch.
 *
 * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
 */
static inline void vdo_prepare_completion(struct vdo_completion *completion,
					  vdo_action_fn callback,
					  vdo_action_fn error_handler,
					  thread_id_t callback_thread_id, void *parent)
{
	vdo_reset_completion(completion);
	vdo_set_completion_callback(completion, callback, callback_thread_id);
	completion->error_handler = error_handler;
	completion->parent = parent;
}

/**
 * vdo_prepare_completion_for_requeue() - Prepare a completion for launch ensuring that it will
 *                                        always be requeued.
 *
 * Resets the completion, and then sets its callback, error handler, callback thread, and parent.
 */
static inline void vdo_prepare_completion_for_requeue(struct vdo_completion *completion,
						      vdo_action_fn callback,
						      vdo_action_fn error_handler,
						      thread_id_t callback_thread_id,
						      void *parent)
{
	vdo_prepare_completion(completion, callback, error_handler,
			       callback_thread_id, parent);
	completion->requeue = true;
}

void vdo_enqueue_completion(struct vdo_completion *completion,
			    enum vdo_completion_priority priority);


bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
				      thread_id_t callback_thread_id);

#endif /* VDO_COMPLETION_H */
+59
drivers/md/dm-vdo/cpu.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef UDS_CPU_H
#define UDS_CPU_H

#include <linux/cache.h>

/**
 * uds_prefetch_address() - Minimize cache-miss latency by attempting to move data into a CPU cache
 *                          before it is accessed.
 *
 * @address: the address to fetch (may be invalid)
 * @for_write: must be constant at compile time--false if for reading, true if for writing
 */
static inline void uds_prefetch_address(const void *address, bool for_write)
{
	/*
	 * for_write won't be a constant if we are compiled with optimization turned off, in which
	 * case prefetching really doesn't matter. clang can't figure out that if for_write is a
	 * constant, it can be passed as the second, mandatorily constant argument to prefetch(),
	 * at least currently on llvm 12.
	 */
	if (__builtin_constant_p(for_write)) {
		if (for_write)
			__builtin_prefetch(address, true);
		else
			__builtin_prefetch(address, false);
	}
}

/**
 * uds_prefetch_range() - Minimize cache-miss latency by attempting to move a range of addresses
 *                        into a CPU cache before they are accessed.
 *
 * @start: the starting address to fetch (may be invalid)
 * @size: the number of bytes in the address range
 * @for_write: must be constant at compile time--false if for reading, true if for writing
 */
static inline void uds_prefetch_range(const void *start, unsigned int size,
				      bool for_write)
{
	/*
	 * Count the number of cache lines to fetch, allowing for the address range to span an
	 * extra cache line boundary due to address alignment.
	 */
	const char *address = (const char *) start;
	unsigned int offset = ((uintptr_t) address % L1_CACHE_BYTES);
	unsigned int cache_lines = (1 + ((size + offset) / L1_CACHE_BYTES));

	while (cache_lines-- > 0) {
		uds_prefetch_address(address, for_write);
		address += L1_CACHE_BYTES;
	}
}

#endif /* UDS_CPU_H */
+171
drivers/md/dm-vdo/funnel-queue.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "funnel-queue.h"

#include "cpu.h"
#include "memory-alloc.h"
#include "permassert.h"
#include "uds.h"

/* Allocate and initialize a funnel queue containing only the stub entry. */
int uds_make_funnel_queue(struct funnel_queue **queue_ptr)
{
	int result;
	struct funnel_queue *queue;

	result = uds_allocate(1, struct funnel_queue, "funnel queue", &queue);
	if (result != UDS_SUCCESS)
		return result;

	/*
	 * Initialize the stub entry and put it in the queue, establishing the invariant that
	 * queue->newest and queue->oldest are never null.
	 */
	queue->stub.next = NULL;
	queue->newest = &queue->stub;
	queue->oldest = &queue->stub;

	*queue_ptr = queue;
	return UDS_SUCCESS;
}

/* Free a funnel queue. Entries themselves are owned by the callers and are not freed here. */
void uds_free_funnel_queue(struct funnel_queue *queue)
{
	uds_free(queue);
}

/*
 * Find the oldest dequeueable entry, or NULL if the queue is (or appears) empty. May mutate
 * queue->oldest to skip past the stub; consumer-thread only.
 */
static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue)
{
	/*
	 * Barrier requirements: We need a read barrier between reading a "next" field pointer
	 * value and reading anything it points to. There's an accompanying barrier in
	 * uds_funnel_queue_put() between its caller setting up the entry and making it visible.
	 */
	struct funnel_queue_entry *oldest = queue->oldest;
	struct funnel_queue_entry *next = READ_ONCE(oldest->next);

	if (oldest == &queue->stub) {
		/*
		 * When the oldest entry is the stub and it has no successor, the queue is
		 * logically empty.
		 */
		if (next == NULL)
			return NULL;
		/*
		 * The stub entry has a successor, so the stub can be dequeued and ignored without
		 * breaking the queue invariants.
		 */
		oldest = next;
		queue->oldest = oldest;
		next = READ_ONCE(oldest->next);
	}

	/*
	 * We have a non-stub candidate to dequeue. If it lacks a successor, we'll need to put the
	 * stub entry back on the queue first.
	 */
	if (next == NULL) {
		struct funnel_queue_entry *newest = READ_ONCE(queue->newest);

		if (oldest != newest) {
			/*
			 * Another thread has already swung queue->newest atomically, but not yet
			 * assigned previous->next. The queue is really still empty.
			 */
			return NULL;
		}

		/*
		 * Put the stub entry back on the queue, ensuring a successor will eventually be
		 * seen.
		 */
		uds_funnel_queue_put(queue, &queue->stub);

		/* Check again for a successor. */
		next = READ_ONCE(oldest->next);
		if (next == NULL) {
			/*
			 * We lost a race with a producer who swapped queue->newest before we did,
			 * but who hasn't yet updated previous->next. Try again later.
			 */
			return NULL;
		}
	}

	return oldest;
}

/*
 * Poll a queue, removing the oldest entry if the queue is not empty. This function must only be
 * called from a single consumer thread.
 */
struct funnel_queue_entry *uds_funnel_queue_poll(struct funnel_queue *queue)
{
	struct funnel_queue_entry *oldest = get_oldest(queue);

	if (oldest == NULL)
		return oldest;

	/*
	 * Dequeue the oldest entry and return it. Only one consumer thread may call this function,
	 * so no locking, atomic operations, or fences are needed; queue->oldest is owned by the
	 * consumer and oldest->next is never used by a producer thread after it is swung from NULL
	 * to non-NULL.
	 */
	queue->oldest = READ_ONCE(oldest->next);
	/*
	 * Make sure the caller sees the proper stored data for this entry. Since we've already
	 * fetched the entry pointer we stored in "queue->oldest", this also ensures that on entry
	 * to the next call we'll properly see the dependent data.
	 */
	smp_rmb();
	/*
	 * If "oldest" is a very light-weight work item, we'll be looking for the next one very
	 * soon, so prefetch it now.
	 */
	uds_prefetch_address(queue->oldest, true);
	WRITE_ONCE(oldest->next, NULL);
	return oldest;
}

/*
 * Check whether the funnel queue is empty or not. If the queue is in a transition state with one
 * or more entries being added such that the list view is incomplete, this function will report the
 * queue as empty.
 */
bool uds_is_funnel_queue_empty(struct funnel_queue *queue)
{
	return get_oldest(queue) == NULL;
}

/*
 * Check whether the funnel queue is idle or not. If the queue has entries available to be
 * retrieved, it is not idle. If the queue is in a transition state with one or more entries being
 * added such that the list view is incomplete, it may not be possible to retrieve an entry with
 * the uds_funnel_queue_poll() function, but the queue will not be considered idle.
 */
bool uds_is_funnel_queue_idle(struct funnel_queue *queue)
{
	/*
	 * Oldest is not the stub, so there's another entry, though if next is NULL we can't
	 * retrieve it yet.
	 */
	if (queue->oldest != &queue->stub)
		return false;

	/*
	 * Oldest is the stub, but newest has been updated by _put(); either there's another,
	 * retrievable entry in the list, or the list is officially empty but in the intermediate
	 * state of having an entry added.
	 *
	 * Whether anything is retrievable depends on whether stub.next has been updated and become
	 * visible to us, but for idleness we don't care. And due to memory ordering in _put(), the
	 * update to newest would be visible to us at the same time or sooner.
	 */
	if (READ_ONCE(queue->newest) != &queue->stub)
		return false;

	return true;
}
+110
drivers/md/dm-vdo/funnel-queue.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef UDS_FUNNEL_QUEUE_H
#define UDS_FUNNEL_QUEUE_H

#include <linux/atomic.h>
#include <linux/cache.h>

/*
 * A funnel queue is a simple (almost) lock-free queue that accepts entries from multiple threads
 * (multi-producer) and delivers them to a single thread (single-consumer). "Funnel" is an attempt
 * to evoke the image of requests from more than one producer being "funneled down" to a single
 * consumer.
 *
 * This is an unsynchronized but thread-safe data structure when used as intended. There is no
 * mechanism to ensure that only one thread is consuming from the queue. If more than one thread
 * attempts to consume from the queue, the resulting behavior is undefined. Clients must not
 * directly access or manipulate the internals of the queue, which are only exposed for the purpose
 * of allowing the very simple enqueue operation to be inlined.
 *
 * The implementation requires that a funnel_queue_entry structure (a link pointer) is embedded in
 * the queue entries, and pointers to those structures are used exclusively by the queue. No macros
 * are defined to template the queue, so the offset of the funnel_queue_entry in the records placed
 * in the queue must all be the same so the client can derive their structure pointer from the
 * entry pointer returned by uds_funnel_queue_poll().
 *
 * Callers are wholly responsible for allocating and freeing the entries. Entries may be freed as
 * soon as they are returned since this queue is not susceptible to the "ABA problem" present in
 * many lock-free data structures. The queue is dynamically allocated to ensure cache-line
 * alignment, but no other dynamic allocation is used.
 *
 * The algorithm is not actually 100% lock-free. There is a single point in uds_funnel_queue_put()
 * at which a preempted producer will prevent the consumers from seeing items added to the queue by
 * later producers, and only if the queue is short enough or the consumer fast enough for it to
 * reach what was the end of the queue at the time of the preemption.
 *
 * The consumer function, uds_funnel_queue_poll(), will return NULL when the queue is empty. To
 * wait for data to consume, spin (if safe) or combine the queue with a struct event_count to
 * signal the presence of new entries.
 */

/* This queue link structure must be embedded in client entries. */
struct funnel_queue_entry {
	/* The next (newer) entry in the queue. */
	struct funnel_queue_entry *next;
};

/*
 * The dynamically allocated queue structure, which is allocated on a cache line boundary so the
 * producer and consumer fields in the structure will land on separate cache lines. This should be
 * considered opaque but it is exposed here so uds_funnel_queue_put() can be inlined.
 */
struct __aligned(L1_CACHE_BYTES) funnel_queue {
	/*
	 * The producers' end of the queue, an atomically exchanged pointer that will never be
	 * NULL.
	 */
	struct funnel_queue_entry *newest;

	/* The consumer's end of the queue, which is owned by the consumer and never NULL. */
	struct funnel_queue_entry *oldest __aligned(L1_CACHE_BYTES);

	/* A dummy entry used to provide the non-NULL invariants above. */
	struct funnel_queue_entry stub;
};

int __must_check uds_make_funnel_queue(struct funnel_queue **queue_ptr);

void uds_free_funnel_queue(struct funnel_queue *queue);

/*
 * Put an entry on the end of the queue.
 *
 * The entry pointer must be to the struct funnel_queue_entry embedded in the caller's data
 * structure. The caller must be able to derive the address of the start of their data structure
 * from the pointer that is passed in here, so every entry in the queue must have the struct
 * funnel_queue_entry at the same offset within the client's structure.
 */
static inline void uds_funnel_queue_put(struct funnel_queue *queue,
					struct funnel_queue_entry *entry)
{
	struct funnel_queue_entry *previous;

	/*
	 * Barrier requirements: All stores relating to the entry ("next" pointer, containing data
	 * structure fields) must happen before the previous->next store making it visible to the
	 * consumer. Also, the entry's "next" field initialization to NULL must happen before any
	 * other producer threads can see the entry (the xchg) and try to update the "next" field.
	 *
	 * xchg implements a full barrier.
	 */
	WRITE_ONCE(entry->next, NULL);
	previous = xchg(&queue->newest, entry);
	/*
	 * Preemptions between these two statements hide the rest of the queue from the consumer,
	 * preventing consumption until the following assignment runs.
	 */
	WRITE_ONCE(previous->next, entry);
}

struct funnel_queue_entry *__must_check uds_funnel_queue_poll(struct funnel_queue *queue);

bool __must_check uds_is_funnel_queue_empty(struct funnel_queue *queue);

bool __must_check uds_is_funnel_queue_idle(struct funnel_queue *queue);

#endif /* UDS_FUNNEL_QUEUE_H */
+283
drivers/md/dm-vdo/funnel-requestqueue.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "funnel-requestqueue.h"

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/wait.h>

#include "funnel-queue.h"
#include "logger.h"
#include "memory-alloc.h"
#include "uds-threads.h"

/*
 * This queue will attempt to handle requests in reasonably sized batches instead of reacting
 * immediately to each new request. The wait time between batches is dynamically adjusted up or
 * down to try to balance responsiveness against wasted thread run time.
 *
 * If the wait time becomes long enough, the queue will become dormant and must be explicitly
 * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel
 * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a
 * wakeup of the worker thread.
 *
 * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to
 * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before
 * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or
 * more memory barriers have been issued. (Preparing to sleep uses spin locks.) Even if the funnel
 * queue's "next" field update isn't visible yet to make the entry accessible, its existence will
 * kick the worker thread out of dormant mode and back into timer-based mode.
 *
 * Unbatched requests are used to communicate between different zone threads and will also cause
 * the queue to awaken immediately.
 */

enum {
	NANOSECOND = 1,
	MICROSECOND = 1000 * NANOSECOND,
	MILLISECOND = 1000 * MICROSECOND,
	DEFAULT_WAIT_TIME = 20 * MICROSECOND,
	MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
	MAXIMUM_WAIT_TIME = MILLISECOND,
	MINIMUM_BATCH = 32,
	MAXIMUM_BATCH = 64,
};

struct uds_request_queue {
	/* Wait queue for synchronizing producers and consumer */
	struct wait_queue_head wait_head;
	/* Function to process a request */
	uds_request_queue_processor_fn processor;
	/* Queue of new incoming requests */
	struct funnel_queue *main_queue;
	/* Queue of old requests to retry */
	struct funnel_queue *retry_queue;
	/* The thread id of the worker thread */
	struct thread *thread;
	/* True if the worker was started */
	bool started;
	/* When true, requests can be enqueued */
	bool running;
	/* A flag set when the worker is waiting without a timeout */
	atomic_t dormant;
};

/*
 * Take the next request, preferring the retry queue over the main queue. Returns NULL if both
 * queues are empty. Consumer-thread only.
 */
static inline struct uds_request *poll_queues(struct uds_request_queue *queue)
{
	struct funnel_queue_entry *entry;

	entry = uds_funnel_queue_poll(queue->retry_queue);
	if (entry != NULL)
		return container_of(entry, struct uds_request, queue_link);

	entry = uds_funnel_queue_poll(queue->main_queue);
	if (entry != NULL)
		return container_of(entry, struct uds_request, queue_link);

	return NULL;
}

/* Check whether both underlying funnel queues are idle (see uds_is_funnel_queue_idle()). */
static inline bool are_queues_idle(struct uds_request_queue *queue)
{
	return uds_is_funnel_queue_idle(queue->retry_queue) &&
	       uds_is_funnel_queue_idle(queue->main_queue);
}

/*
 * Determine if there is a next request to process, and return it if there is. Also return flags
 * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether
 * the thread did sleep before returning a new request.
 */
static inline bool dequeue_request(struct uds_request_queue *queue,
				   struct uds_request **request_ptr, bool *waited_ptr)
{
	struct uds_request *request = poll_queues(queue);

	if (request != NULL) {
		*request_ptr = request;
		return true;
	}

	if (!READ_ONCE(queue->running)) {
		/* Wake the worker thread so it can exit. */
		*request_ptr = NULL;
		return true;
	}

	*request_ptr = NULL;
	*waited_ptr = true;
	return false;
}

/*
 * Block until a request is available or the queue is shutting down. Dormant mode waits without a
 * timeout (also woken by any enqueue making the queues non-idle); otherwise a high-resolution
 * timeout bounds the wait.
 */
static void wait_for_request(struct uds_request_queue *queue, bool dormant,
			     unsigned long timeout, struct uds_request **request,
			     bool *waited)
{
	if (dormant) {
		wait_event_interruptible(queue->wait_head,
					 (dequeue_request(queue, request, waited) ||
					  !are_queues_idle(queue)));
		return;
	}

	wait_event_interruptible_hrtimeout(queue->wait_head,
					   dequeue_request(queue, request, waited),
					   ns_to_ktime(timeout));
}

/*
 * The worker thread body: process requests, adapting the inter-batch wait time to the observed
 * batch size, and drain any remaining requests on shutdown.
 */
static void request_queue_worker(void *arg)
{
	struct uds_request_queue *queue = arg;
	struct uds_request *request = NULL;
	unsigned long time_batch = DEFAULT_WAIT_TIME;
	bool dormant = atomic_read(&queue->dormant);
	bool waited = false;
	long current_batch = 0;

	for (;;) {
		wait_for_request(queue, dormant, time_batch, &request, &waited);
		if (likely(request != NULL)) {
			current_batch++;
			queue->processor(request);
		} else if (!READ_ONCE(queue->running)) {
			break;
		}

		if (dormant) {
			/*
			 * The queue has been roused from dormancy. Clear the flag so enqueuers can
			 * stop broadcasting. No fence is needed for this transition.
			 */
			atomic_set(&queue->dormant, false);
			dormant = false;
			time_batch = DEFAULT_WAIT_TIME;
		} else if (waited) {
			/*
			 * We waited for this request to show up. Adjust the wait time to smooth
			 * out the batch size.
			 */
			if (current_batch < MINIMUM_BATCH) {
				/*
				 * If the last batch of requests was too small, increase the wait
				 * time.
				 */
				time_batch += time_batch / 4;
				if (time_batch >= MAXIMUM_WAIT_TIME) {
					atomic_set(&queue->dormant, true);
					dormant = true;
				}
			} else if (current_batch > MAXIMUM_BATCH) {
				/*
				 * If the last batch of requests was too large, decrease the wait
				 * time.
				 */
				time_batch -= time_batch / 4;
				if (time_batch < MINIMUM_WAIT_TIME)
					time_batch = MINIMUM_WAIT_TIME;
			}
			current_batch = 0;
		}
	}

	/*
	 * Ensure that we process any remaining requests that were enqueued before trying to shut
	 * down. The corresponding write barrier is in uds_request_queue_finish().
	 */
	smp_rmb();
	while ((request = poll_queues(queue)) != NULL)
		queue->processor(request);
}

/*
 * Allocate a request queue, its two funnel queues, and start its worker thread. On any failure
 * the partially built queue is torn down via uds_request_queue_finish().
 */
int uds_make_request_queue(const char *queue_name,
			   uds_request_queue_processor_fn processor,
			   struct uds_request_queue **queue_ptr)
{
	int result;
	struct uds_request_queue *queue;

	result = uds_allocate(1, struct uds_request_queue, __func__, &queue);
	if (result != UDS_SUCCESS)
		return result;

	queue->processor = processor;
	queue->running = true;
	atomic_set(&queue->dormant, false);
	init_waitqueue_head(&queue->wait_head);

	result = uds_make_funnel_queue(&queue->main_queue);
	if (result != UDS_SUCCESS) {
		uds_request_queue_finish(queue);
		return result;
	}

	result = uds_make_funnel_queue(&queue->retry_queue);
	if (result != UDS_SUCCESS) {
		uds_request_queue_finish(queue);
		return result;
	}

	result = uds_create_thread(request_queue_worker, queue, queue_name,
				   &queue->thread);
	if (result != UDS_SUCCESS) {
		uds_request_queue_finish(queue);
		return result;
	}

	queue->started = true;
	*queue_ptr = queue;
	return UDS_SUCCESS;
}

/* Wake the worker only if it is actually sleeping on the wait queue. */
static inline void wake_up_worker(struct uds_request_queue *queue)
{
	if (wq_has_sleeper(&queue->wait_head))
		wake_up(&queue->wait_head);
}

/*
 * Enqueue a request on the retry queue (if previously requeued) or the main queue, waking the
 * worker when it is dormant or the request is unbatched.
 */
void uds_request_queue_enqueue(struct uds_request_queue *queue,
			       struct uds_request *request)
{
	struct funnel_queue *sub_queue;
	bool unbatched = request->unbatched;

	sub_queue = request->requeued ? queue->retry_queue : queue->main_queue;
	uds_funnel_queue_put(sub_queue, &request->queue_link);

	/*
	 * We must wake the worker thread when it is dormant. A read fence isn't needed here since
	 * we know the queue operation acts as one.
	 */
	if (atomic_read(&queue->dormant) || unbatched)
		wake_up_worker(queue);
}

/*
 * Shut down a request queue: stop the worker (which drains remaining requests first), then free
 * the funnel queues and the queue itself. Safe to call on a partially constructed queue or NULL.
 */
void uds_request_queue_finish(struct uds_request_queue *queue)
{
	int result;

	if (queue == NULL)
		return;

	/*
	 * This memory barrier ensures that any requests we queued will be seen. The point is that
	 * when dequeue_request() sees the following update to the running flag, it will also be
	 * able to see any change we made to a next field in the funnel queue entry. The
	 * corresponding read barrier is in request_queue_worker().
	 */
	smp_wmb();
	WRITE_ONCE(queue->running, false);

	if (queue->started) {
		wake_up_worker(queue);
		result = uds_join_threads(queue->thread);
		if (result != UDS_SUCCESS)
			uds_log_warning_strerror(result, "Failed to join worker thread");
	}

	uds_free_funnel_queue(queue->main_queue);
	uds_free_funnel_queue(queue->retry_queue);
	uds_free(queue);
}
+31
drivers/md/dm-vdo/funnel-requestqueue.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef UDS_REQUEST_QUEUE_H
#define UDS_REQUEST_QUEUE_H

#include "uds.h"

/*
 * A simple request queue which will handle new requests in the order in which they are received,
 * and will attempt to handle requeued requests before new ones. However, the nature of the
 * implementation means that it cannot guarantee this ordering; the prioritization is merely a
 * hint.
 */

struct uds_request_queue;

/* Callback invoked by the queue's single worker thread for each dequeued request. */
typedef void (*uds_request_queue_processor_fn)(struct uds_request *);

/* Create a queue and start its worker thread; *queue_ptr is set only on success. */
int __must_check uds_make_request_queue(const char *queue_name,
					uds_request_queue_processor_fn processor,
					struct uds_request_queue **queue_ptr);

/* Submit a request; requeued requests are given (heuristic) priority over new ones. */
void uds_request_queue_enqueue(struct uds_request_queue *queue,
			       struct uds_request *request);

/* Drain remaining requests, stop the worker thread, and free the queue; NULL is a no-op. */
void uds_request_queue_finish(struct uds_request_queue *queue);

#endif /* UDS_REQUEST_QUEUE_H */
+638
drivers/md/dm-vdo/funnel-workqueue.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "funnel-workqueue.h"

#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/percpu.h>

#include "funnel-queue.h"
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"

#include "completion.h"
#include "status-codes.h"

/* Per-CPU counter used to spread round-robin enqueues across service queues. */
static DEFINE_PER_CPU(unsigned int, service_queue_rotor);

/**
 * DOC: Work queue definition.
 *
 * There are two types of work queues: simple, with one worker thread, and round-robin, which uses
 * a group of the former to do the work, and assigns work to them in round-robin fashion (roughly).
 * Externally, both are represented via the same common sub-structure, though there's actually not
 * a great deal of overlap between the two types internally.
 */
struct vdo_work_queue {
	/* Name of just the work queue (e.g., "cpuQ12") */
	char *name;
	/* True for the round-robin wrapper, false for a simple single-thread queue */
	bool round_robin_mode;
	struct vdo_thread *owner;
	/* Life cycle functions, etc */
	const struct vdo_work_queue_type *type;
};

struct simple_work_queue {
	struct vdo_work_queue common;
	/* One lock-free funnel queue per priority level */
	struct funnel_queue *priority_lists[VDO_WORK_Q_MAX_PRIORITY + 1];
	void *private;

	/*
	 * The fields above are unchanged after setup but often read, and are good candidates for
	 * caching -- and if the max priority is 2, just fit in one x86-64 cache line if aligned.
	 * The fields below are often modified as we sleep and wake, so we want a separate cache
	 * line for performance.
	 */

	/* Any (0 or 1) worker threads waiting for new work to do */
	wait_queue_head_t waiting_worker_threads ____cacheline_aligned;
	/* Hack to reduce wakeup calls if the worker thread is running */
	atomic_t idle;

	/* These are infrequently used so in terms of performance we don't care where they land. */
	struct task_struct *thread;
	/* Notify creator once worker has initialized */
	struct completion *started;
};

struct round_robin_work_queue {
	struct vdo_work_queue common;
	struct simple_work_queue **service_queues;
	unsigned int num_service_queues;
};

/* Downcast a generic queue to its simple form; NULL passes through. */
static inline struct simple_work_queue *as_simple_work_queue(struct vdo_work_queue *queue)
{
	return ((queue == NULL) ?
		NULL : container_of(queue, struct simple_work_queue, common));
}

/* Downcast a generic queue to its round-robin form; NULL passes through. */
static inline struct round_robin_work_queue *as_round_robin_work_queue(struct vdo_work_queue *queue)
{
	return ((queue == NULL) ?
		NULL :
		container_of(queue, struct round_robin_work_queue, common));
}

/* Processing normal completions. */

/*
 * Dequeue and return the next waiting completion, if any.
 *
 * We scan the funnel queues from highest priority to lowest, once; there is therefore a race
 * condition where a high-priority completion can be enqueued followed by a lower-priority one, and
 * we'll grab the latter (but we'll catch the high-priority item on the next call). If strict
 * enforcement of priorities becomes necessary, this function will need fixing.
 */
static struct vdo_completion *poll_for_completion(struct simple_work_queue *queue)
{
	int i;

	for (i = queue->common.type->max_priority; i >= 0; i--) {
		struct funnel_queue_entry *link = uds_funnel_queue_poll(queue->priority_lists[i]);

		if (link != NULL)
			return container_of(link, struct vdo_completion, work_queue_entry_link);
	}

	return NULL;
}

/*
 * Add a completion to the appropriate priority list of a simple queue and wake the worker if it
 * may be asleep. A default or out-of-range priority is clamped into the queue's valid range.
 */
static void enqueue_work_queue_completion(struct simple_work_queue *queue,
					  struct vdo_completion *completion)
{
	ASSERT_LOG_ONLY(completion->my_queue == NULL,
			"completion %px (fn %px) to enqueue (%px) is not already queued (%px)",
			completion, completion->callback, queue, completion->my_queue);
	if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY)
		completion->priority = queue->common.type->default_priority;

	if (ASSERT(completion->priority <= queue->common.type->max_priority,
		   "priority is in range for queue") != VDO_SUCCESS)
		completion->priority = 0;

	completion->my_queue = &queue->common;

	/* Funnel queue handles the synchronization for the put. */
	uds_funnel_queue_put(queue->priority_lists[completion->priority],
			     &completion->work_queue_entry_link);

	/*
	 * Due to how funnel queue synchronization is handled (just atomic operations), the
	 * simplest safe implementation here would be to wake-up any waiting threads after
	 * enqueueing each item. Even if the funnel queue is not empty at the time of adding an
	 * item to the queue, the consumer thread may not see this since it is not guaranteed to
	 * have the same view of the queue as a producer thread.
	 *
	 * However, the above is wasteful so instead we attempt to minimize the number of thread
	 * wakeups. Using an idle flag, and careful ordering using memory barriers, we should be
	 * able to determine when the worker thread might be asleep or going to sleep. We use
	 * cmpxchg to try to take ownership (vs other producer threads) of the responsibility for
	 * waking the worker thread, so multiple wakeups aren't tried at once.
	 *
	 * This was tuned for some x86 boxes that were handy; it's untested whether doing the read
	 * first is any better or worse for other platforms, even other x86 configurations.
	 */
	smp_mb();
	if ((atomic_read(&queue->idle) != 1) || (atomic_cmpxchg(&queue->idle, 1, 0) != 1))
		return;

	/* There's a maximum of one thread in this list. */
	wake_up(&queue->waiting_worker_threads);
}

/* Invoke the queue type's start hook, if any, on the worker thread. */
static void run_start_hook(struct simple_work_queue *queue)
{
	if (queue->common.type->start != NULL)
		queue->common.type->start(queue->private);
}

/* Invoke the queue type's finish hook, if any, on the worker thread. */
static void run_finish_hook(struct simple_work_queue *queue)
{
	if (queue->common.type->finish != NULL)
		queue->common.type->finish(queue->private);
}

/*
 * Wait for the next completion to process, or until kthread_should_stop indicates that it's time
 * for us to shut down.
 *
 * If kthread_should_stop says it's time to stop but we have pending completions return a
 * completion.
 *
 * Also update statistics relating to scheduler interactions.
 */
static struct vdo_completion *wait_for_next_completion(struct simple_work_queue *queue)
{
	struct vdo_completion *completion;
	DEFINE_WAIT(wait);

	while (true) {
		prepare_to_wait(&queue->waiting_worker_threads, &wait,
				TASK_INTERRUPTIBLE);
		/*
		 * Don't set the idle flag until a wakeup will not be lost.
		 *
		 * Force synchronization between setting the idle flag and checking the funnel
		 * queue; the producer side will do them in the reverse order. (There's still a
		 * race condition we've chosen to allow, because we've got a timeout below that
		 * unwedges us if we hit it, but this may narrow the window a little.)
		 */
		atomic_set(&queue->idle, 1);
		smp_mb(); /* store-load barrier between "idle" and funnel queue */

		completion = poll_for_completion(queue);
		if (completion != NULL)
			break;

		/*
		 * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state up
		 * above. Otherwise, schedule() will put the thread to sleep and might miss a
		 * wakeup from kthread_stop() call in vdo_finish_work_queue().
		 */
		if (kthread_should_stop())
			break;

		schedule();

		/*
		 * Most of the time when we wake, it should be because there's work to do. If it
		 * was a spurious wakeup, continue looping.
		 */
		completion = poll_for_completion(queue);
		if (completion != NULL)
			break;
	}

	finish_wait(&queue->waiting_worker_threads, &wait);
	atomic_set(&queue->idle, 0);

	return completion;
}

/* Detach a completion from this queue and run its callback (or error handler). */
static void process_completion(struct simple_work_queue *queue,
			       struct vdo_completion *completion)
{
	if (ASSERT(completion->my_queue == &queue->common,
		   "completion %px from queue %px marked as being in this queue (%px)",
		   completion, queue, completion->my_queue) == UDS_SUCCESS)
		completion->my_queue = NULL;

	vdo_run_completion(completion);
}

/* Body of the worker thread: run hooks, then process completions until told to stop. */
static void service_work_queue(struct simple_work_queue *queue)
{
	run_start_hook(queue);

	while (true) {
		struct vdo_completion *completion = poll_for_completion(queue);

		if (completion == NULL)
			completion = wait_for_next_completion(queue);

		if (completion == NULL) {
			/* No completions but kthread_should_stop() was triggered. */
			break;
		}

		process_completion(queue, completion);

		/*
		 * Be friendly to a CPU that has other work to do, if the kernel has told us to.
		 * This speeds up some performance tests; that "other work" might include other VDO
		 * threads.
		 */
		if (need_resched())
			cond_resched();
	}

	run_finish_hook(queue);
}

/* kthread entry point; signals the creator once running so setup can proceed safely. */
static int work_queue_runner(void *ptr)
{
	struct simple_work_queue *queue = ptr;

	complete(queue->started);
	service_work_queue(queue);
	return 0;
}

/* Creation & teardown */

/* Free a simple queue and all of its priority lists; assumes the worker has stopped. */
static void free_simple_work_queue(struct simple_work_queue *queue)
{
	unsigned int i;

	for (i = 0; i <= VDO_WORK_Q_MAX_PRIORITY; i++)
		uds_free_funnel_queue(queue->priority_lists[i]);
	uds_free(queue->common.name);
	uds_free(queue);
}

/* Free a round-robin queue and all of its subordinate simple queues. */
static void free_round_robin_work_queue(struct round_robin_work_queue *queue)
{
	struct simple_work_queue **queue_table = queue->service_queues;
	unsigned int count = queue->num_service_queues;
	unsigned int i;

	queue->service_queues = NULL;

	for (i = 0; i < count; i++)
		free_simple_work_queue(queue_table[i]);
	uds_free(queue_table);
	uds_free(queue->common.name);
	uds_free(queue);
}

/* Stop the queue's worker thread(s) and free the queue; NULL is a no-op. */
void vdo_free_work_queue(struct vdo_work_queue *queue)
{
	if (queue == NULL)
		return;

	vdo_finish_work_queue(queue);

	if (queue->round_robin_mode)
		free_round_robin_work_queue(as_round_robin_work_queue(queue));
	else
		free_simple_work_queue(as_simple_work_queue(queue));
}

/*
 * Create a single-threaded queue and start its worker. Does not return until the worker thread
 * is known to be running VDO code (see comment below).
 */
static int make_simple_work_queue(const char *thread_name_prefix, const char *name,
				  struct vdo_thread *owner, void *private,
				  const struct vdo_work_queue_type *type,
				  struct simple_work_queue **queue_ptr)
{
	DECLARE_COMPLETION_ONSTACK(started);
	struct simple_work_queue *queue;
	int i;
	struct task_struct *thread = NULL;
	int result;

	ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY),
			"queue priority count %u within limit %u", type->max_priority,
			VDO_WORK_Q_MAX_PRIORITY);

	result = uds_allocate(1, struct simple_work_queue, "simple work queue", &queue);
	if (result != UDS_SUCCESS)
		return result;

	queue->private = private;
	queue->started = &started;
	queue->common.type = type;
	queue->common.owner = owner;
	init_waitqueue_head(&queue->waiting_worker_threads);

	result = uds_duplicate_string(name, "queue name", &queue->common.name);
	if (result != VDO_SUCCESS) {
		uds_free(queue);
		return -ENOMEM;
	}

	for (i = 0; i <= type->max_priority; i++) {
		result = uds_make_funnel_queue(&queue->priority_lists[i]);
		if (result != UDS_SUCCESS) {
			free_simple_work_queue(queue);
			return result;
		}
	}

	thread = kthread_run(work_queue_runner, queue, "%s:%s", thread_name_prefix,
			     queue->common.name);
	if (IS_ERR(thread)) {
		free_simple_work_queue(queue);
		return (int) PTR_ERR(thread);
	}

	queue->thread = thread;

	/*
	 * If we don't wait to ensure the thread is running VDO code, a quick kthread_stop (due to
	 * errors elsewhere) could cause it to never get as far as running VDO, skipping the
	 * cleanup code.
	 *
	 * Eventually we should just make that path safe too, and then we won't need this
	 * synchronization.
	 */
	wait_for_completion(&started);

	*queue_ptr = queue;
	return UDS_SUCCESS;
}

/**
 * vdo_make_work_queue() - Create a work queue; if multiple threads are requested, completions will
 *                         be distributed to them in round-robin fashion.
 *
 * Each queue is associated with a struct vdo_thread which has a single vdo thread id. Regardless
 * of the actual number of queues and threads allocated here, code outside of the queue
 * implementation will treat this as a single zone.
 */
int vdo_make_work_queue(const char *thread_name_prefix, const char *name,
			struct vdo_thread *owner, const struct vdo_work_queue_type *type,
			unsigned int thread_count, void *thread_privates[],
			struct vdo_work_queue **queue_ptr)
{
	struct round_robin_work_queue *queue;
	int result;
	char thread_name[TASK_COMM_LEN];
	unsigned int i;

	if (thread_count == 1) {
		struct simple_work_queue *simple_queue;
		void *context = ((thread_privates != NULL) ? thread_privates[0] : NULL);

		result = make_simple_work_queue(thread_name_prefix, name, owner, context,
						type, &simple_queue);
		if (result == VDO_SUCCESS)
			*queue_ptr = &simple_queue->common;
		return result;
	}

	result = uds_allocate(1, struct round_robin_work_queue, "round-robin work queue",
			      &queue);
	if (result != UDS_SUCCESS)
		return result;

	result = uds_allocate(thread_count, struct simple_work_queue *,
			      "subordinate work queues", &queue->service_queues);
	if (result != UDS_SUCCESS) {
		uds_free(queue);
		return result;
	}

	queue->num_service_queues = thread_count;
	queue->common.round_robin_mode = true;
	queue->common.owner = owner;

	result = uds_duplicate_string(name, "queue name", &queue->common.name);
	if (result != VDO_SUCCESS) {
		uds_free(queue->service_queues);
		uds_free(queue);
		return -ENOMEM;
	}

	/* Set *queue_ptr early so the error path below can free via vdo_free_work_queue(). */
	*queue_ptr = &queue->common;

	for (i = 0; i < thread_count; i++) {
		void *context = ((thread_privates != NULL) ? thread_privates[i] : NULL);

		snprintf(thread_name, sizeof(thread_name), "%s%u", name, i);
		result = make_simple_work_queue(thread_name_prefix, thread_name, owner,
						context, type, &queue->service_queues[i]);
		if (result != VDO_SUCCESS) {
			queue->num_service_queues = i;
			/* Destroy previously created subordinates. */
			vdo_free_work_queue(uds_forget(*queue_ptr));
			return result;
		}
	}

	return VDO_SUCCESS;
}

/* Stop a simple queue's worker thread (idempotent). */
static void finish_simple_work_queue(struct simple_work_queue *queue)
{
	if (queue->thread == NULL)
		return;

	/* Tells the worker thread to shut down and waits for it to exit. */
	kthread_stop(queue->thread);
	queue->thread = NULL;
}

/* Stop every subordinate worker thread of a round-robin queue. */
static void finish_round_robin_work_queue(struct round_robin_work_queue *queue)
{
	struct simple_work_queue **queue_table = queue->service_queues;
	unsigned int count = queue->num_service_queues;
	unsigned int i;

	for (i = 0; i < count; i++)
		finish_simple_work_queue(queue_table[i]);
}

/* No enqueueing of completions should be done once this function is called. */
void vdo_finish_work_queue(struct vdo_work_queue *queue)
{
	if (queue == NULL)
		return;

	if (queue->round_robin_mode)
		finish_round_robin_work_queue(as_round_robin_work_queue(queue));
	else
		finish_simple_work_queue(as_simple_work_queue(queue));
}

/* Debugging dumps */

/* Log one simple queue's identity and its worker thread's state. */
static void dump_simple_work_queue(struct simple_work_queue *queue)
{
	const char *thread_status = "no threads";
	char task_state_report = '-';

	if (queue->thread != NULL) {
		task_state_report = task_state_to_char(queue->thread);
		thread_status = atomic_read(&queue->idle) ? "idle" : "running";
	}

	uds_log_info("workQ %px (%s) %s (%c)", &queue->common, queue->common.name,
		     thread_status, task_state_report);

	/* ->waiting_worker_threads wait queue status? anyone waiting? */
}

/*
 * Write to the buffer some info about the completion, for logging. Since the common use case is
 * dumping info about a lot of completions to syslog all at once, the format favors brevity over
 * readability.
 */
void vdo_dump_work_queue(struct vdo_work_queue *queue)
{
	if (queue->round_robin_mode) {
		struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);
		unsigned int i;

		for (i = 0; i < round_robin->num_service_queues; i++)
			dump_simple_work_queue(round_robin->service_queues[i]);
	} else {
		dump_simple_work_queue(as_simple_work_queue(queue));
	}
}

/*
 * Render a function pointer's symbol name (without its offset suffix) into buffer, or "-" for
 * NULL.
 */
static void get_function_name(void *pointer, char *buffer, size_t buffer_length)
{
	if (pointer == NULL) {
		/*
		 * Format "%ps" logs a null pointer as "(null)" with a bunch of leading spaces. We
		 * sometimes use this when logging lots of data; don't be so verbose.
		 */
		strscpy(buffer, "-", buffer_length);
	} else {
		/*
		 * Use a pragma to defeat gcc's format checking, which doesn't understand that
		 * "%ps" actually does support a precision spec in Linux kernel code.
		 */
		char *space;

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat"
		snprintf(buffer, buffer_length, "%.*ps", buffer_length - 1, pointer);
#pragma GCC diagnostic pop

		/* Drop everything after the symbol name (e.g. the "+0x.../0x..." offset). */
		space = strchr(buffer, ' ');
		if (space != NULL)
			*space = '\0';
	}
}

/* Format "<queue-name>/<callback-symbol>" into buffer for debugging dumps. */
void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer,
				   size_t length)
{
	size_t current_length =
		scnprintf(buffer, length, "%.*s/", TASK_COMM_LEN,
			  (completion->my_queue == NULL ? "-" : completion->my_queue->name));

	if (current_length < length - 1) {
		get_function_name((void *) completion->callback, buffer + current_length,
				  length - current_length);
	}
}

/* Completion submission */
/*
 * If the completion has a timeout that has already passed, the timeout handler function may be
 * invoked by this function.
 */
void vdo_enqueue_work_queue(struct vdo_work_queue *queue,
			    struct vdo_completion *completion)
{
	/*
	 * Convert the provided generic vdo_work_queue to the simple_work_queue to actually queue
	 * on.
	 */
	struct simple_work_queue *simple_queue = NULL;

	if (!queue->round_robin_mode) {
		simple_queue = as_simple_work_queue(queue);
	} else {
		struct round_robin_work_queue *round_robin = as_round_robin_work_queue(queue);

		/*
		 * It shouldn't be a big deal if the same rotor gets used for multiple work queues.
		 * Any patterns that might develop are likely to be disrupted by random ordering of
		 * multiple completions and migration between cores, unless the load is so light as
		 * to be regular in ordering of tasks and the threads are confined to individual
		 * cores; with a load that light we won't care.
		 */
		unsigned int rotor = this_cpu_inc_return(service_queue_rotor);
		unsigned int index = rotor % round_robin->num_service_queues;

		simple_queue = round_robin->service_queues[index];
	}

	enqueue_work_queue_completion(simple_queue, completion);
}

/* Misc */

/*
 * Return the work queue pointer recorded at initialization time in the work-queue stack handle
 * initialized on the stack of the current thread, if any.
 */
static struct simple_work_queue *get_current_thread_work_queue(void)
{
	/*
	 * In interrupt context, if a vdo thread is what got interrupted, the calls below will find
	 * the queue for the thread which was interrupted. However, the interrupted thread may have
	 * been processing a completion, in which case starting to process another would violate
	 * our concurrency assumptions.
	 */
	if (in_interrupt())
		return NULL;

	if (kthread_func(current) != work_queue_runner)
		/* Not a VDO work queue thread. */
		return NULL;

	return kthread_data(current);
}

/* Return the generic form of the current thread's work queue, or NULL if not a worker. */
struct vdo_work_queue *vdo_get_current_work_queue(void)
{
	struct simple_work_queue *queue = get_current_thread_work_queue();

	return (queue == NULL) ? NULL : &queue->common;
}

/* Return the vdo_thread recorded as this queue's owner at creation time. */
struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue)
{
	return queue->owner;
}

/**
 * vdo_get_work_queue_private_data() - Returns the private data for the current thread's work
 *                                     queue, or NULL if none or if the current thread is not a
 *                                     work queue thread.
 */
void *vdo_get_work_queue_private_data(void)
{
	struct simple_work_queue *queue = get_current_thread_work_queue();

	return (queue != NULL) ? queue->private : NULL;
}

/* Check whether a queue was created with the given type descriptor. */
bool vdo_work_queue_type_is(struct vdo_work_queue *queue,
			    const struct vdo_work_queue_type *type)
{
	return (queue->type == type);
}
+51
drivers/md/dm-vdo/funnel-workqueue.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_WORK_QUEUE_H
#define VDO_WORK_QUEUE_H

#include <linux/sched.h> /* for TASK_COMM_LEN */

#include "types.h"

enum {
	/* Queue names must fit in a kernel task name. */
	MAX_VDO_WORK_QUEUE_NAME_LEN = TASK_COMM_LEN,
};

/* Per-queue-type hooks and priority configuration shared by all queues of a type. */
struct vdo_work_queue_type {
	/* Called on the worker thread before it starts processing completions */
	void (*start)(void *context);
	/* Called on the worker thread after it stops processing completions */
	void (*finish)(void *context);
	enum vdo_completion_priority max_priority;
	enum vdo_completion_priority default_priority;
};

struct vdo_completion;
struct vdo_thread;
struct vdo_work_queue;

/*
 * Create a work queue; with thread_count > 1 a round-robin group of single-threaded queues is
 * created and completions are distributed among them.
 */
int vdo_make_work_queue(const char *thread_name_prefix, const char *name,
			struct vdo_thread *owner, const struct vdo_work_queue_type *type,
			unsigned int thread_count, void *thread_privates[],
			struct vdo_work_queue **queue_ptr);

/* Submit a completion to a queue; it will run on one of the queue's worker threads. */
void vdo_enqueue_work_queue(struct vdo_work_queue *queue, struct vdo_completion *completion);

/* Stop the queue's worker thread(s); no completions may be enqueued afterwards. */
void vdo_finish_work_queue(struct vdo_work_queue *queue);

/* Finish (if needed) and free a queue; NULL is a no-op. */
void vdo_free_work_queue(struct vdo_work_queue *queue);

/* Log debugging information about a queue and its worker thread state. */
void vdo_dump_work_queue(struct vdo_work_queue *queue);

/* Write a terse queue-name/callback description of a completion into buffer. */
void vdo_dump_completion_to_buffer(struct vdo_completion *completion, char *buffer,
				   size_t length);

void *vdo_get_work_queue_private_data(void);
struct vdo_work_queue *vdo_get_current_work_queue(void);
struct vdo_thread *vdo_get_work_queue_owner(struct vdo_work_queue *queue);

bool __must_check vdo_work_queue_type_is(struct vdo_work_queue *queue,
					 const struct vdo_work_queue_type *type);

#endif /* VDO_WORK_QUEUE_H */