Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'probes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull probes updates from Masami Hiramatsu:
"Cleanups:

- kprobes: Fixes a typo in kprobes samples

- tracing/eprobes: Remove 'break' after return

kretprobe/fprobe performance improvements:

- lib: Introduce new `objpool`, which is a high performance lockless
object queue. This uses per-cpu ring array to allocate/release
objects from the pre-allocated object pool.

Since the index of ring array is a 32bit sequential counter, we can
retry to push/pop the object pointer from the ring without lock (as
seq-lock does)

- lib: Add an objpool test module to test the functionality and
evaluate the performance under some circumstances

- kprobes/fprobe: Improve kretprobe and rethook scalability
performance with objpool.

This improves both legacy kretprobe and fprobe exit handler (which
is based on rethook) to be scalable on SMP systems. Even with
8-threads parallel test, it shows a great scalability improvement

- Remove unneeded freelist.h which is replaced by objpool

- objpool: Add maintainers entry for the objpool

- objpool: Fix to remove unused include header lines"

* tag 'probes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
kprobes: unused header files removed
MAINTAINERS: objpool added
kprobes: freelist.h removed
kprobes: kretprobe scalability improvement
lib: objpool test module added
lib: objpool added: ring-array based lockless MPMC
tracing/eprobe: drop unneeded breaks
samples: kprobes: Fixes a typo

+1273 -280
+7
MAINTAINERS
··· 15553 15553 F: lib/objagg.c 15554 15554 F: lib/test_objagg.c 15555 15555 15556 + OBJPOOL 15557 + M: Matt Wu <wuqiang.matt@bytedance.com> 15558 + S: Supported 15559 + F: include/linux/objpool.h 15560 + F: lib/objpool.c 15561 + F: lib/test_objpool.c 15562 + 15556 15563 OBJTOOL 15557 15564 M: Josh Poimboeuf <jpoimboe@kernel.org> 15558 15565 M: Peter Zijlstra <peterz@infradead.org>
-129
include/linux/freelist.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ 2 - #ifndef FREELIST_H 3 - #define FREELIST_H 4 - 5 - #include <linux/atomic.h> 6 - 7 - /* 8 - * Copyright: cameron@moodycamel.com 9 - * 10 - * A simple CAS-based lock-free free list. Not the fastest thing in the world 11 - * under heavy contention, but simple and correct (assuming nodes are never 12 - * freed until after the free list is destroyed), and fairly speedy under low 13 - * contention. 14 - * 15 - * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists 16 - */ 17 - 18 - struct freelist_node { 19 - atomic_t refs; 20 - struct freelist_node *next; 21 - }; 22 - 23 - struct freelist_head { 24 - struct freelist_node *head; 25 - }; 26 - 27 - #define REFS_ON_FREELIST 0x80000000 28 - #define REFS_MASK 0x7FFFFFFF 29 - 30 - static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list) 31 - { 32 - /* 33 - * Since the refcount is zero, and nobody can increase it once it's 34 - * zero (except us, and we run only one copy of this method per node at 35 - * a time, i.e. the single thread case), then we know we can safely 36 - * change the next pointer of the node; however, once the refcount is 37 - * back above zero, then other threads could increase it (happens under 38 - * heavy contention, when the refcount goes to zero in between a load 39 - * and a refcount increment of a node in try_get, then back up to 40 - * something non-zero, then the refcount increment is done by the other 41 - * thread) -- so if the CAS to add the node to the actual list fails, 42 - * decrese the refcount and leave the add operation to the next thread 43 - * who puts the refcount back to zero (which could be us, hence the 44 - * loop). 
45 - */ 46 - struct freelist_node *head = READ_ONCE(list->head); 47 - 48 - for (;;) { 49 - WRITE_ONCE(node->next, head); 50 - atomic_set_release(&node->refs, 1); 51 - 52 - if (!try_cmpxchg_release(&list->head, &head, node)) { 53 - /* 54 - * Hmm, the add failed, but we can only try again when 55 - * the refcount goes back to zero. 56 - */ 57 - if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1) 58 - continue; 59 - } 60 - return; 61 - } 62 - } 63 - 64 - static inline void freelist_add(struct freelist_node *node, struct freelist_head *list) 65 - { 66 - /* 67 - * We know that the should-be-on-freelist bit is 0 at this point, so 68 - * it's safe to set it using a fetch_add. 69 - */ 70 - if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) { 71 - /* 72 - * Oh look! We were the last ones referencing this node, and we 73 - * know we want to add it to the free list, so let's do it! 74 - */ 75 - __freelist_add(node, list); 76 - } 77 - } 78 - 79 - static inline struct freelist_node *freelist_try_get(struct freelist_head *list) 80 - { 81 - struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head); 82 - unsigned int refs; 83 - 84 - while (head) { 85 - prev = head; 86 - refs = atomic_read(&head->refs); 87 - if ((refs & REFS_MASK) == 0 || 88 - !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) { 89 - head = smp_load_acquire(&list->head); 90 - continue; 91 - } 92 - 93 - /* 94 - * Good, reference count has been incremented (it wasn't at 95 - * zero), which means we can read the next and not worry about 96 - * it changing between now and the time we do the CAS. 97 - */ 98 - next = READ_ONCE(head->next); 99 - if (try_cmpxchg_acquire(&list->head, &head, next)) { 100 - /* 101 - * Yay, got the node. This means it was on the list, 102 - * which means should-be-on-freelist must be false no 103 - * matter the refcount (because nobody else knows it's 104 - * been taken off yet, it can't have been put back on). 
105 - */ 106 - WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST); 107 - 108 - /* 109 - * Decrease refcount twice, once for our ref, and once 110 - * for the list's ref. 111 - */ 112 - atomic_fetch_add(-2, &head->refs); 113 - 114 - return head; 115 - } 116 - 117 - /* 118 - * OK, the head must have changed on us, but we still need to decrement 119 - * the refcount we increased. 120 - */ 121 - refs = atomic_fetch_add(-1, &prev->refs); 122 - if (refs == REFS_ON_FREELIST + 1) 123 - __freelist_add(prev, list); 124 - } 125 - 126 - return NULL; 127 - } 128 - 129 - #endif /* FREELIST_H */
+3 -8
include/linux/kprobes.h
··· 26 26 #include <linux/rcupdate.h> 27 27 #include <linux/mutex.h> 28 28 #include <linux/ftrace.h> 29 - #include <linux/refcount.h> 30 - #include <linux/freelist.h> 29 + #include <linux/objpool.h> 31 30 #include <linux/rethook.h> 32 31 #include <asm/kprobes.h> 33 32 ··· 140 141 */ 141 142 struct kretprobe_holder { 142 143 struct kretprobe *rp; 143 - refcount_t ref; 144 + struct objpool_head pool; 144 145 }; 145 146 146 147 struct kretprobe { ··· 153 154 #ifdef CONFIG_KRETPROBE_ON_RETHOOK 154 155 struct rethook *rh; 155 156 #else 156 - struct freelist_head freelist; 157 157 struct kretprobe_holder *rph; 158 158 #endif 159 159 }; ··· 163 165 #ifdef CONFIG_KRETPROBE_ON_RETHOOK 164 166 struct rethook_node node; 165 167 #else 166 - union { 167 - struct freelist_node freelist; 168 - struct rcu_head rcu; 169 - }; 168 + struct rcu_head rcu; 170 169 struct llist_node llist; 171 170 struct kretprobe_holder *rph; 172 171 kprobe_opcode_t *ret_addr;
+181
include/linux/objpool.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef _LINUX_OBJPOOL_H 4 + #define _LINUX_OBJPOOL_H 5 + 6 + #include <linux/types.h> 7 + #include <linux/refcount.h> 8 + 9 + /* 10 + * objpool: ring-array based lockless MPMC queue 11 + * 12 + * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org 13 + * 14 + * objpool is a scalable implementation of high performance queue for 15 + * object allocation and reclamation, such as kretprobe instances. 16 + * 17 + * With leveraging percpu ring-array to mitigate hot spots of memory 18 + * contention, it delivers near-linear scalability for high parallel 19 + * scenarios. The objpool is best suited for the following cases: 20 + * 1) Memory allocation or reclamation are prohibited or too expensive 21 + * 2) Consumers are of different priorities, such as irqs and threads 22 + * 23 + * Limitations: 24 + * 1) Maximum objects (capacity) is fixed after objpool creation 25 + * 2) All pre-allocated objects are managed in percpu ring array, 26 + * which consumes more memory than linked lists 27 + */ 28 + 29 + /** 30 + * struct objpool_slot - percpu ring array of objpool 31 + * @head: head sequence of the local ring array (to retrieve at) 32 + * @tail: tail sequence of the local ring array (to append at) 33 + * @last: the last sequence number marked as ready for retrieve 34 + * @mask: bits mask for modulo capacity to compute array indexes 35 + * @entries: object entries on this slot 36 + * 37 + * Represents a cpu-local array-based ring buffer, its size is specialized 38 + * during initialization of object pool. The percpu objpool node is to be 39 + * allocated from local memory for NUMA system, and to be kept compact in 40 + * continuous memory: CPU assigned number of objects are stored just after 41 + * the body of objpool_node. 
42 + * 43 + * Real size of the ring array is far too smaller than the value range of 44 + * head and tail, typed as uint32_t: [0, 2^32), so only lower bits (mask) 45 + * of head and tail are used as the actual position in the ring array. In 46 + * general the ring array is acting like a small sliding window, which is 47 + * always moving forward in the loop of [0, 2^32). 48 + */ 49 + struct objpool_slot { 50 + uint32_t head; 51 + uint32_t tail; 52 + uint32_t last; 53 + uint32_t mask; 54 + void *entries[]; 55 + } __packed; 56 + 57 + struct objpool_head; 58 + 59 + /* 60 + * caller-specified callback for object initial setup, it's only called 61 + * once for each object (just after the memory allocation of the object) 62 + */ 63 + typedef int (*objpool_init_obj_cb)(void *obj, void *context); 64 + 65 + /* caller-specified cleanup callback for objpool destruction */ 66 + typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); 67 + 68 + /** 69 + * struct objpool_head - object pooling metadata 70 + * @obj_size: object size, aligned to sizeof(void *) 71 + * @nr_objs: total objs (to be pre-allocated with objpool) 72 + * @nr_cpus: local copy of nr_cpu_ids 73 + * @capacity: max objs can be managed by one objpool_slot 74 + * @gfp: gfp flags for kmalloc & vmalloc 75 + * @ref: refcount of objpool 76 + * @flags: flags for objpool management 77 + * @cpu_slots: pointer to the array of objpool_slot 78 + * @release: resource cleanup callback 79 + * @context: caller-provided context 80 + */ 81 + struct objpool_head { 82 + int obj_size; 83 + int nr_objs; 84 + int nr_cpus; 85 + int capacity; 86 + gfp_t gfp; 87 + refcount_t ref; 88 + unsigned long flags; 89 + struct objpool_slot **cpu_slots; 90 + objpool_fini_cb release; 91 + void *context; 92 + }; 93 + 94 + #define OBJPOOL_NR_OBJECT_MAX (1UL << 24) /* maximum numbers of total objects */ 95 + #define OBJPOOL_OBJECT_SIZE_MAX (1UL << 16) /* maximum size of an object */ 96 + 97 + /** 98 + * objpool_init() - initialize 
objpool and pre-allocated objects 99 + * @pool: the object pool to be initialized, declared by caller 100 + * @nr_objs: total objects to be pre-allocated by this object pool 101 + * @object_size: size of an object (should be > 0) 102 + * @gfp: flags for memory allocation (via kmalloc or vmalloc) 103 + * @context: user context for object initialization callback 104 + * @objinit: object initialization callback for extra setup 105 + * @release: cleanup callback for extra cleanup task 106 + * 107 + * return value: 0 for success, otherwise error code 108 + * 109 + * All pre-allocated objects are to be zeroed after memory allocation. 110 + * Caller could do extra initialization in objinit callback. objinit() 111 + * will be called just after slot allocation and called only once for 112 + * each object. After that the objpool won't touch any content of the 113 + * objects. It's caller's duty to perform reinitialization after each 114 + * pop (object allocation) or do clearance before each push (object 115 + * reclamation). 
116 + */ 117 + int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, 118 + gfp_t gfp, void *context, objpool_init_obj_cb objinit, 119 + objpool_fini_cb release); 120 + 121 + /** 122 + * objpool_pop() - allocate an object from objpool 123 + * @pool: object pool 124 + * 125 + * return value: object ptr or NULL if failed 126 + */ 127 + void *objpool_pop(struct objpool_head *pool); 128 + 129 + /** 130 + * objpool_push() - reclaim the object and return back to objpool 131 + * @obj: object ptr to be pushed to objpool 132 + * @pool: object pool 133 + * 134 + * return: 0 or error code (it fails only when user tries to push 135 + * the same object multiple times or wrong "objects" into objpool) 136 + */ 137 + int objpool_push(void *obj, struct objpool_head *pool); 138 + 139 + /** 140 + * objpool_drop() - discard the object and deref objpool 141 + * @obj: object ptr to be discarded 142 + * @pool: object pool 143 + * 144 + * return: 0 if objpool was released; -EAGAIN if there are still 145 + * outstanding objects 146 + * 147 + * objpool_drop is normally for the release of outstanding objects 148 + * after objpool cleanup (objpool_fini). Think of this example: 149 + * kretprobe is unregistered and objpool_fini() is called to release 150 + * all remaining objects, but there are still objects being used by 151 + * unfinished kretprobes (like blockable function: sys_accept).
So 152 + * only when the last outstanding object is dropped could the whole 153 + * objpool be released along with the call of objpool_drop() 154 + */ 155 + int objpool_drop(void *obj, struct objpool_head *pool); 156 + 157 + /** 158 + * objpool_free() - release objpool forcibly (all objects to be freed) 159 + * @pool: object pool to be released 160 + */ 161 + void objpool_free(struct objpool_head *pool); 162 + 163 + /** 164 + * objpool_fini() - deref object pool (also releasing unused objects) 165 + * @pool: object pool to be dereferenced 166 + * 167 + * objpool_fini() will try to release all remaining free objects and 168 + * then drop an extra reference of the objpool. If all objects are 169 + * already returned to objpool (so called synchronous use cases), 170 + * the objpool itself will be freed together. But if there are still 171 + * outstanding objects (so called asynchronous use cases, such as 172 + * blockable kretprobe), the objpool won't be released until all 173 + * the outstanding objects are dropped, but the caller must assure 174 + * there are no concurrent objpool_push() on the fly. Normally RCU 175 + * is being required to make sure all ongoing objpool_push() must 176 + * be finished before calling objpool_fini(), so does test_objpool, 177 + * kretprobe or rethook 178 + */ 179 + void objpool_fini(struct objpool_head *pool); 180 + 181 + #endif /* _LINUX_OBJPOOL_H */
+4 -12
include/linux/rethook.h
··· 6 6 #define _LINUX_RETHOOK_H 7 7 8 8 #include <linux/compiler.h> 9 - #include <linux/freelist.h> 9 + #include <linux/objpool.h> 10 10 #include <linux/kallsyms.h> 11 11 #include <linux/llist.h> 12 12 #include <linux/rcupdate.h> 13 - #include <linux/refcount.h> 14 13 15 14 struct rethook_node; 16 15 ··· 29 30 struct rethook { 30 31 void *data; 31 32 rethook_handler_t handler; 32 - struct freelist_head pool; 33 - refcount_t ref; 33 + struct objpool_head pool; 34 34 struct rcu_head rcu; 35 35 }; 36 36 37 37 /** 38 38 * struct rethook_node - The rethook shadow-stack entry node. 39 - * @freelist: The freelist, linked to struct rethook::pool. 40 39 * @rcu: The rcu_head for deferred freeing. 41 40 * @llist: The llist, linked to a struct task_struct::rethooks. 42 41 * @rethook: The pointer to the struct rethook. ··· 45 48 * on each entry of the shadow stack. 46 49 */ 47 50 struct rethook_node { 48 - union { 49 - struct freelist_node freelist; 50 - struct rcu_head rcu; 51 - }; 51 + struct rcu_head rcu; 52 52 struct llist_node llist; 53 53 struct rethook *rethook; 54 54 unsigned long ret_addr; 55 55 unsigned long frame; 56 56 }; 57 57 58 - struct rethook *rethook_alloc(void *data, rethook_handler_t handler); 58 + struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num); 59 59 void rethook_stop(struct rethook *rh); 60 60 void rethook_free(struct rethook *rh); 61 - void rethook_add_node(struct rethook *rh, struct rethook_node *node); 62 61 struct rethook_node *rethook_try_get(struct rethook *rh); 63 62 void rethook_recycle(struct rethook_node *node); 64 63 void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount); ··· 91 98 #endif 92 99 93 100 #endif 94 -
+39 -52
kernel/kprobes.c
··· 1877 1877 #ifdef CONFIG_KRETPROBES 1878 1878 1879 1879 #if !defined(CONFIG_KRETPROBE_ON_RETHOOK) 1880 + 1881 + /* callbacks for objpool of kretprobe instances */ 1882 + static int kretprobe_init_inst(void *nod, void *context) 1883 + { 1884 + struct kretprobe_instance *ri = nod; 1885 + 1886 + ri->rph = context; 1887 + return 0; 1888 + } 1889 + static int kretprobe_fini_pool(struct objpool_head *head, void *context) 1890 + { 1891 + kfree(context); 1892 + return 0; 1893 + } 1894 + 1880 1895 static void free_rp_inst_rcu(struct rcu_head *head) 1881 1896 { 1882 1897 struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); 1898 + struct kretprobe_holder *rph = ri->rph; 1883 1899 1884 - if (refcount_dec_and_test(&ri->rph->ref)) 1885 - kfree(ri->rph); 1886 - kfree(ri); 1900 + objpool_drop(ri, &rph->pool); 1887 1901 } 1888 1902 NOKPROBE_SYMBOL(free_rp_inst_rcu); 1889 1903 ··· 1906 1892 struct kretprobe *rp = get_kretprobe(ri); 1907 1893 1908 1894 if (likely(rp)) 1909 - freelist_add(&ri->freelist, &rp->freelist); 1895 + objpool_push(ri, &rp->rph->pool); 1910 1896 else 1911 1897 call_rcu(&ri->rcu, free_rp_inst_rcu); 1912 1898 } ··· 1943 1929 1944 1930 static inline void free_rp_inst(struct kretprobe *rp) 1945 1931 { 1946 - struct kretprobe_instance *ri; 1947 - struct freelist_node *node; 1948 - int count = 0; 1932 + struct kretprobe_holder *rph = rp->rph; 1949 1933 1950 - node = rp->freelist.head; 1951 - while (node) { 1952 - ri = container_of(node, struct kretprobe_instance, freelist); 1953 - node = node->next; 1954 - 1955 - kfree(ri); 1956 - count++; 1957 - } 1958 - 1959 - if (refcount_sub_and_test(count, &rp->rph->ref)) { 1960 - kfree(rp->rph); 1961 - rp->rph = NULL; 1962 - } 1934 + if (!rph) 1935 + return; 1936 + rp->rph = NULL; 1937 + objpool_fini(&rph->pool); 1963 1938 } 1964 1939 1965 1940 /* This assumes the 'tsk' is the current task or the is not running. 
*/ ··· 2090 2087 static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 2091 2088 { 2092 2089 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 2090 + struct kretprobe_holder *rph = rp->rph; 2093 2091 struct kretprobe_instance *ri; 2094 - struct freelist_node *fn; 2095 2092 2096 - fn = freelist_try_get(&rp->freelist); 2097 - if (!fn) { 2093 + ri = objpool_pop(&rph->pool); 2094 + if (!ri) { 2098 2095 rp->nmissed++; 2099 2096 return 0; 2100 2097 } 2101 2098 2102 - ri = container_of(fn, struct kretprobe_instance, freelist); 2103 - 2104 2099 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 2105 - freelist_add(&ri->freelist, &rp->freelist); 2100 + objpool_push(ri, &rph->pool); 2106 2101 return 0; 2107 2102 } 2108 2103 ··· 2194 2193 int register_kretprobe(struct kretprobe *rp) 2195 2194 { 2196 2195 int ret; 2197 - struct kretprobe_instance *inst; 2198 2196 int i; 2199 2197 void *addr; 2200 2198 ··· 2227 2227 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); 2228 2228 2229 2229 #ifdef CONFIG_KRETPROBE_ON_RETHOOK 2230 - rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); 2231 - if (!rp->rh) 2232 - return -ENOMEM; 2230 + rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler, 2231 + sizeof(struct kretprobe_instance) + 2232 + rp->data_size, rp->maxactive); 2233 + if (IS_ERR(rp->rh)) 2234 + return PTR_ERR(rp->rh); 2233 2235 2234 - for (i = 0; i < rp->maxactive; i++) { 2235 - inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); 2236 - if (inst == NULL) { 2237 - rethook_free(rp->rh); 2238 - rp->rh = NULL; 2239 - return -ENOMEM; 2240 - } 2241 - rethook_add_node(rp->rh, &inst->node); 2242 - } 2243 2236 rp->nmissed = 0; 2244 2237 /* Establish function entry probe point */ 2245 2238 ret = register_kprobe(&rp->kp); ··· 2241 2248 rp->rh = NULL; 2242 2249 } 2243 2250 #else /* !CONFIG_KRETPROBE_ON_RETHOOK */ 2244 - rp->freelist.head = NULL; 2245 2251 rp->rph = kzalloc(sizeof(struct kretprobe_holder), 
GFP_KERNEL); 2246 2252 if (!rp->rph) 2247 2253 return -ENOMEM; 2248 2254 2249 - rp->rph->rp = rp; 2250 - for (i = 0; i < rp->maxactive; i++) { 2251 - inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); 2252 - if (inst == NULL) { 2253 - refcount_set(&rp->rph->ref, i); 2254 - free_rp_inst(rp); 2255 - return -ENOMEM; 2256 - } 2257 - inst->rph = rp->rph; 2258 - freelist_add(&inst->freelist, &rp->freelist); 2255 + if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size + 2256 + sizeof(struct kretprobe_instance), GFP_KERNEL, 2257 + rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) { 2258 + kfree(rp->rph); 2259 + rp->rph = NULL; 2260 + return -ENOMEM; 2259 2261 } 2260 - refcount_set(&rp->rph->ref, i); 2261 - 2262 + rp->rph->rp = rp; 2262 2263 rp->nmissed = 0; 2263 2264 /* Establish function entry probe point */ 2264 2265 ret = register_kprobe(&rp->kp);
+9 -17
kernel/trace/fprobe.c
··· 187 187 188 188 static int fprobe_init_rethook(struct fprobe *fp, int num) 189 189 { 190 - int i, size; 190 + int size; 191 191 192 192 if (num <= 0) 193 193 return -EINVAL; ··· 205 205 if (size <= 0) 206 206 return -EINVAL; 207 207 208 - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); 209 - if (!fp->rethook) 210 - return -ENOMEM; 211 - for (i = 0; i < size; i++) { 212 - struct fprobe_rethook_node *node; 208 + /* Initialize rethook */ 209 + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, 210 + sizeof(struct fprobe_rethook_node), size); 211 + if (IS_ERR(fp->rethook)) 212 + return PTR_ERR(fp->rethook); 213 213 214 - node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL); 215 - if (!node) { 216 - rethook_free(fp->rethook); 217 - fp->rethook = NULL; 218 - return -ENOMEM; 219 - } 220 - rethook_add_node(fp->rethook, &node->node); 221 - } 222 214 return 0; 223 215 } 224 216 225 217 static void fprobe_fail_cleanup(struct fprobe *fp) 226 218 { 227 - if (fp->rethook) { 219 + if (!IS_ERR_OR_NULL(fp->rethook)) { 228 220 /* Don't need to cleanup rethook->handler because this is not used. */ 229 221 rethook_free(fp->rethook); 230 222 fp->rethook = NULL; ··· 371 379 if (!fprobe_is_registered(fp)) 372 380 return -EINVAL; 373 381 374 - if (fp->rethook) 382 + if (!IS_ERR_OR_NULL(fp->rethook)) 375 383 rethook_stop(fp->rethook); 376 384 377 385 ret = unregister_ftrace_function(&fp->ops); 378 386 if (ret < 0) 379 387 return ret; 380 388 381 - if (fp->rethook) 389 + if (!IS_ERR_OR_NULL(fp->rethook)) 382 390 rethook_free(fp->rethook); 383 391 384 392 ftrace_free_filter(&fp->ops);
+44 -56
kernel/trace/rethook.c
··· 8 8 #include <linux/preempt.h> 9 9 #include <linux/rethook.h> 10 10 #include <linux/slab.h> 11 - #include <linux/sort.h> 12 11 13 12 /* Return hook list (shadow stack by list) */ 14 13 ··· 35 36 static void rethook_free_rcu(struct rcu_head *head) 36 37 { 37 38 struct rethook *rh = container_of(head, struct rethook, rcu); 38 - struct rethook_node *rhn; 39 - struct freelist_node *node; 40 - int count = 1; 41 - 42 - node = rh->pool.head; 43 - while (node) { 44 - rhn = container_of(node, struct rethook_node, freelist); 45 - node = node->next; 46 - kfree(rhn); 47 - count++; 48 - } 49 - 50 - /* The rh->ref is the number of pooled node + 1 */ 51 - if (refcount_sub_and_test(count, &rh->ref)) 52 - kfree(rh); 39 + objpool_fini(&rh->pool); 53 40 } 54 41 55 42 /** ··· 68 83 call_rcu(&rh->rcu, rethook_free_rcu); 69 84 } 70 85 71 - /** 72 - * rethook_alloc() - Allocate struct rethook. 73 - * @data: a data to pass the @handler when hooking the return. 74 - * @handler: the return hook callback function. 75 - * 76 - * Allocate and initialize a new rethook with @data and @handler. 77 - * Return NULL if memory allocation fails or @handler is NULL. 78 - * Note that @handler == NULL means this rethook is going to be freed. 79 - */ 80 - struct rethook *rethook_alloc(void *data, rethook_handler_t handler) 86 + static int rethook_init_node(void *nod, void *context) 81 87 { 82 - struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); 88 + struct rethook_node *node = nod; 83 89 84 - if (!rh || !handler) { 85 - kfree(rh); 86 - return NULL; 87 - } 90 + node->rethook = context; 91 + return 0; 92 + } 88 93 89 - rh->data = data; 90 - rh->handler = handler; 91 - rh->pool.head = NULL; 92 - refcount_set(&rh->ref, 1); 93 - 94 - return rh; 94 + static int rethook_fini_pool(struct objpool_head *head, void *context) 95 + { 96 + kfree(context); 97 + return 0; 95 98 } 96 99 97 100 /** 98 - * rethook_add_node() - Add a new node to the rethook. 99 - * @rh: the struct rethook. 
100 - * @node: the struct rethook_node to be added. 101 + * rethook_alloc() - Allocate struct rethook. 102 + * @data: a data to pass the @handler when hooking the return. 103 + * @handler: the return hook callback function, must NOT be NULL 104 + * @size: node size: rethook node and additional data 105 + * @num: number of rethook nodes to be preallocated 101 106 * 102 - * Add @node to @rh. User must allocate @node (as a part of user's 103 - * data structure.) The @node fields are initialized in this function. 107 + * Allocate and initialize a new rethook with @data and @handler. 108 + * Return pointer of new rethook, or error codes for failures. 109 + * 110 + * Note that @handler == NULL means this rethook is going to be freed. 104 111 */ 105 - void rethook_add_node(struct rethook *rh, struct rethook_node *node) 112 + struct rethook *rethook_alloc(void *data, rethook_handler_t handler, 113 + int size, int num) 106 114 { 107 - node->rethook = rh; 108 - freelist_add(&node->freelist, &rh->pool); 109 - refcount_inc(&rh->ref); 115 + struct rethook *rh; 116 + 117 + if (!handler || num <= 0 || size < sizeof(struct rethook_node)) 118 + return ERR_PTR(-EINVAL); 119 + 120 + rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); 121 + if (!rh) 122 + return ERR_PTR(-ENOMEM); 123 + 124 + rh->data = data; 125 + rh->handler = handler; 126 + 127 + /* initialize the objpool for rethook nodes */ 128 + if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh, 129 + rethook_init_node, rethook_fini_pool)) { 130 + kfree(rh); 131 + return ERR_PTR(-ENOMEM); 132 + } 133 + return rh; 110 134 } 111 135 112 136 static void free_rethook_node_rcu(struct rcu_head *head) 113 137 { 114 138 struct rethook_node *node = container_of(head, struct rethook_node, rcu); 139 + struct rethook *rh = node->rethook; 115 140 116 - if (refcount_dec_and_test(&node->rethook->ref)) 117 - kfree(node->rethook); 118 - kfree(node); 141 + objpool_drop(node, &rh->pool); 119 142 } 120 143 121 144 /** ··· 138 145 
lockdep_assert_preemption_disabled(); 139 146 140 147 if (likely(READ_ONCE(node->rethook->handler))) 141 - freelist_add(&node->freelist, &node->rethook->pool); 148 + objpool_push(node, &node->rethook->pool); 142 149 else 143 150 call_rcu(&node->rcu, free_rethook_node_rcu); 144 151 } ··· 154 161 struct rethook_node *rethook_try_get(struct rethook *rh) 155 162 { 156 163 rethook_handler_t handler = READ_ONCE(rh->handler); 157 - struct freelist_node *fn; 158 164 159 165 lockdep_assert_preemption_disabled(); 160 166 ··· 170 178 if (unlikely(!rcu_is_watching())) 171 179 return NULL; 172 180 173 - fn = freelist_try_get(&rh->pool); 174 - if (!fn) 175 - return NULL; 176 - 177 - return container_of(fn, struct rethook_node, freelist); 181 + return (struct rethook_node *)objpool_pop(&rh->pool); 178 182 } 179 183 NOKPROBE_SYMBOL(rethook_try_get); 180 184
+1 -4
kernel/trace/trace_eprobe.c
··· 788 788 name = trace_event_name(tp_event); 789 789 if (!name || strcmp(event_name, name)) 790 790 continue; 791 - if (!trace_event_try_get_ref(tp_event)) { 791 + if (!trace_event_try_get_ref(tp_event)) 792 792 return NULL; 793 - break; 794 - } 795 793 return tp_event; 796 - break; 797 794 } 798 795 return NULL; 799 796 }
+11
lib/Kconfig.debug
··· 2954 2954 2955 2955 If unsure, say N. 2956 2956 2957 + config TEST_OBJPOOL 2958 + tristate "Test module for correctness and stress of objpool" 2959 + default n 2960 + depends on m && DEBUG_KERNEL 2961 + help 2962 + This builds the "test_objpool" module that should be used for 2963 + correctness verification and concurrent testings of objects 2964 + allocation and reclamation. 2965 + 2966 + If unsure, say N. 2967 + 2957 2968 endif # RUNTIME_TESTING_MENU 2958 2969 2959 2970 config ARCH_USE_MEMTEST
+3 -1
lib/Makefile
··· 34 34 is_single_threaded.o plist.o decompress.o kobject_uevent.o \ 35 35 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ 36 36 nmi_backtrace.o win_minmax.o memcat_p.o \ 37 - buildid.o 37 + buildid.o objpool.o 38 38 39 39 lib-$(CONFIG_PRINTK) += dump_stack.o 40 40 lib-$(CONFIG_SMP) += cpumask.o ··· 107 107 obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o 108 108 CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) 109 109 obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o 110 + obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o 111 + 110 112 # 111 113 # CFLAGS for compiling floating point code inside the kernel. x86/Makefile turns 112 114 # off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS
+280
lib/objpool.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/objpool.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/cpumask.h>
#include <linux/log2.h>

/*
 * objpool: ring-array based lockless MPMC/FIFO queues
 *
 * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
 */

/*
 * Initialize one percpu objpool_slot: carve 'nodes' objects out of the
 * memory that follows the slot's entries[] array, run the optional user
 * objinit callback on each, and enqueue them all into the ring.
 */
static int
objpool_init_percpu_slot(struct objpool_head *pool,
			 struct objpool_slot *slot,
			 int nodes, void *context,
			 objpool_init_obj_cb objinit)
{
	/* objects are laid out immediately after entries[capacity] */
	void *obj = (void *)&slot->entries[pool->capacity];
	int i;

	/* initialize elements of percpu objpool_slot */
	slot->mask = pool->capacity - 1; /* capacity is a power of two */

	for (i = 0; i < nodes; i++) {
		if (objinit) {
			int rc = objinit(obj, context);
			if (rc)
				return rc;
		}
		slot->entries[slot->tail & slot->mask] = obj;
		obj = obj + pool->obj_size;
		slot->tail++;
		slot->last = slot->tail;
		pool->nr_objs++;
	}

	return 0;
}

/*
 * Allocate and initialize the percpu slots: one slot (plus its share of
 * the pre-allocated objects) per possible CPU, NUMA-local to that CPU.
 */
static int
objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
			  void *context, objpool_init_obj_cb objinit)
{
	int i, cpu_count = 0;

	for (i = 0; i < pool->nr_cpus; i++) {

		struct objpool_slot *slot;
		int nodes, size, rc;

		/* skip the cpu node which could never be present */
		if (!cpu_possible(i))
			continue;

		/*
		 * compute how many objects to be allocated with this slot;
		 * the remainder of nr_objs / num_possible_cpus() is spread
		 * one-per-slot over the first slots
		 */
		nodes = nr_objs / num_possible_cpus();
		if (cpu_count < (nr_objs % num_possible_cpus()))
			nodes++;
		cpu_count++;

		size = struct_size(slot, entries, pool->capacity) +
			pool->obj_size * nodes;

		/*
		 * here we allocate percpu-slot & objs together in a single
		 * allocation to make it more compact, taking advantage of
		 * warm caches and TLB hits. by default vmalloc is used to
		 * reduce the pressure on the kernel slab system. as we know,
		 * minimal size of vmalloc is one page since vmalloc would
		 * always align the requested size to page size
		 */
		if (pool->gfp & GFP_ATOMIC)
			slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
		else
			slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
				cpu_to_node(i), __builtin_return_address(0));
		if (!slot)
			return -ENOMEM;
		memset(slot, 0, size);
		pool->cpu_slots[i] = slot;

		/* initialize the objpool_slot of cpu node i */
		rc = objpool_init_percpu_slot(pool, slot, nodes, context, objinit);
		if (rc)
			return rc;
	}

	return 0;
}

/* cleanup all percpu slots of the object pool */
static void objpool_fini_percpu_slots(struct objpool_head *pool)
{
	int i;

	if (!pool->cpu_slots)
		return;

	/* slot memory may be kmalloc'd or vmalloc'd; kvfree handles both */
	for (i = 0; i < pool->nr_cpus; i++)
		kvfree(pool->cpu_slots[i]);
	kfree(pool->cpu_slots);
}

/*
 * Initialize an object pool and pre-allocate all of its objects.
 *
 * @pool:        caller-provided pool head to initialize
 * @nr_objs:     total number of objects to pre-allocate (> 0)
 * @object_size: size of a single object (> 0)
 * @gfp:         allocation flags; GFP_ATOMIC forces kmalloc over vmalloc
 * @context:     opaque cookie passed to @objinit and @release
 * @objinit:     optional per-object initialization callback
 * @release:     optional cleanup callback invoked by objpool_free()
 *
 * Returns 0 on success or a negative errno.
 */
int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
		 objpool_fini_cb release)
{
	int rc, capacity, slot_size;

	/* check input parameters */
	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX ||
	    object_size <= 0 || object_size > OBJPOOL_OBJECT_SIZE_MAX)
		return -EINVAL;

	/* align up to unsigned long size */
	object_size = ALIGN(object_size, sizeof(long));

	/* calculate capacity of percpu objpool_slot (power of two for mask) */
	capacity = roundup_pow_of_two(nr_objs);
	if (!capacity)
		return -EINVAL;

	/* initialize objpool pool */
	memset(pool, 0, sizeof(struct objpool_head));
	pool->nr_cpus = nr_cpu_ids;
	pool->obj_size = object_size;
	pool->capacity = capacity;
	/* slots are explicitly memset below; never double-zero via __GFP_ZERO */
	pool->gfp = gfp & ~__GFP_ZERO;
	pool->context = context;
	pool->release = release;
	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
	if (!pool->cpu_slots)
		return -ENOMEM;

	/* initialize per-cpu slots */
	rc = objpool_init_percpu_slots(pool, nr_objs, context, objinit);
	if (rc)
		objpool_fini_percpu_slots(pool);
	else
		/* one ref per object plus one for the pool itself */
		refcount_set(&pool->ref, pool->nr_objs + 1);

	return rc;
}
EXPORT_SYMBOL_GPL(objpool_init);

/* adding object to slot; the slot can never be full (see WARN below) */
static inline int
objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	uint32_t head, tail;

	/* loading tail and head as a local snapshot, tail first */
	tail = READ_ONCE(slot->tail);

	do {
		head = READ_ONCE(slot->head);
		/*
		 * fault caught: something must be wrong. the ring can hold
		 * at most nr_objs in-flight entries, so tail can never run
		 * more than nr_objs ahead of head.
		 */
		WARN_ON_ONCE(tail - head > pool->nr_objs);
	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));

	/* now the tail position is reserved for the given obj */
	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
	/* update sequence to make this obj available for pop() */
	smp_store_release(&slot->last, tail + 1);

	return 0;
}

/*
 * Reclaim an object back into the object pool.
 * Always succeeds (returns 0); safe in any context including NMI-safe
 * paths since only the local CPU's slot is touched with irqs disabled.
 */
int objpool_push(void *obj, struct objpool_head *pool)
{
	unsigned long flags;
	int rc;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);
	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
	raw_local_irq_restore(flags);

	return rc;
}
EXPORT_SYMBOL_GPL(objpool_push);

/* try to retrieve an object from the given cpu's slot, NULL if empty */
static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	/* load head snapshot, other cpus may change it */
	uint32_t head = smp_load_acquire(&slot->head);

	while (head != READ_ONCE(slot->last)) {
		void *obj;

		/* obj must be retrieved before moving head forward */
		obj = READ_ONCE(slot->entries[head & slot->mask]);

		/* move head forward to mark its consumption */
		if (try_cmpxchg_release(&slot->head, &head, head + 1))
			return obj;
		/* cmpxchg failure reloads head into 'head'; retry */
	}

	return NULL;
}

/*
 * Allocate an object from the object pool.
 * Tries the local CPU's slot first, then steals from the other possible
 * CPUs' slots in wrap-around order. Returns NULL when the pool is empty.
 */
void *objpool_pop(struct objpool_head *pool)
{
	void *obj = NULL;
	unsigned long flags;
	int i, cpu;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);

	cpu = raw_smp_processor_id();
	for (i = 0; i < num_possible_cpus(); i++) {
		obj = objpool_try_get_slot(pool, cpu);
		if (obj)
			break;
		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
	}
	raw_local_irq_restore(flags);

	return obj;
}
EXPORT_SYMBOL_GPL(objpool_pop);

/*
 * Release the whole objpool forcibly: free all slot memory and invoke the
 * user's release callback. Callers must guarantee no concurrent users.
 */
void objpool_free(struct objpool_head *pool)
{
	if (!pool->cpu_slots)
		return;

	/* release percpu slots */
	objpool_fini_percpu_slots(pool);

	/* call user's cleanup callback if provided */
	if (pool->release)
		pool->release(pool, pool->context);
}
EXPORT_SYMBOL_GPL(objpool_free);

/*
 * Drop an allocated object permanently rather than reclaiming it to the
 * pool. Returns 0 if this drop released the last reference and freed the
 * pool, -EAGAIN if references remain, -EINVAL on bad arguments.
 */
int objpool_drop(void *obj, struct objpool_head *pool)
{
	if (!obj || !pool)
		return -EINVAL;

	if (refcount_dec_and_test(&pool->ref)) {
		objpool_free(pool);
		return 0;
	}

	return -EAGAIN;
}
EXPORT_SYMBOL_GPL(objpool_fini) below pairs with this drop-based teardown;
 */
void objpool_fini(struct objpool_head *pool)
{
	int count = 1; /* extra ref for objpool itself */

	/* drop all remaining objects from objpool */
	while (objpool_pop(pool))
		count++;

	/* free the pool once in-flight objects (if any) are also dropped */
	if (refcount_sub_and_test(count, &pool->ref))
		objpool_free(pool);
}
EXPORT_SYMBOL_GPL(objpool_fini);
+690
lib/test_objpool.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Test module for lockless object pool
 *
 * Copyright: wuqiang.matt@bytedance.com
 */

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/hrtimer.h>
#include <linux/objpool.h>

/* max objects popped per worker iteration (bounds the on-stack array) */
#define OT_NR_MAX_BULK (16)

/* memory usage accounting: bytes allocated vs. bytes freed */
struct ot_mem_stat {
	atomic_long_t alloc;
	atomic_long_t free;
};

/* object allocation results: successful pops vs. empty-pool misses */
struct ot_obj_stat {
	unsigned long nhits;
	unsigned long nmiss;
};

/* control & results per testcase */
struct ot_data {
	struct rw_semaphore start;	/* held for write to gate workers at the start line */
	struct completion wait;		/* signalled when the last worker exits */
	struct completion rcu;		/* signalled by the async RCU finalizer */
	atomic_t nthreads ____cacheline_aligned_in_smp; /* live workers + 1 */
	atomic_t stop ____cacheline_aligned_in_smp;	/* set to 1 to stop workers */
	struct ot_mem_stat kmalloc;
	struct ot_mem_stat vmalloc;
	struct ot_obj_stat objects;	/* aggregated hit/miss totals */
	u64 duration;			/* total test duration, us */
};

/* testcase */
struct ot_test {
	int async; /* synchronous or asynchronous */
	int mode; /* only mode 0 supported */
	int objsz; /* object size */
	int duration; /* ms */
	int delay; /* ms */
	int bulk_normal; /* pops per iteration in thread context */
	int bulk_irq; /* pops per iteration in hrtimer (irq) context */
	unsigned long hrtimer; /* ms; 0 disables the hrtimer consumer */
	const char *name;
	struct ot_data data;
};

/* per-cpu worker */
struct ot_item {
	struct objpool_head *pool; /* pool head */
	struct ot_test *test; /* test parameters */

	/* bulk pop/push routine; irq=1 when called from hrtimer context */
	void (*worker)(struct ot_item *item, int irq);

	/* hrtimer control */
	ktime_t hrtcycle;
	struct hrtimer hrtimer;

	int bulk[2]; /* for thread and irq */
	int delay;
	u32 niters; /* iteration counter, used for periodic msleep */

	/* summary per thread */
	struct ot_obj_stat stat[2]; /* thread and irq */
	u64 duration; /* this worker's run time, us */
};
/*
 * memory leakage checking
 */

/* kzalloc wrapper that records allocated bytes for leak reporting */
static void *ot_kzalloc(struct ot_test *test, long size)
{
	void *ptr = kzalloc(size, GFP_KERNEL);

	if (ptr)
		atomic_long_add(size, &test->data.kmalloc.alloc);
	return ptr;
}

/* kfree wrapper that records freed bytes for leak reporting */
static void ot_kfree(struct ot_test *test, void *ptr, long size)
{
	if (!ptr)
		return;
	atomic_long_add(size, &test->data.kmalloc.free);
	kfree(ptr);
}

/* print alloc/free balance; a non-zero difference indicates a leak */
static void ot_mem_report(struct ot_test *test)
{
	long alloc, free;

	pr_info("memory allocation summary for %s\n", test->name);

	alloc = atomic_long_read(&test->data.kmalloc.alloc);
	free = atomic_long_read(&test->data.kmalloc.free);
	pr_info("  kmalloc: %lu - %lu = %lu\n", alloc, free, alloc - free);

	alloc = atomic_long_read(&test->data.vmalloc.alloc);
	free = atomic_long_read(&test->data.vmalloc.free);
	pr_info("  vmalloc: %lu - %lu = %lu\n", alloc, free, alloc - free);
}

/* user object instance */
struct ot_node {
	void *owner;
	unsigned long data;
	unsigned long refs;
	unsigned long payload[32];
};

/* user objpool manager */
struct ot_context {
	struct objpool_head pool; /* objpool head */
	struct ot_test *test; /* test parameters */
	void *ptr; /* user pool buffer */
	unsigned long size; /* buffer size */
	struct rcu_head rcu; /* for async (RCU-deferred) finalization */
};

static DEFINE_PER_CPU(struct ot_item, ot_pcup_items);

/* reset per-testcase control data; nthreads starts at 1 for the launcher */
static int ot_init_data(struct ot_data *data)
{
	memset(data, 0, sizeof(*data));
	init_rwsem(&data->start);
	init_completion(&data->wait);
	init_completion(&data->rcu);
	atomic_set(&data->nthreads, 1);

	return 0;
}

/* objpool per-object init callback: tag each node with its owning pool */
static int ot_init_node(void *nod, void *context)
{
	struct ot_context *sop = context;
	struct ot_node *on = nod;

	on->owner = &sop->pool;
	return 0;
}

/* hrtimer callback: drive the worker in (softirq) timer context */
static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt)
{
	struct ot_item *item = container_of(hrt, struct ot_item, hrtimer);
	struct ot_test *test = item->test;

	if (atomic_read_acquire(&test->data.stop))
		return HRTIMER_NORESTART;

	/* do bulk-testings for objects pop/push */
	item->worker(item, 1);

	hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle);
	return HRTIMER_RESTART;
}

static void ot_start_hrtimer(struct ot_item *item)
{
	if (!item->test->hrtimer)
		return;
	hrtimer_start(&item->hrtimer, item->hrtcycle, HRTIMER_MODE_REL);
}

static void ot_stop_hrtimer(struct ot_item *item)
{
	if (!item->test->hrtimer)
		return;
	hrtimer_cancel(&item->hrtimer);
}

/* arm the per-worker hrtimer; -ENOENT when the testcase has no hrtimer */
static int ot_init_hrtimer(struct ot_item *item, unsigned long hrtimer)
{
	struct hrtimer *hrt = &item->hrtimer;

	if (!hrtimer)
		return -ENOENT;

	item->hrtcycle = ktime_set(0, hrtimer * 1000000UL); /* ms -> ns */
	hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrt->function = ot_hrtimer_handler;
	return 0;
}

/* prepare one per-cpu worker item for the given testcase */
static int ot_init_cpu_item(struct ot_item *item,
			struct ot_test *test,
			struct objpool_head *pool,
			void (*worker)(struct ot_item *, int))
{
	memset(item, 0, sizeof(*item));
	item->pool = pool;
	item->test = test;
	item->worker = worker;

	item->bulk[0] = test->bulk_normal;
	item->bulk[1] = test->bulk_irq;
	item->delay = test->delay;

	/* initialize hrtimer (no-op when test->hrtimer is 0) */
	ot_init_hrtimer(item, item->test->hrtimer);
	return 0;
}

/* kthread body: wait at the start line, then loop the worker until stop */
static int ot_thread_worker(void *arg)
{
	struct ot_item *item = arg;
	struct ot_test *test = item->test;
	ktime_t start;

	atomic_inc(&test->data.nthreads);
	/* blocks here until the launcher drops its write lock */
	down_read(&test->data.start);
	up_read(&test->data.start);
	start = ktime_get();
	ot_start_hrtimer(item);
	do {
		if (atomic_read_acquire(&test->data.stop))
			break;
		/* do bulk-testings for objects pop/push */
		item->worker(item, 0);
	} while (!kthread_should_stop());
	ot_stop_hrtimer(item);
	item->duration = (u64) ktime_us_delta(ktime_get(), start);
	/* last thread out wakes the launcher */
	if (atomic_dec_and_test(&test->data.nthreads))
		complete(&test->data.wait);

	return 0;
}

/* aggregate and print per-cpu hit/miss statistics for a testcase */
static void ot_perf_report(struct ot_test *test, u64 duration)
{
	struct ot_obj_stat total, normal = {0}, irq = {0};
	int cpu, nthreads = 0;

	pr_info("\n");
	pr_info("Testing summary for %s\n", test->name);

	for_each_possible_cpu(cpu) {
		struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu);
		/* duration == 0 means this cpu never ran a worker */
		if (!item->duration)
			continue;
		normal.nhits += item->stat[0].nhits;
		normal.nmiss += item->stat[0].nmiss;
		irq.nhits += item->stat[1].nhits;
		irq.nmiss += item->stat[1].nmiss;
		pr_info("CPU: %d  duration: %lluus\n", cpu, item->duration);
		pr_info("\tthread:\t%16lu hits \t%16lu miss\n",
			item->stat[0].nhits, item->stat[0].nmiss);
		pr_info("\tirq:   \t%16lu hits \t%16lu miss\n",
			item->stat[1].nhits, item->stat[1].nmiss);
		pr_info("\ttotal: \t%16lu hits \t%16lu miss\n",
			item->stat[0].nhits + item->stat[1].nhits,
			item->stat[0].nmiss + item->stat[1].nmiss);
		nthreads++;
	}

	total.nhits = normal.nhits + irq.nhits;
	total.nmiss = normal.nmiss + irq.nmiss;

	pr_info("ALL: \tnthreads: %d  duration: %lluus\n", nthreads, duration);
	pr_info("SUM: \t%16lu hits \t%16lu miss\n",
		total.nhits, total.nmiss);

	test->data.objects = total;
	test->data.duration = duration;
}

/*
 * synchronous test cases for objpool manipulation
 */

/* objpool manipulation for synchronous mode (percpu objpool) */
static struct ot_context *ot_init_sync_m0(struct ot_test *test)
{
	struct ot_context *sop = NULL;
	/* 8 objects per possible cpu */
	int max = num_possible_cpus() << 3;
	gfp_t gfp = GFP_KERNEL;

	sop = (struct ot_context *)ot_kzalloc(test, sizeof(*sop));
	if (!sop)
		return NULL;
	sop->test = test;
	/* small objects exercise the kmalloc (GFP_ATOMIC) slot path */
	if (test->objsz < 512)
		gfp = GFP_ATOMIC;

	if (objpool_init(&sop->pool, max, test->objsz,
			 gfp, sop, ot_init_node, NULL)) {
		ot_kfree(test, sop, sizeof(*sop));
		return NULL;
	}
	WARN_ON(max != sop->pool.nr_objs);

	return sop;
}

static void ot_fini_sync(struct ot_context *sop)
{
	objpool_fini(&sop->pool);
	ot_kfree(sop->test, sop, sizeof(*sop));
}

struct {
	struct ot_context * (*init)(struct ot_test *oc);
	void (*fini)(struct ot_context *sop);
} g_ot_sync_ops[] = {
	{.init = ot_init_sync_m0, .fini = ot_fini_sync},
};

/*
 * synchronous test cases: performance mode
 */

/* pop a bulk of objects, optionally sleep, then push them all back */
static void ot_bulk_sync(struct ot_item *item, int irq)
{
	struct ot_node *nods[OT_NR_MAX_BULK];
	int i;

	for (i = 0; i < item->bulk[irq]; i++)
		nods[i] = objpool_pop(item->pool);

	/* thread context only: delay every iteration, or every 0x8000 iters */
	if (!irq && (item->delay || !(++(item->niters) & 0x7FFF)))
		msleep(item->delay);

	while (i-- > 0) {
		struct ot_node *on = nods[i];
		if (on) {
			on->refs++;
			objpool_push(on, item->pool);
			item->stat[irq].nhits++;
		} else {
			item->stat[irq].nmiss++;
		}
	}
}

/* run one synchronous testcase: spawn per-cpu workers, time, report */
static int ot_start_sync(struct ot_test *test)
{
	struct ot_context *sop;
	ktime_t start;
	u64 duration;
	unsigned long timeout;
	int cpu;

	/* initialize objpool for synchronous testcase */
	sop = g_ot_sync_ops[test->mode].init(test);
	if (!sop)
		return -ENOMEM;

	/* grab rwsem to block testing threads */
	down_write(&test->data.start);

	for_each_possible_cpu(cpu) {
		struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu);
		struct task_struct *work;

		ot_init_cpu_item(item, test, &sop->pool, ot_bulk_sync);

		/* skip offline cpus */
		if (!cpu_online(cpu))
			continue;

		work = kthread_create_on_node(ot_thread_worker, item,
				cpu_to_node(cpu), "ot_worker_%d", cpu);
		if (IS_ERR(work)) {
			pr_err("failed to create thread for cpu %d\n", cpu);
		} else {
			kthread_bind(work, cpu);
			wake_up_process(work);
		}
	}

	/* wait a while to make sure all threads waiting at start line */
	msleep(20);

	/* in case no threads were created: memory insufficient ? */
	if (atomic_dec_and_test(&test->data.nthreads))
		complete(&test->data.wait);

	// sched_set_fifo_low(current);

	/* start objpool testing threads */
	start = ktime_get();
	up_write(&test->data.start);

	/* yield cpu to worker threads for duration ms */
	timeout = msecs_to_jiffies(test->duration);
	schedule_timeout_interruptible(timeout);

	/* tell worker threads to quit */
	atomic_set_release(&test->data.stop, 1);

	/* wait for all worker threads to finish and quit */
	wait_for_completion(&test->data.wait);
	duration = (u64) ktime_us_delta(ktime_get(), start);

	/* cleanup objpool */
	g_ot_sync_ops[test->mode].fini(sop);

	/* report testing summary and performance results */
	ot_perf_report(test, duration);

	/* report memory allocation summary */
	ot_mem_report(test);

	return 0;
}

/*
 * asynchronous test cases: pool lifecycle controlled by refcount
 */

/* RCU callback: finalize the pool once all cpus saw the stop event */
static void ot_fini_async_rcu(struct rcu_head *rcu)
{
	struct ot_context *sop = container_of(rcu, struct ot_context, rcu);
	struct ot_test *test = sop->test;

	/* here all cpus are aware of the stop event: test->data.stop = 1 */
	WARN_ON(!atomic_read_acquire(&test->data.stop));

	objpool_fini(&sop->pool);
	complete(&test->data.rcu);
}

static void ot_fini_async(struct ot_context *sop)
{
	/* make sure the stop event is acknowledged by all cores */
	call_rcu(&sop->rcu, ot_fini_async_rcu);
}

/* objpool release callback: free the user context with the pool */
static int ot_objpool_release(struct objpool_head *head, void *context)
{
	struct ot_context *sop = context;

	WARN_ON(!head || !sop || head != &sop->pool);

	/* do context cleaning if needed */
	if (sop)
		ot_kfree(sop->test, sop, sizeof(*sop));

	return 0;
}

/* async variant of ot_init_sync_m0: registers the release callback */
static struct ot_context *ot_init_async_m0(struct ot_test *test)
{
	struct ot_context *sop = NULL;
	int max = num_possible_cpus() << 3;
	gfp_t gfp = GFP_KERNEL;

	sop = (struct ot_context *)ot_kzalloc(test, sizeof(*sop));
	if (!sop)
		return NULL;
	sop->test = test;
	if (test->objsz < 512)
		gfp = GFP_ATOMIC;

	if (objpool_init(&sop->pool, max, test->objsz, gfp, sop,
			 ot_init_node, ot_objpool_release)) {
		ot_kfree(test, sop, sizeof(*sop));
		return NULL;
	}
	WARN_ON(max != sop->pool.nr_objs);

	return sop;
}

struct {
	struct ot_context * (*init)(struct ot_test *oc);
	void (*fini)(struct ot_context *sop);
} g_ot_async_ops[] = {
	{.init = ot_init_async_m0, .fini = ot_fini_async},
};

/* recycle a node: push it back, or drop it for good once stopping */
static void ot_nod_recycle(struct ot_node *on, struct objpool_head *pool,
			int release)
{
	struct ot_context *sop;

	on->refs++;

	if (!release) {
		/* push object back to objpool for reuse */
		objpool_push(on, pool);
		return;
	}

	sop = container_of(pool, struct ot_context, pool);
	WARN_ON(sop != pool->context);

	/* unref objpool with nod removed forever */
	objpool_drop(on, pool);
}

/* async worker iteration: pop a bulk, then recycle or drop on stop */
static void ot_bulk_async(struct ot_item *item, int irq)
{
	struct ot_test *test = item->test;
	struct ot_node *nods[OT_NR_MAX_BULK];
	int i, stop;

	for (i = 0; i < item->bulk[irq]; i++)
		nods[i] = objpool_pop(item->pool);

	if (!irq) {
		if (item->delay || !(++(item->niters) & 0x7FFF))
			msleep(item->delay);
		/* pin to cpu so the stop check and recycle stay coherent */
		get_cpu();
	}

	stop = atomic_read_acquire(&test->data.stop);

	/* drop all objects and deref objpool */
	while (i-- > 0) {
		struct ot_node *on = nods[i];

		if (on) {
			on->refs++;
			ot_nod_recycle(on, item->pool, stop);
			item->stat[irq].nhits++;
		} else {
			item->stat[irq].nmiss++;
		}
	}

	if (!irq)
		put_cpu();
}

/* run one asynchronous testcase; pool is torn down via refcount + RCU */
static int ot_start_async(struct ot_test *test)
{
	struct ot_context *sop;
	ktime_t start;
	u64 duration;
	unsigned long timeout;
	int cpu;

	/* initialize objpool for asynchronous testcase */
	sop = g_ot_async_ops[test->mode].init(test);
	if (!sop)
		return -ENOMEM;

	/* grab rwsem to block testing threads */
	down_write(&test->data.start);

	for_each_possible_cpu(cpu) {
		struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu);
		struct task_struct *work;

		ot_init_cpu_item(item, test, &sop->pool, ot_bulk_async);

		/* skip offline cpus */
		if (!cpu_online(cpu))
			continue;

		work = kthread_create_on_node(ot_thread_worker, item,
				cpu_to_node(cpu), "ot_worker_%d", cpu);
		if (IS_ERR(work)) {
			pr_err("failed to create thread for cpu %d\n", cpu);
		} else {
			kthread_bind(work, cpu);
			wake_up_process(work);
		}
	}

	/* wait a while to make sure all threads waiting at start line */
	msleep(20);

	/* in case no threads were created: memory insufficient ? */
	if (atomic_dec_and_test(&test->data.nthreads))
		complete(&test->data.wait);

	/* start objpool testing threads */
	start = ktime_get();
	up_write(&test->data.start);

	/* yield cpu to worker threads for duration ms */
	timeout = msecs_to_jiffies(test->duration);
	schedule_timeout_interruptible(timeout);

	/* tell worker threads to quit */
	atomic_set_release(&test->data.stop, 1);

	/* do async-finalization */
	g_ot_async_ops[test->mode].fini(sop);

	/* wait for all worker threads to finish and quit */
	wait_for_completion(&test->data.wait);
	duration = (u64) ktime_us_delta(ktime_get(), start);

	/* make sure the rcu callback is triggered */
	wait_for_completion(&test->data.rcu);

	/*
	 * now we are sure that objpool is finalized either
	 * by rcu callback or by worker threads
	 */

	/* report testing summary and performance results */
	ot_perf_report(test, duration);

	/* report memory allocation summary */
	ot_mem_report(test);

	return 0;
}

/*
 * predefined testing cases:
 * synchronous case / overrun case / async case
 *
 * async: synchronous or asynchronous testing
 * mode: only mode 0 supported
 * objsz: object size
 * duration: int, total test time in ms
 * delay: int, delay (in ms) between each iteration
 * bulk_normal: int, repeat times for thread worker
 * bulk_irq: int, repeat times for irq consumer
 * hrtimer: unsigned long, hrtimer interval in ms
 * name: char *, tag for current test ot_item
 */

#define NODE_COMPACT sizeof(struct ot_node)
#define NODE_VMALLOC (512)	/* >= 512 forces the vmalloc slot path */

struct ot_test g_testcases[] = {

	/* sync & normal */
	{0, 0, NODE_COMPACT, 1000, 0, 1, 0, 0, "sync: percpu objpool"},
	{0, 0, NODE_VMALLOC, 1000, 0, 1, 0, 0, "sync: percpu objpool from vmalloc"},

	/* sync & hrtimer */
	{0, 0, NODE_COMPACT, 1000, 0, 1, 1, 4, "sync & hrtimer: percpu objpool"},
	{0, 0, NODE_VMALLOC, 1000, 0, 1, 1, 4, "sync & hrtimer: percpu objpool from vmalloc"},

	/* sync & overrun */
	{0, 0, NODE_COMPACT, 1000, 0, 16, 0, 0, "sync overrun: percpu objpool"},
	{0, 0, NODE_VMALLOC, 1000, 0, 16, 0, 0, "sync overrun: percpu objpool from vmalloc"},

	/* async mode */
	{1, 0, NODE_COMPACT, 1000, 100, 1, 0, 0, "async: percpu objpool"},
	{1, 0, NODE_VMALLOC, 1000, 100, 1, 0, 0, "async: percpu objpool from vmalloc"},

	/* async + hrtimer mode */
	{1, 0, NODE_COMPACT, 1000, 0, 4, 4, 4, "async & hrtimer: percpu objpool"},
	{1, 0, NODE_VMALLOC, 1000, 0, 4, 4, 4, "async & hrtimer: percpu objpool from vmalloc"},
};

static int __init ot_mod_init(void)
{
	int i;

	/* perform testings */
	for (i = 0; i < ARRAY_SIZE(g_testcases); i++) {
		ot_init_data(&g_testcases[i].data);
		if (g_testcases[i].async)
			ot_start_async(&g_testcases[i]);
		else
			ot_start_sync(&g_testcases[i]);
	}

	/* show tests summary */
	pr_info("\n");
	pr_info("Summary of testcases:\n");
	for (i = 0; i < ARRAY_SIZE(g_testcases); i++) {
		pr_info("    duration: %lluus \thits: %10lu \tmiss: %10lu \t%s\n",
			g_testcases[i].data.duration, g_testcases[i].data.objects.nhits,
			g_testcases[i].data.objects.nmiss, g_testcases[i].name);
	}

	/* -EAGAIN on purpose: the module never stays loaded after testing */
	return -EAGAIN;
}

static void __exit ot_mod_exit(void)
{
}

module_init(ot_mod_init);
module_exit(ot_mod_exit);

MODULE_LICENSE("GPL");
+1 -1
samples/kprobes/kretprobe_example.c
··· 35 35 ktime_t entry_stamp; 36 36 }; 37 37 38 - /* Here we use the entry_hanlder to timestamp function entry */ 38 + /* Here we use the entry_handler to timestamp function entry */ 39 39 static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) 40 40 { 41 41 struct my_data *data;