Linux Kernel Markers: support multiple probes

RCU style multiple probes support for the Linux Kernel Markers. Common case
(one probe) is still fast and does not require dynamic allocation or a
supplementary pointer dereference on the fast path.

- Move preempt disable from the marker site to the callback.

Since we now have an internal callback, move the preempt disable/enable to the
callback instead of the marker site.

Since the callback change is done asynchronously (passing from a handler that
supports arguments to a handler that does not set up the arguments if no
arguments are passed), we can safely update it even if it is outside the
preempt disable section.

- Move probe arm to probe connection. Now, a connected probe is automatically
armed.

Remove MARK_MAX_FORMAT_LEN, unused.

This patch modifies the Linux Kernel Markers API : it removes the probe
"arm/disarm" and changes the probe function prototype : it now expects a
va_list * instead of a "...".

If we want to have more than one probe connected to a marker at a given
time (LTTng, or blktrace, SystemTap) then we need this patch. Without it,
connecting a second probe handler to a marker will fail.

It allows us, for instance, to do interesting combinations:

Do standard tracing with LTTng and, possibly, compute statistics
with SystemTap, or have a special trigger on an event that would call
a SystemTap script which would stop flight recorder tracing.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Mason <mmlnx@us.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: David Smith <dsmith@redhat.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Mathieu Desnoyers and committed by Linus Torvalds fb40bd78 9170d2f6

+579 -250
+11 -20
arch/powerpc/platforms/cell/spufs/sputrace.c
··· 146 146 wake_up(&sputrace_wait); 147 147 } 148 148 149 - static void spu_context_event(const struct marker *mdata, 150 - void *private, const char *format, ...) 149 + static void spu_context_event(void *probe_private, void *call_data, 150 + const char *format, va_list *args) 151 151 { 152 - struct spu_probe *p = mdata->private; 153 - va_list ap; 152 + struct spu_probe *p = probe_private; 154 153 struct spu_context *ctx; 155 154 struct spu *spu; 156 155 157 - va_start(ap, format); 158 - ctx = va_arg(ap, struct spu_context *); 159 - spu = va_arg(ap, struct spu *); 156 + ctx = va_arg(*args, struct spu_context *); 157 + spu = va_arg(*args, struct spu *); 160 158 161 159 sputrace_log_item(p->name, ctx, spu); 162 - va_end(ap); 163 160 } 164 161 165 - static void spu_context_nospu_event(const struct marker *mdata, 166 - void *private, const char *format, ...) 162 + static void spu_context_nospu_event(void *probe_private, void *call_data, 163 + const char *format, va_list *args) 167 164 { 168 - struct spu_probe *p = mdata->private; 169 - va_list ap; 165 + struct spu_probe *p = probe_private; 170 166 struct spu_context *ctx; 171 167 172 - va_start(ap, format); 173 - ctx = va_arg(ap, struct spu_context *); 168 + ctx = va_arg(*args, struct spu_context *); 174 169 175 170 sputrace_log_item(p->name, ctx, NULL); 176 - va_end(ap); 177 171 } 178 172 179 173 struct spu_probe spu_probes[] = { ··· 213 219 if (error) 214 220 printk(KERN_INFO "Unable to register probe %s\n", 215 221 p->name); 216 - 217 - error = marker_arm(p->name); 218 - if (error) 219 - printk(KERN_INFO "Unable to arm probe %s\n", p->name); 220 222 } 221 223 222 224 return 0; ··· 228 238 int i; 229 239 230 240 for (i = 0; i < ARRAY_SIZE(spu_probes); i++) 231 - marker_probe_unregister(spu_probes[i].name); 241 + marker_probe_unregister(spu_probes[i].name, 242 + spu_probes[i].probe_func, &spu_probes[i]); 232 243 233 244 remove_proc_entry("sputrace", NULL); 234 245 kfree(sputrace_log);
+36 -23
include/linux/marker.h
··· 19 19 20 20 /** 21 21 * marker_probe_func - Type of a marker probe function 22 - * @mdata: pointer of type struct marker 23 - * @private_data: caller site private data 22 + * @probe_private: probe private data 23 + * @call_private: call site private data 24 24 * @fmt: format string 25 - * @...: variable argument list 25 + * @args: variable argument list pointer. Use a pointer to overcome C's 26 + * inability to pass this around as a pointer in a portable manner in 27 + * the callee otherwise. 26 28 * 27 29 * Type of marker probe functions. They receive the mdata and need to parse the 28 30 * format string to recover the variable argument list. 29 31 */ 30 - typedef void marker_probe_func(const struct marker *mdata, 31 - void *private_data, const char *fmt, ...); 32 + typedef void marker_probe_func(void *probe_private, void *call_private, 33 + const char *fmt, va_list *args); 34 + 35 + struct marker_probe_closure { 36 + marker_probe_func *func; /* Callback */ 37 + void *probe_private; /* Private probe data */ 38 + }; 32 39 33 40 struct marker { 34 41 const char *name; /* Marker name */ ··· 43 36 * variable argument list. 44 37 */ 45 38 char state; /* Marker state. */ 46 - marker_probe_func *call;/* Probe handler function pointer */ 47 - void *private; /* Private probe data */ 39 + char ptype; /* probe type : 0 : single, 1 : multi */ 40 + void (*call)(const struct marker *mdata, /* Probe wrapper */ 41 + void *call_private, const char *fmt, ...); 42 + struct marker_probe_closure single; 43 + struct marker_probe_closure *multi; 48 44 } __attribute__((aligned(8))); 49 45 50 46 #ifdef CONFIG_MARKERS ··· 59 49 * not add unwanted padding between the beginning of the section and the 60 50 * structure. Force alignment to the same alignment as the section start. 61 51 */ 62 - #define __trace_mark(name, call_data, format, args...) \ 52 + #define __trace_mark(name, call_private, format, args...) 
\ 63 53 do { \ 64 54 static const char __mstrtab_name_##name[] \ 65 55 __attribute__((section("__markers_strings"))) \ ··· 70 60 static struct marker __mark_##name \ 71 61 __attribute__((section("__markers"), aligned(8))) = \ 72 62 { __mstrtab_name_##name, __mstrtab_format_##name, \ 73 - 0, __mark_empty_function, NULL }; \ 63 + 0, 0, marker_probe_cb, \ 64 + { __mark_empty_function, NULL}, NULL }; \ 74 65 __mark_check_format(format, ## args); \ 75 66 if (unlikely(__mark_##name.state)) { \ 76 - preempt_disable(); \ 77 67 (*__mark_##name.call) \ 78 - (&__mark_##name, call_data, \ 68 + (&__mark_##name, call_private, \ 79 69 format, ## args); \ 80 - preempt_enable(); \ 81 70 } \ 82 71 } while (0) 83 72 84 73 extern void marker_update_probe_range(struct marker *begin, 85 - struct marker *end, struct module *probe_module, int *refcount); 74 + struct marker *end); 86 75 #else /* !CONFIG_MARKERS */ 87 - #define __trace_mark(name, call_data, format, args...) \ 76 + #define __trace_mark(name, call_private, format, args...) \ 88 77 __mark_check_format(format, ## args) 89 78 static inline void marker_update_probe_range(struct marker *begin, 90 - struct marker *end, struct module *probe_module, int *refcount) 79 + struct marker *end) 91 80 { } 92 81 #endif /* CONFIG_MARKERS */ 93 82 ··· 101 92 #define trace_mark(name, format, args...) \ 102 93 __trace_mark(name, NULL, format, ## args) 103 94 104 - #define MARK_MAX_FORMAT_LEN 1024 105 - 106 95 /** 107 96 * MARK_NOARGS - Format string for a marker with no argument. 108 97 */ ··· 113 106 114 107 extern marker_probe_func __mark_empty_function; 115 108 109 + extern void marker_probe_cb(const struct marker *mdata, 110 + void *call_private, const char *fmt, ...); 111 + extern void marker_probe_cb_noarg(const struct marker *mdata, 112 + void *call_private, const char *fmt, ...); 113 + 116 114 /* 117 115 * Connect a probe to a marker. 118 116 * private data pointer must be a valid allocated memory address, or NULL. 
119 117 */ 120 118 extern int marker_probe_register(const char *name, const char *format, 121 - marker_probe_func *probe, void *private); 119 + marker_probe_func *probe, void *probe_private); 122 120 123 121 /* 124 122 * Returns the private data given to marker_probe_register. 125 123 */ 126 - extern void *marker_probe_unregister(const char *name); 124 + extern int marker_probe_unregister(const char *name, 125 + marker_probe_func *probe, void *probe_private); 127 126 /* 128 127 * Unregister a marker by providing the registered private data. 129 128 */ 130 - extern void *marker_probe_unregister_private_data(void *private); 129 + extern int marker_probe_unregister_private_data(marker_probe_func *probe, 130 + void *probe_private); 131 131 132 - extern int marker_arm(const char *name); 133 - extern int marker_disarm(const char *name); 134 - extern void *marker_get_private_data(const char *name); 132 + extern void *marker_get_private_data(const char *name, marker_probe_func *probe, 133 + int num); 135 134 136 135 #endif
+1 -1
include/linux/module.h
··· 465 465 466 466 extern void print_modules(void); 467 467 468 - extern void module_update_markers(struct module *probe_module, int *refcount); 468 + extern void module_update_markers(void); 469 469 470 470 #else /* !CONFIG_MODULES... */ 471 471 #define EXPORT_SYMBOL(sym)
+519 -186
kernel/marker.c
··· 27 27 extern struct marker __start___markers[]; 28 28 extern struct marker __stop___markers[]; 29 29 30 - /* 31 - * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 32 - * and module markers, the hash table and deferred_sync. 33 - */ 34 - static DEFINE_MUTEX(markers_mutex); 30 + /* Set to 1 to enable marker debug output */ 31 + const int marker_debug; 35 32 36 33 /* 37 - * Marker deferred synchronization. 38 - * Upon marker probe_unregister, we delay call to synchronize_sched() to 39 - * accelerate mass unregistration (only when there is no more reference to a 40 - * given module do we call synchronize_sched()). However, we need to make sure 41 - * every critical region has ended before we re-arm a marker that has been 42 - * unregistered and then registered back with a different probe data. 34 + * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 35 + * and module markers and the hash table. 43 36 */ 44 - static int deferred_sync; 37 + static DEFINE_MUTEX(markers_mutex); 45 38 46 39 /* 47 40 * Marker hash table, containing the active markers. ··· 43 50 #define MARKER_HASH_BITS 6 44 51 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45 52 53 + /* 54 + * Note about RCU : 55 + * It is used to make sure every handler has finished using its private data 56 + * between two consecutive operation (add or remove) on a given marker. It is 57 + * also used to delay the free of multiple probes array until a quiescent state 58 + * is reached. 59 + * marker entries modifications are protected by the markers_mutex. 60 + */ 46 61 struct marker_entry { 47 62 struct hlist_node hlist; 48 63 char *format; 49 - marker_probe_func *probe; 50 - void *private; 64 + void (*call)(const struct marker *mdata, /* Probe wrapper */ 65 + void *call_private, const char *fmt, ...); 66 + struct marker_probe_closure single; 67 + struct marker_probe_closure *multi; 51 68 int refcount; /* Number of times armed. 0 if disarmed. 
*/ 69 + struct rcu_head rcu; 70 + void *oldptr; 71 + char rcu_pending:1; 72 + char ptype:1; 52 73 char name[0]; /* Contains name'\0'format'\0' */ 53 74 }; 54 75 ··· 70 63 71 64 /** 72 65 * __mark_empty_function - Empty probe callback 73 - * @mdata: pointer of type const struct marker 66 + * @probe_private: probe private data 67 + * @call_private: call site private data 74 68 * @fmt: format string 75 69 * @...: variable argument list 76 70 * ··· 80 72 * though the function pointer change and the marker enabling are two distinct 81 73 * operations that modifies the execution flow of preemptible code. 82 74 */ 83 - void __mark_empty_function(const struct marker *mdata, void *private, 84 - const char *fmt, ...) 75 + void __mark_empty_function(void *probe_private, void *call_private, 76 + const char *fmt, va_list *args) 85 77 { 86 78 } 87 79 EXPORT_SYMBOL_GPL(__mark_empty_function); 80 + 81 + /* 82 + * marker_probe_cb Callback that prepares the variable argument list for probes. 83 + * @mdata: pointer of type struct marker 84 + * @call_private: caller site private data 85 + * @fmt: format string 86 + * @...: Variable argument list. 87 + * 88 + * Since we do not use "typical" pointer based RCU in the 1 argument case, we 89 + * need to put a full smp_rmb() in this branch. This is why we do not use 90 + * rcu_dereference() for the pointer read. 91 + */ 92 + void marker_probe_cb(const struct marker *mdata, void *call_private, 93 + const char *fmt, ...) 94 + { 95 + va_list args; 96 + char ptype; 97 + 98 + /* 99 + * disabling preemption to make sure the teardown of the callbacks can 100 + * be done correctly when they are in modules and they insure RCU read 101 + * coherency. 102 + */ 103 + preempt_disable(); 104 + ptype = ACCESS_ONCE(mdata->ptype); 105 + if (likely(!ptype)) { 106 + marker_probe_func *func; 107 + /* Must read the ptype before ptr. They are not data dependant, 108 + * so we put an explicit smp_rmb() here. 
*/ 109 + smp_rmb(); 110 + func = ACCESS_ONCE(mdata->single.func); 111 + /* Must read the ptr before private data. They are not data 112 + * dependant, so we put an explicit smp_rmb() here. */ 113 + smp_rmb(); 114 + va_start(args, fmt); 115 + func(mdata->single.probe_private, call_private, fmt, &args); 116 + va_end(args); 117 + } else { 118 + struct marker_probe_closure *multi; 119 + int i; 120 + /* 121 + * multi points to an array, therefore accessing the array 122 + * depends on reading multi. However, even in this case, 123 + * we must insure that the pointer is read _before_ the array 124 + * data. Same as rcu_dereference, but we need a full smp_rmb() 125 + * in the fast path, so put the explicit barrier here. 126 + */ 127 + smp_read_barrier_depends(); 128 + multi = ACCESS_ONCE(mdata->multi); 129 + for (i = 0; multi[i].func; i++) { 130 + va_start(args, fmt); 131 + multi[i].func(multi[i].probe_private, call_private, fmt, 132 + &args); 133 + va_end(args); 134 + } 135 + } 136 + preempt_enable(); 137 + } 138 + EXPORT_SYMBOL_GPL(marker_probe_cb); 139 + 140 + /* 141 + * marker_probe_cb Callback that does not prepare the variable argument list. 142 + * @mdata: pointer of type struct marker 143 + * @call_private: caller site private data 144 + * @fmt: format string 145 + * @...: Variable argument list. 146 + * 147 + * Should be connected to markers "MARK_NOARGS". 148 + */ 149 + void marker_probe_cb_noarg(const struct marker *mdata, 150 + void *call_private, const char *fmt, ...) 151 + { 152 + va_list args; /* not initialized */ 153 + char ptype; 154 + 155 + preempt_disable(); 156 + ptype = ACCESS_ONCE(mdata->ptype); 157 + if (likely(!ptype)) { 158 + marker_probe_func *func; 159 + /* Must read the ptype before ptr. They are not data dependant, 160 + * so we put an explicit smp_rmb() here. */ 161 + smp_rmb(); 162 + func = ACCESS_ONCE(mdata->single.func); 163 + /* Must read the ptr before private data. 
They are not data 164 + * dependant, so we put an explicit smp_rmb() here. */ 165 + smp_rmb(); 166 + func(mdata->single.probe_private, call_private, fmt, &args); 167 + } else { 168 + struct marker_probe_closure *multi; 169 + int i; 170 + /* 171 + * multi points to an array, therefore accessing the array 172 + * depends on reading multi. However, even in this case, 173 + * we must insure that the pointer is read _before_ the array 174 + * data. Same as rcu_dereference, but we need a full smp_rmb() 175 + * in the fast path, so put the explicit barrier here. 176 + */ 177 + smp_read_barrier_depends(); 178 + multi = ACCESS_ONCE(mdata->multi); 179 + for (i = 0; multi[i].func; i++) 180 + multi[i].func(multi[i].probe_private, call_private, fmt, 181 + &args); 182 + } 183 + preempt_enable(); 184 + } 185 + EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); 186 + 187 + static void free_old_closure(struct rcu_head *head) 188 + { 189 + struct marker_entry *entry = container_of(head, 190 + struct marker_entry, rcu); 191 + kfree(entry->oldptr); 192 + /* Make sure we free the data before setting the pending flag to 0 */ 193 + smp_wmb(); 194 + entry->rcu_pending = 0; 195 + } 196 + 197 + static void debug_print_probes(struct marker_entry *entry) 198 + { 199 + int i; 200 + 201 + if (!marker_debug) 202 + return; 203 + 204 + if (!entry->ptype) { 205 + printk(KERN_DEBUG "Single probe : %p %p\n", 206 + entry->single.func, 207 + entry->single.probe_private); 208 + } else { 209 + for (i = 0; entry->multi[i].func; i++) 210 + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, 211 + entry->multi[i].func, 212 + entry->multi[i].probe_private); 213 + } 214 + } 215 + 216 + static struct marker_probe_closure * 217 + marker_entry_add_probe(struct marker_entry *entry, 218 + marker_probe_func *probe, void *probe_private) 219 + { 220 + int nr_probes = 0; 221 + struct marker_probe_closure *old, *new; 222 + 223 + WARN_ON(!probe); 224 + 225 + debug_print_probes(entry); 226 + old = entry->multi; 227 + if 
(!entry->ptype) { 228 + if (entry->single.func == probe && 229 + entry->single.probe_private == probe_private) 230 + return ERR_PTR(-EBUSY); 231 + if (entry->single.func == __mark_empty_function) { 232 + /* 0 -> 1 probes */ 233 + entry->single.func = probe; 234 + entry->single.probe_private = probe_private; 235 + entry->refcount = 1; 236 + entry->ptype = 0; 237 + debug_print_probes(entry); 238 + return NULL; 239 + } else { 240 + /* 1 -> 2 probes */ 241 + nr_probes = 1; 242 + old = NULL; 243 + } 244 + } else { 245 + /* (N -> N+1), (N != 0, 1) probes */ 246 + for (nr_probes = 0; old[nr_probes].func; nr_probes++) 247 + if (old[nr_probes].func == probe 248 + && old[nr_probes].probe_private 249 + == probe_private) 250 + return ERR_PTR(-EBUSY); 251 + } 252 + /* + 2 : one for new probe, one for NULL func */ 253 + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), 254 + GFP_KERNEL); 255 + if (new == NULL) 256 + return ERR_PTR(-ENOMEM); 257 + if (!old) 258 + new[0] = entry->single; 259 + else 260 + memcpy(new, old, 261 + nr_probes * sizeof(struct marker_probe_closure)); 262 + new[nr_probes].func = probe; 263 + new[nr_probes].probe_private = probe_private; 264 + entry->refcount = nr_probes + 1; 265 + entry->multi = new; 266 + entry->ptype = 1; 267 + debug_print_probes(entry); 268 + return old; 269 + } 270 + 271 + static struct marker_probe_closure * 272 + marker_entry_remove_probe(struct marker_entry *entry, 273 + marker_probe_func *probe, void *probe_private) 274 + { 275 + int nr_probes = 0, nr_del = 0, i; 276 + struct marker_probe_closure *old, *new; 277 + 278 + old = entry->multi; 279 + 280 + debug_print_probes(entry); 281 + if (!entry->ptype) { 282 + /* 0 -> N is an error */ 283 + WARN_ON(entry->single.func == __mark_empty_function); 284 + /* 1 -> 0 probes */ 285 + WARN_ON(probe && entry->single.func != probe); 286 + WARN_ON(entry->single.probe_private != probe_private); 287 + entry->single.func = __mark_empty_function; 288 + entry->refcount = 0; 289 + 
entry->ptype = 0; 290 + debug_print_probes(entry); 291 + return NULL; 292 + } else { 293 + /* (N -> M), (N > 1, M >= 0) probes */ 294 + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 295 + if ((!probe || old[nr_probes].func == probe) 296 + && old[nr_probes].probe_private 297 + == probe_private) 298 + nr_del++; 299 + } 300 + } 301 + 302 + if (nr_probes - nr_del == 0) { 303 + /* N -> 0, (N > 1) */ 304 + entry->single.func = __mark_empty_function; 305 + entry->refcount = 0; 306 + entry->ptype = 0; 307 + } else if (nr_probes - nr_del == 1) { 308 + /* N -> 1, (N > 1) */ 309 + for (i = 0; old[i].func; i++) 310 + if ((probe && old[i].func != probe) || 311 + old[i].probe_private != probe_private) 312 + entry->single = old[i]; 313 + entry->refcount = 1; 314 + entry->ptype = 0; 315 + } else { 316 + int j = 0; 317 + /* N -> M, (N > 1, M > 1) */ 318 + /* + 1 for NULL */ 319 + new = kzalloc((nr_probes - nr_del + 1) 320 + * sizeof(struct marker_probe_closure), GFP_KERNEL); 321 + if (new == NULL) 322 + return ERR_PTR(-ENOMEM); 323 + for (i = 0; old[i].func; i++) 324 + if ((probe && old[i].func != probe) || 325 + old[i].probe_private != probe_private) 326 + new[j++] = old[i]; 327 + entry->refcount = nr_probes - nr_del; 328 + entry->ptype = 1; 329 + entry->multi = new; 330 + } 331 + debug_print_probes(entry); 332 + return old; 333 + } 88 334 89 335 /* 90 336 * Get marker if the marker is present in the marker hash table. ··· 364 102 * Add the marker to the marker hash table. Must be called with markers_mutex 365 103 * held. 
366 104 */ 367 - static int add_marker(const char *name, const char *format, 368 - marker_probe_func *probe, void *private) 105 + static struct marker_entry *add_marker(const char *name, const char *format) 369 106 { 370 107 struct hlist_head *head; 371 108 struct hlist_node *node; ··· 379 118 hlist_for_each_entry(e, node, head, hlist) { 380 119 if (!strcmp(name, e->name)) { 381 120 printk(KERN_NOTICE 382 - "Marker %s busy, probe %p already installed\n", 383 - name, e->probe); 384 - return -EBUSY; /* Already there */ 121 + "Marker %s busy\n", name); 122 + return ERR_PTR(-EBUSY); /* Already there */ 385 123 } 386 124 } 387 125 /* ··· 390 130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 391 131 GFP_KERNEL); 392 132 if (!e) 393 - return -ENOMEM; 133 + return ERR_PTR(-ENOMEM); 394 134 memcpy(&e->name[0], name, name_len); 395 135 if (format) { 396 136 e->format = &e->name[name_len]; 397 137 memcpy(e->format, format, format_len); 138 + if (strcmp(e->format, MARK_NOARGS) == 0) 139 + e->call = marker_probe_cb_noarg; 140 + else 141 + e->call = marker_probe_cb; 398 142 trace_mark(core_marker_format, "name %s format %s", 399 143 e->name, e->format); 400 - } else 144 + } else { 401 145 e->format = NULL; 402 - e->probe = probe; 403 - e->private = private; 146 + e->call = marker_probe_cb; 147 + } 148 + e->single.func = __mark_empty_function; 149 + e->single.probe_private = NULL; 150 + e->multi = NULL; 151 + e->ptype = 0; 404 152 e->refcount = 0; 153 + e->rcu_pending = 0; 405 154 hlist_add_head(&e->hlist, head); 406 - return 0; 155 + return e; 407 156 } 408 157 409 158 /* 410 159 * Remove the marker from the marker hash table. Must be called with mutex_lock 411 160 * held. 
412 161 */ 413 - static void *remove_marker(const char *name) 162 + static int remove_marker(const char *name) 414 163 { 415 164 struct hlist_head *head; 416 165 struct hlist_node *node; 417 166 struct marker_entry *e; 418 167 int found = 0; 419 168 size_t len = strlen(name) + 1; 420 - void *private = NULL; 421 169 u32 hash = jhash(name, len-1, 0); 422 170 423 171 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; ··· 435 167 break; 436 168 } 437 169 } 438 - if (found) { 439 - private = e->private; 440 - hlist_del(&e->hlist); 441 - kfree(e); 442 - } 443 - return private; 170 + if (!found) 171 + return -ENOENT; 172 + if (e->single.func != __mark_empty_function) 173 + return -EBUSY; 174 + hlist_del(&e->hlist); 175 + /* Make sure the call_rcu has been executed */ 176 + if (e->rcu_pending) 177 + rcu_barrier(); 178 + kfree(e); 179 + return 0; 444 180 } 445 181 446 182 /* ··· 456 184 size_t name_len = strlen((*entry)->name) + 1; 457 185 size_t format_len = strlen(format) + 1; 458 186 187 + 459 188 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, 460 189 GFP_KERNEL); 461 190 if (!e) ··· 464 191 memcpy(&e->name[0], (*entry)->name, name_len); 465 192 e->format = &e->name[name_len]; 466 193 memcpy(e->format, format, format_len); 467 - e->probe = (*entry)->probe; 468 - e->private = (*entry)->private; 194 + if (strcmp(e->format, MARK_NOARGS) == 0) 195 + e->call = marker_probe_cb_noarg; 196 + else 197 + e->call = marker_probe_cb; 198 + e->single = (*entry)->single; 199 + e->multi = (*entry)->multi; 200 + e->ptype = (*entry)->ptype; 469 201 e->refcount = (*entry)->refcount; 202 + e->rcu_pending = 0; 470 203 hlist_add_before(&e->hlist, &(*entry)->hlist); 471 204 hlist_del(&(*entry)->hlist); 205 + /* Make sure the call_rcu has been executed */ 206 + if ((*entry)->rcu_pending) 207 + rcu_barrier(); 472 208 kfree(*entry); 473 209 *entry = e; 474 210 trace_mark(core_marker_format, "name %s format %s", ··· 488 206 /* 489 207 * Sets the probe callback 
corresponding to one marker. 490 208 */ 491 - static int set_marker(struct marker_entry **entry, struct marker *elem) 209 + static int set_marker(struct marker_entry **entry, struct marker *elem, 210 + int active) 492 211 { 493 212 int ret; 494 213 WARN_ON(strcmp((*entry)->name, elem->name) != 0); ··· 509 226 if (ret) 510 227 return ret; 511 228 } 512 - elem->call = (*entry)->probe; 513 - elem->private = (*entry)->private; 514 - elem->state = 1; 229 + 230 + /* 231 + * probe_cb setup (statically known) is done here. It is 232 + * asynchronous with the rest of execution, therefore we only 233 + * pass from a "safe" callback (with argument) to an "unsafe" 234 + * callback (does not set arguments). 235 + */ 236 + elem->call = (*entry)->call; 237 + /* 238 + * Sanity check : 239 + * We only update the single probe private data when the ptr is 240 + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 241 + */ 242 + WARN_ON(elem->single.func != __mark_empty_function 243 + && elem->single.probe_private 244 + != (*entry)->single.probe_private && 245 + !elem->ptype); 246 + elem->single.probe_private = (*entry)->single.probe_private; 247 + /* 248 + * Make sure the private data is valid when we update the 249 + * single probe ptr. 250 + */ 251 + smp_wmb(); 252 + elem->single.func = (*entry)->single.func; 253 + /* 254 + * We also make sure that the new probe callbacks array is consistent 255 + * before setting a pointer to it. 256 + */ 257 + rcu_assign_pointer(elem->multi, (*entry)->multi); 258 + /* 259 + * Update the function or multi probe array pointer before setting the 260 + * ptype. 261 + */ 262 + smp_wmb(); 263 + elem->ptype = (*entry)->ptype; 264 + elem->state = active; 265 + 515 266 return 0; 516 267 } 517 268 ··· 557 240 */ 558 241 static void disable_marker(struct marker *elem) 559 242 { 243 + /* leave "call" as is. It is known statically. 
*/ 560 244 elem->state = 0; 561 - elem->call = __mark_empty_function; 245 + elem->single.func = __mark_empty_function; 246 + /* Update the function before setting the ptype */ 247 + smp_wmb(); 248 + elem->ptype = 0; /* single probe */ 562 249 /* 563 250 * Leave the private data and id there, because removal is racy and 564 251 * should be done only after a synchronize_sched(). These are never used ··· 574 253 * marker_update_probe_range - Update a probe range 575 254 * @begin: beginning of the range 576 255 * @end: end of the range 577 - * @probe_module: module address of the probe being updated 578 - * @refcount: number of references left to the given probe_module (out) 579 256 * 580 257 * Updates the probe callback corresponding to a range of markers. 581 258 */ 582 259 void marker_update_probe_range(struct marker *begin, 583 - struct marker *end, struct module *probe_module, 584 - int *refcount) 260 + struct marker *end) 585 261 { 586 262 struct marker *iter; 587 263 struct marker_entry *mark_entry; ··· 586 268 mutex_lock(&markers_mutex); 587 269 for (iter = begin; iter < end; iter++) { 588 270 mark_entry = get_marker(iter->name); 589 - if (mark_entry && mark_entry->refcount) { 590 - set_marker(&mark_entry, iter); 271 + if (mark_entry) { 272 + set_marker(&mark_entry, iter, 273 + !!mark_entry->refcount); 591 274 /* 592 275 * ignore error, continue 593 276 */ 594 - if (probe_module) 595 - if (probe_module == 596 - __module_text_address((unsigned long)mark_entry->probe)) 597 - (*refcount)++; 598 277 } else { 599 278 disable_marker(iter); 600 279 } ··· 604 289 * Issues a synchronize_sched() when no reference to the module passed 605 290 * as parameter is found in the probes so the probe module can be 606 291 * safely unloaded from now on. 292 + * 293 + * Internal callback only changed before the first probe is connected to it. 294 + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 295 + * transitions. 
All other transitions will leave the old private data valid. 296 + * This makes the non-atomicity of the callback/private data updates valid. 297 + * 298 + * "special case" updates : 299 + * 0 -> 1 callback 300 + * 1 -> 0 callback 301 + * 1 -> 2 callbacks 302 + * 2 -> 1 callbacks 303 + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. 304 + * Site effect : marker_set_format may delete the marker entry (creating a 305 + * replacement). 607 306 */ 608 - static void marker_update_probes(struct module *probe_module) 307 + static void marker_update_probes(void) 609 308 { 610 - int refcount = 0; 611 - 612 309 /* Core kernel markers */ 613 - marker_update_probe_range(__start___markers, 614 - __stop___markers, probe_module, &refcount); 310 + marker_update_probe_range(__start___markers, __stop___markers); 615 311 /* Markers in modules. */ 616 - module_update_markers(probe_module, &refcount); 617 - if (probe_module && refcount == 0) { 618 - synchronize_sched(); 619 - deferred_sync = 0; 620 - } 312 + module_update_markers(); 621 313 } 622 314 623 315 /** ··· 632 310 * @name: marker name 633 311 * @format: format string 634 312 * @probe: probe handler 635 - * @private: probe private data 313 + * @probe_private: probe private data 636 314 * 637 315 * private data must be a valid allocated memory address, or NULL. 638 316 * Returns 0 if ok, error value on error. 317 + * The probe address must at least be aligned on the architecture pointer size. 
639 318 */ 640 319 int marker_probe_register(const char *name, const char *format, 641 - marker_probe_func *probe, void *private) 320 + marker_probe_func *probe, void *probe_private) 642 321 { 643 322 struct marker_entry *entry; 644 323 int ret = 0; 324 + struct marker_probe_closure *old; 645 325 646 326 mutex_lock(&markers_mutex); 647 327 entry = get_marker(name); 648 - if (entry && entry->refcount) { 649 - ret = -EBUSY; 328 + if (!entry) { 329 + entry = add_marker(name, format); 330 + if (IS_ERR(entry)) { 331 + ret = PTR_ERR(entry); 332 + goto end; 333 + } 334 + } 335 + /* 336 + * If we detect that a call_rcu is pending for this marker, 337 + * make sure it's executed now. 338 + */ 339 + if (entry->rcu_pending) 340 + rcu_barrier(); 341 + old = marker_entry_add_probe(entry, probe, probe_private); 342 + if (IS_ERR(old)) { 343 + ret = PTR_ERR(old); 650 344 goto end; 651 345 } 652 - if (deferred_sync) { 653 - synchronize_sched(); 654 - deferred_sync = 0; 655 - } 656 - ret = add_marker(name, format, probe, private); 657 - if (ret) 658 - goto end; 659 346 mutex_unlock(&markers_mutex); 660 - marker_update_probes(NULL); 661 - return ret; 347 + marker_update_probes(); /* may update entry */ 348 + mutex_lock(&markers_mutex); 349 + entry = get_marker(name); 350 + WARN_ON(!entry); 351 + entry->oldptr = old; 352 + entry->rcu_pending = 1; 353 + /* write rcu_pending before calling the RCU callback */ 354 + smp_wmb(); 355 + call_rcu(&entry->rcu, free_old_closure); 662 356 end: 663 357 mutex_unlock(&markers_mutex); 664 358 return ret; ··· 684 346 /** 685 347 * marker_probe_unregister - Disconnect a probe from a marker 686 348 * @name: marker name 349 + * @probe: probe function pointer 350 + * @probe_private: probe private data 687 351 * 688 352 * Returns the private data given to marker_probe_register, or an ERR_PTR(). 
353 + * We do not need to call a synchronize_sched to make sure the probes have 354 + * finished running before doing a module unload, because the module unload 355 + * itself uses stop_machine(), which insures that every preempt disabled section 356 + * have finished. 689 357 */ 690 - void *marker_probe_unregister(const char *name) 358 + int marker_probe_unregister(const char *name, 359 + marker_probe_func *probe, void *probe_private) 691 360 { 692 - struct module *probe_module; 693 361 struct marker_entry *entry; 694 - void *private; 362 + struct marker_probe_closure *old; 363 + int ret = 0; 695 364 696 365 mutex_lock(&markers_mutex); 697 366 entry = get_marker(name); 698 367 if (!entry) { 699 - private = ERR_PTR(-ENOENT); 368 + ret = -ENOENT; 700 369 goto end; 701 370 } 702 - entry->refcount = 0; 703 - /* In what module is the probe handler ? */ 704 - probe_module = __module_text_address((unsigned long)entry->probe); 705 - private = remove_marker(name); 706 - deferred_sync = 1; 371 + if (entry->rcu_pending) 372 + rcu_barrier(); 373 + old = marker_entry_remove_probe(entry, probe, probe_private); 707 374 mutex_unlock(&markers_mutex); 708 - marker_update_probes(probe_module); 709 - return private; 375 + marker_update_probes(); /* may update entry */ 376 + mutex_lock(&markers_mutex); 377 + entry = get_marker(name); 378 + entry->oldptr = old; 379 + entry->rcu_pending = 1; 380 + /* write rcu_pending before calling the RCU callback */ 381 + smp_wmb(); 382 + call_rcu(&entry->rcu, free_old_closure); 383 + remove_marker(name); /* Ignore busy error message */ 710 384 end: 711 385 mutex_unlock(&markers_mutex); 712 - return private; 386 + return ret; 713 387 } 714 388 EXPORT_SYMBOL_GPL(marker_probe_unregister); 715 389 716 - /** 717 - * marker_probe_unregister_private_data - Disconnect a probe from a marker 718 - * @private: probe private data 719 - * 720 - * Unregister a marker by providing the registered private data. 
721 - * Returns the private data given to marker_probe_register, or an ERR_PTR(). 722 - */ 723 - void *marker_probe_unregister_private_data(void *private) 390 + static struct marker_entry * 391 + get_marker_from_private_data(marker_probe_func *probe, void *probe_private) 724 392 { 725 - struct module *probe_module; 393 + struct marker_entry *entry; 394 + unsigned int i; 726 395 struct hlist_head *head; 727 396 struct hlist_node *node; 728 - struct marker_entry *entry; 729 - int found = 0; 730 - unsigned int i; 731 397 732 - mutex_lock(&markers_mutex); 733 398 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 734 399 head = &marker_table[i]; 735 400 hlist_for_each_entry(entry, node, head, hlist) { 736 - if (entry->private == private) { 737 - found = 1; 738 - goto iter_end; 401 + if (!entry->ptype) { 402 + if (entry->single.func == probe 403 + && entry->single.probe_private 404 + == probe_private) 405 + return entry; 406 + } else { 407 + struct marker_probe_closure *closure; 408 + closure = entry->multi; 409 + for (i = 0; closure[i].func; i++) { 410 + if (closure[i].func == probe && 411 + closure[i].probe_private 412 + == probe_private) 413 + return entry; 414 + } 739 415 } 740 416 } 741 417 } 742 - iter_end: 743 - if (!found) { 744 - private = ERR_PTR(-ENOENT); 418 + return NULL; 419 + } 420 + 421 + /** 422 + * marker_probe_unregister_private_data - Disconnect a probe from a marker 423 + * @probe: probe function 424 + * @probe_private: probe private data 425 + * 426 + * Unregister a probe by providing the registered private data. 427 + * Only removes the first marker found in hash table. 428 + * Return 0 on success or error value. 429 + * We do not need to call a synchronize_sched to make sure the probes have 430 + * finished running before doing a module unload, because the module unload 431 + * itself uses stop_machine(), which ensures that every preempt disabled section 432 + * has finished. 
433 + */ 434 + int marker_probe_unregister_private_data(marker_probe_func *probe, 435 + void *probe_private) 436 + { 437 + struct marker_entry *entry; 438 + int ret = 0; 439 + struct marker_probe_closure *old; 440 + 441 + mutex_lock(&markers_mutex); 442 + entry = get_marker_from_private_data(probe, probe_private); 443 + if (!entry) { 444 + ret = -ENOENT; 745 445 goto end; 746 446 } 747 - entry->refcount = 0; 748 - /* In what module is the probe handler ? */ 749 - probe_module = __module_text_address((unsigned long)entry->probe); 750 - private = remove_marker(entry->name); 751 - deferred_sync = 1; 447 + if (entry->rcu_pending) 448 + rcu_barrier(); 449 + old = marker_entry_remove_probe(entry, NULL, probe_private); 752 450 mutex_unlock(&markers_mutex); 753 - marker_update_probes(probe_module); 754 - return private; 451 + marker_update_probes(); /* may update entry */ 452 + mutex_lock(&markers_mutex); 453 + entry = get_marker_from_private_data(probe, probe_private); 454 + WARN_ON(!entry); 455 + entry->oldptr = old; 456 + entry->rcu_pending = 1; 457 + /* write rcu_pending before calling the RCU callback */ 458 + smp_wmb(); 459 + call_rcu(&entry->rcu, free_old_closure); 460 + remove_marker(entry->name); /* Ignore busy error message */ 755 461 end: 756 462 mutex_unlock(&markers_mutex); 757 - return private; 463 + return ret; 758 464 } 759 465 EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); 760 466 761 467 /** 762 - * marker_arm - Arm a marker 763 - * @name: marker name 764 - * 765 - * Activate a marker. It keeps a reference count of the number of 766 - * arming/disarming done. 767 - * Returns 0 if ok, error value on error. 768 - */ 769 - int marker_arm(const char *name) 770 - { 771 - struct marker_entry *entry; 772 - int ret = 0; 773 - 774 - mutex_lock(&markers_mutex); 775 - entry = get_marker(name); 776 - if (!entry) { 777 - ret = -ENOENT; 778 - goto end; 779 - } 780 - /* 781 - * Only need to update probes when refcount passes from 0 to 1. 
782 - */ 783 - if (entry->refcount++) 784 - goto end; 785 - end: 786 - mutex_unlock(&markers_mutex); 787 - marker_update_probes(NULL); 788 - return ret; 789 - } 790 - EXPORT_SYMBOL_GPL(marker_arm); 791 - 792 - /** 793 - * marker_disarm - Disarm a marker 794 - * @name: marker name 795 - * 796 - * Disarm a marker. It keeps a reference count of the number of arming/disarming 797 - * done. 798 - * Returns 0 if ok, error value on error. 799 - */ 800 - int marker_disarm(const char *name) 801 - { 802 - struct marker_entry *entry; 803 - int ret = 0; 804 - 805 - mutex_lock(&markers_mutex); 806 - entry = get_marker(name); 807 - if (!entry) { 808 - ret = -ENOENT; 809 - goto end; 810 - } 811 - /* 812 - * Only permit decrement refcount if higher than 0. 813 - * Do probe update only on 1 -> 0 transition. 814 - */ 815 - if (entry->refcount) { 816 - if (--entry->refcount) 817 - goto end; 818 - } else { 819 - ret = -EPERM; 820 - goto end; 821 - } 822 - end: 823 - mutex_unlock(&markers_mutex); 824 - marker_update_probes(NULL); 825 - return ret; 826 - } 827 - EXPORT_SYMBOL_GPL(marker_disarm); 828 - 829 - /** 830 468 * marker_get_private_data - Get a marker's probe private data 831 469 * @name: marker name 470 + * @probe: probe to match 471 + * @num: get the nth matching probe's private data 832 472 * 473 + * Returns the nth private data pointer (starting from 0) matching, or an 474 + * ERR_PTR. 833 475 * Returns the private data pointer, or an ERR_PTR. 834 476 * The private data pointer should _only_ be dereferenced if the caller is the 835 477 * owner of the data, or its content could vanish. This is mostly used to 836 478 * confirm that a caller is the owner of a registered probe. 
837 479 */ 838 - void *marker_get_private_data(const char *name) 480 + void *marker_get_private_data(const char *name, marker_probe_func *probe, 481 + int num) 839 482 { 840 483 struct hlist_head *head; 841 484 struct hlist_node *node; 842 485 struct marker_entry *e; 843 486 size_t name_len = strlen(name) + 1; 844 487 u32 hash = jhash(name, name_len-1, 0); 845 - int found = 0; 488 + int i; 846 489 847 490 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; 848 491 hlist_for_each_entry(e, node, head, hlist) { 849 492 if (!strcmp(name, e->name)) { 850 - found = 1; 851 - return e->private; 493 + if (!e->ptype) { 494 + if (num == 0 && e->single.func == probe) 495 + return e->single.probe_private; 496 + else 497 + break; 498 + } else { 499 + struct marker_probe_closure *closure; 500 + int match = 0; 501 + closure = e->multi; 502 + for (i = 0; closure[i].func; i++) { 503 + if (closure[i].func != probe) 504 + continue; 505 + if (match++ == num) 506 + return closure[i].probe_private; 507 + } 508 + } 852 509 } 853 510 } 854 511 return ERR_PTR(-ENOENT);
+3 -4
kernel/module.c
··· 2038 2038 #ifdef CONFIG_MARKERS 2039 2039 if (!mod->taints) 2040 2040 marker_update_probe_range(mod->markers, 2041 - mod->markers + mod->num_markers, NULL, NULL); 2041 + mod->markers + mod->num_markers); 2042 2042 #endif 2043 2043 err = module_finalize(hdr, sechdrs, mod); 2044 2044 if (err < 0) ··· 2564 2564 #endif 2565 2565 2566 2566 #ifdef CONFIG_MARKERS 2567 - void module_update_markers(struct module *probe_module, int *refcount) 2567 + void module_update_markers(void) 2568 2568 { 2569 2569 struct module *mod; 2570 2570 ··· 2572 2572 list_for_each_entry(mod, &modules, list) 2573 2573 if (!mod->taints) 2574 2574 marker_update_probe_range(mod->markers, 2575 - mod->markers + mod->num_markers, 2576 - probe_module, refcount); 2575 + mod->markers + mod->num_markers); 2577 2576 mutex_unlock(&module_mutex); 2578 2577 } 2579 2578 #endif
+9 -16
samples/markers/probe-example.c
··· 20 20 marker_probe_func *probe_func; 21 21 }; 22 22 23 - void probe_subsystem_event(const struct marker *mdata, void *private, 24 - const char *format, ...) 23 + void probe_subsystem_event(void *probe_data, void *call_data, 24 + const char *format, va_list *args) 25 25 { 26 - va_list ap; 27 26 /* Declare args */ 28 27 unsigned int value; 29 28 const char *mystr; 30 29 31 30 /* Assign args */ 32 - va_start(ap, format); 33 - value = va_arg(ap, typeof(value)); 34 - mystr = va_arg(ap, typeof(mystr)); 31 + value = va_arg(*args, typeof(value)); 32 + mystr = va_arg(*args, typeof(mystr)); 35 33 36 34 /* Call printk */ 37 - printk(KERN_DEBUG "Value %u, string %s\n", value, mystr); 35 + printk(KERN_INFO "Value %u, string %s\n", value, mystr); 38 36 39 37 /* or count, check rights, serialize data in a buffer */ 40 - 41 - va_end(ap); 42 38 } 43 39 44 40 atomic_t eventb_count = ATOMIC_INIT(0); 45 41 46 - void probe_subsystem_eventb(const struct marker *mdata, void *private, 47 - const char *format, ...) 42 + void probe_subsystem_eventb(void *probe_data, void *call_data, 43 + const char *format, va_list *args) 48 44 { 49 45 /* Increment counter */ 50 46 atomic_inc(&eventb_count); ··· 68 72 if (result) 69 73 printk(KERN_INFO "Unable to register probe %s\n", 70 74 probe_array[i].name); 71 - result = marker_arm(probe_array[i].name); 72 - if (result) 73 - printk(KERN_INFO "Unable to arm probe %s\n", 74 - probe_array[i].name); 75 75 } 76 76 return 0; 77 77 } ··· 77 85 int i; 78 86 79 87 for (i = 0; i < ARRAY_SIZE(probe_array); i++) 80 - marker_probe_unregister(probe_array[i].name); 88 + marker_probe_unregister(probe_array[i].name, 89 + probe_array[i].probe_func, &probe_array[i]); 81 90 printk(KERN_INFO "Number of event b : %u\n", 82 91 atomic_read(&eventb_count)); 83 92 }