/* jcs's openbsd hax — local copy of the OpenBSD dt(4) driver (dt_dev.c). */
1/* $OpenBSD: dt_dev.c,v 1.47 2025/12/10 09:38:41 mpi Exp $ */
2
3/*
4 * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20#include <sys/systm.h>
21#include <sys/param.h>
22#include <sys/clockintr.h>
23#include <sys/device.h>
24#include <sys/exec_elf.h>
25#include <sys/malloc.h>
26#include <sys/proc.h>
27#include <sys/ptrace.h>
28#include <sys/vnode.h>
29#include <uvm/uvm.h>
30#include <uvm/uvm_map.h>
31#include <uvm/uvm_vnode.h>
32#include <sys/file.h>
33#include <sys/filedesc.h>
34#include <sys/fcntl.h>
35
36#include <machine/intr.h>
37
38#include <dev/dt/dtvar.h>
39
40/*
41 * Number of frames to skip in stack traces.
42 *
43 * The number of frames required to execute dt(4) profiling code
44 * depends on the probe, context, architecture and possibly the
45 * compiler.
46 *
47 * Static probes (tracepoints) are executed in the context of the
48 * current thread and only need to skip frames up to the recording
49 * function. For example the syscall provider:
50 *
51 * dt_prov_syscall_entry+0x141
52 * syscall+0x205 <--- start here
53 * Xsyscall+0x128
54 *
55 * Probes executed in their own context, like the profile provider,
56 * need to skip the frames of that context which are different for
57 * every architecture. For example the profile provider executed
58 * from hardclock(9) on amd64:
59 *
60 * dt_prov_profile_enter+0x6e
61 * hardclock+0x1a9
62 * lapic_clockintr+0x3f
63 * Xresume_lapic_ltimer+0x26
64 * acpicpu_idle+0x1d2 <---- start here.
65 * sched_idle+0x225
66 * proc_trampoline+0x1c
67 */
68#if defined(__amd64__)
69#define DT_FA_PROFILE 5
70#define DT_FA_STATIC 2
71#elif defined(__i386__)
72#define DT_FA_PROFILE 5
73#define DT_FA_STATIC 2
74#elif defined(__macppc__)
75#define DT_FA_PROFILE 5
76#define DT_FA_STATIC 2
77#elif defined(__octeon__)
78#define DT_FA_PROFILE 6
79#define DT_FA_STATIC 2
80#elif defined(__powerpc64__)
81#define DT_FA_PROFILE 6
82#define DT_FA_STATIC 2
83#elif defined(__sparc64__)
84#define DT_FA_PROFILE 7
85#define DT_FA_STATIC 1
86#else
87#define DT_FA_STATIC 0
88#define DT_FA_PROFILE 0
89#endif
90
91#define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
92
93#define DPRINTF(x...) /* nothing */
94
95/*
96 * Locks used to protect struct members and variables in this file:
97 * a atomic
98 * I invariant after initialization
99 * K kernel lock
100 * D dtrace rw-lock dt_lock
101 * r owned by thread doing read(2)
102 * c owned by CPU
103 * s sliced ownership, based on read/write indexes
104 * p written by CPU, read by thread doing read(2)
105 */
106
107/*
108 * Per-CPU Event States
109 */
110struct dt_cpubuf {
111 unsigned int dc_prod; /* [r] read index */
112 unsigned int dc_cons; /* [c] write index */
113 struct dt_evt *dc_ring; /* [s] ring of event states */
114 unsigned int dc_inevt; /* [c] in event already? */
115
116 /* Counters */
117 unsigned int dc_dropevt; /* [p] # of events dropped */
118 unsigned int dc_skiptick; /* [p] # of ticks skipped */
119 unsigned int dc_recurevt; /* [p] # of recursive events */
120 unsigned int dc_readevt; /* [r] # of events read */
121};
122
/*
 * Descriptor associated with each program opening /dev/dt. It is used
 * to keep track of enabled PCBs.
 *
 * Allocated in dtopen()/dtalloc(), freed in dtclose()/dtfree().  One
 * descriptor exists per D_CLONE minor; dtlookup() maps a unit back to
 * its descriptor under the kernel lock.
 */
struct dt_softc {
	SLIST_ENTRY(dt_softc)	 ds_next;	/* [K] descriptor list */
	int			 ds_unit;	/* [I] D_CLONE unique unit */
	pid_t			 ds_pid;	/* [I] PID of tracing program */
	void			*ds_si;		/* [I] to defer wakeup(9) */

	struct dt_pcb_list	 ds_pcbs;	/* [K] list of enabled PCBs */
	int			 ds_recording;	/* [D] currently recording? */
	unsigned int		 ds_evtcnt;	/* [a] # of readable evts */

	struct dt_cpubuf	 ds_cpu[MAXCPUS]; /* [I] Per-cpu event states */
	unsigned int		 ds_lastcpu;	/* [r] last CPU ring read(2). */
};
140
SLIST_HEAD(, dt_softc) dtdev_list;	/* [K] list of open /dev/dt nodes */

/*
 * Probes are created during dt_attach() and never modified/freed during
 * the lifetime of the system. That's why we consider them as [I]mmutable.
 */
unsigned int			dt_nprobes;	/* [I] # of probes available */
SIMPLEQ_HEAD(, dt_probe)	dt_probe_list;	/* [I] list of probes */

/* Serializes recording state changes against probe execution. */
struct rwlock			dt_lock = RWLOCK_INITIALIZER("dtlk");
volatile uint32_t		dt_tracing = 0;	/* [D] # of processes tracing */

/* Gate for dtopen(); presumably toggled by a sysctl — verify against kern code. */
int allowdt;				/* [a] */
154
155void dtattach(struct device *, struct device *, void *);
156int dtopen(dev_t, int, int, struct proc *);
157int dtclose(dev_t, int, int, struct proc *);
158int dtread(dev_t, struct uio *, int);
159int dtioctl(dev_t, u_long, caddr_t, int, struct proc *);
160
161struct dt_softc *dtlookup(int);
162struct dt_softc *dtalloc(void);
163void dtfree(struct dt_softc *);
164
165int dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *);
166int dt_ioctl_get_args(struct dt_softc *, struct dtioc_arg *);
167int dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *);
168int dt_ioctl_record_start(struct dt_softc *);
169void dt_ioctl_record_stop(struct dt_softc *);
170int dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *);
171int dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *);
172int dt_ioctl_rd_vnode(struct dt_softc *, struct dtioc_rdvn *);
173
174int dt_ring_copy(struct dt_cpubuf *, struct uio *, size_t, size_t *);
175
176void dt_wakeup(struct dt_softc *);
177void dt_deferred_wakeup(void *);
178
/*
 * Autoconf attach: initialize the global lists and register every
 * provider's probes.  Providers hand out probe numbers (dtp_pbn) in
 * registration order via dt_dev_register_probe(), so the order of the
 * *_init() calls below is significant and must not be changed.
 */
void
dtattach(struct device *parent, struct device *self, void *aux)
{
	SLIST_INIT(&dtdev_list);
	SIMPLEQ_INIT(&dt_probe_list);

	/* Init providers */
	dt_nprobes += dt_prov_profile_init();
	dt_nprobes += dt_prov_syscall_init();
	dt_nprobes += dt_prov_static_init();
#ifdef DDBPROF
	dt_nprobes += dt_prov_kprobe_init();
#endif
}
193
194int
195dtopen(dev_t dev, int flags, int mode, struct proc *p)
196{
197 struct dt_softc *sc;
198 int unit = minor(dev);
199
200 if (atomic_load_int(&allowdt) == 0)
201 return EPERM;
202
203 sc = dtalloc();
204 if (sc == NULL)
205 return ENOMEM;
206
207 /* no sleep after this point */
208 if (dtlookup(unit) != NULL) {
209 dtfree(sc);
210 return EBUSY;
211 }
212
213 sc->ds_unit = unit;
214 sc->ds_pid = p->p_p->ps_pid;
215 TAILQ_INIT(&sc->ds_pcbs);
216 sc->ds_lastcpu = 0;
217 sc->ds_evtcnt = 0;
218
219 SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next);
220
221 DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid);
222
223 return 0;
224}
225
/*
 * Close a /dev/dt node: tear down everything dtopen() and subsequent
 * ioctls built up.
 *
 * Teardown order matters: unlink the descriptor first so no new
 * lookups find it, stop recording (which waits for SMR readers via
 * smr_barrier()), then release the PCBs and the descriptor itself.
 */
int
dtclose(dev_t dev, int flags, int mode, struct proc *p)
{
	struct dt_softc *sc;
	int unit = minor(dev);

	sc = dtlookup(unit);
	KASSERT(sc != NULL);

	DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid);

	SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next);
	dt_ioctl_record_stop(sc);
	dt_pcb_purge(&sc->ds_pcbs);
	dtfree(sc);

	return 0;
}
244
/*
 * read(2) on /dev/dt: block until at least one event is available,
 * then copy whole struct dt_evt records out of the per-CPU rings into
 * the caller's buffer.
 */
int
dtread(dev_t dev, struct uio *uio, int flags)
{
	struct dt_softc *sc;
	struct dt_cpubuf *dc;
	int i, error = 0, unit = minor(dev);
	size_t count, max, read = 0;

	sc = dtlookup(unit);
	KASSERT(sc != NULL);

	/* The buffer must hold at least one full event. */
	max = howmany(uio->uio_resid, sizeof(struct dt_evt));
	if (max < 1)
		return (EMSGSIZE);

	/* Sleep until a probe publishes an event or a signal arrives. */
	while (!atomic_load_int(&sc->ds_evtcnt)) {
		sleep_setup(sc, PWAIT | PCATCH, "dtread");
		error = sleep_finish(INFSLP, !atomic_load_int(&sc->ds_evtcnt));
		if (error == EINTR || error == ERESTART)
			break;
	}
	if (error)
		return error;

	/*
	 * Start after the CPU serviced last time so a busy CPU cannot
	 * permanently starve the others.
	 */
	KERNEL_ASSERT_LOCKED();
	for (i = 0; i < ncpusfound; i++) {
		count = 0;
		dc = &sc->ds_cpu[(sc->ds_lastcpu + i) % ncpusfound];
		error = dt_ring_copy(dc, uio, max, &count);
		if (error && count == 0)
			break;

		read += count;
		max -= count;
		if (max == 0)
			break;
	}
	/*
	 * NOTE(review): when the loop runs to completion i == ncpusfound,
	 * so `i % ncpusfound' is 0 and ds_lastcpu does not advance —
	 * confirm this is the intended round-robin behavior.
	 */
	sc->ds_lastcpu += i % ncpusfound;

	atomic_sub_int(&sc->ds_evtcnt, read);

	return error;
}
288
/*
 * ioctl(2) dispatch for /dev/dt.
 *
 * Two-phase switch: the first one handles unprivileged queries and
 * filters out unknown commands; anything that falls through is a
 * root-only command that must pass the suser() check before being
 * dispatched by the second switch.
 */
int
dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct dt_softc *sc;
	int unit = minor(dev);
	int on, error = 0;

	sc = dtlookup(unit);
	KASSERT(sc != NULL);

	switch (cmd) {
	case DTIOCGPLIST:
		return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr);
	case DTIOCGARGS:
		return dt_ioctl_get_args(sc, (struct dtioc_arg *)addr);
	case DTIOCGSTATS:
		return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr);
	case DTIOCRECORD:
	case DTIOCPRBENABLE:
	case DTIOCPRBDISABLE:
	case DTIOCRDVNODE:
		/* root only ioctl(2) */
		break;
	default:
		return ENOTTY;
	}

	if ((error = suser(p)) != 0)
		return error;

	switch (cmd) {
	case DTIOCRECORD:
		on = *(int *)addr;
		if (on)
			error = dt_ioctl_record_start(sc);
		else
			dt_ioctl_record_stop(sc);
		break;
	case DTIOCPRBENABLE:
		error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr);
		break;
	case DTIOCPRBDISABLE:
		error = dt_ioctl_probe_disable(sc, (struct dtioc_req *)addr);
		break;
	case DTIOCRDVNODE:
		error = dt_ioctl_rd_vnode(sc, (struct dtioc_rdvn *)addr);
		break;
	default:
		/* Unreachable: the first switch rejected unknown commands. */
		KASSERT(0);
	}

	return error;
}
342
343struct dt_softc *
344dtlookup(int unit)
345{
346 struct dt_softc *sc;
347
348 KERNEL_ASSERT_LOCKED();
349
350 SLIST_FOREACH(sc, &dtdev_list, ds_next) {
351 if (sc->ds_unit == unit)
352 break;
353 }
354
355 return sc;
356}
357
358struct dt_softc *
359dtalloc(void)
360{
361 struct dt_softc *sc;
362 struct dt_evt *dtev;
363 int i;
364
365 sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
366 if (sc == NULL)
367 return NULL;
368
369 for (i = 0; i < ncpusfound; i++) {
370 dtev = mallocarray(DT_EVTRING_SIZE, sizeof(*dtev), M_DEVBUF,
371 M_WAITOK|M_CANFAIL|M_ZERO);
372 if (dtev == NULL)
373 break;
374 sc->ds_cpu[i].dc_ring = dtev;
375 }
376 if (i < ncpusfound) {
377 dtfree(sc);
378 return NULL;
379 }
380
381 sc->ds_si = softintr_establish(IPL_SOFTCLOCK | IPL_MPSAFE,
382 dt_deferred_wakeup, sc);
383 if (sc->ds_si == NULL) {
384 dtfree(sc);
385 return NULL;
386 }
387
388 return sc;
389}
390
391void
392dtfree(struct dt_softc *sc)
393{
394 struct dt_evt *dtev;
395 int i;
396
397 if (sc->ds_si != NULL)
398 softintr_disestablish(sc->ds_si);
399
400 for (i = 0; i < ncpusfound; i++) {
401 dtev = sc->ds_cpu[i].dc_ring;
402 free(dtev, M_DEVBUF, DT_EVTRING_SIZE * sizeof(*dtev));
403 }
404 free(sc, M_DEVBUF, sizeof(*sc));
405}
406
/*
 * DTIOCGPLIST: copy out information about every registered probe.
 *
 * Two-call protocol: the caller first passes dtpr_size == 0 to learn
 * the required buffer size (always written back), then calls again
 * with a buffer of at least that size.  Returns ENOSPC when the
 * supplied buffer is too small, or a copyout(9) error.
 */
int
dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr)
{
	struct dtioc_probe_info info, *dtpi;
	struct dt_probe *dtp;
	size_t size;
	int error = 0;

	size = dtpr->dtpr_size;
	dtpr->dtpr_size = dt_nprobes * sizeof(*dtpi);
	if (size == 0)
		return 0;

	dtpi = dtpr->dtpr_probes;
	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
		if (size < sizeof(*dtpi)) {
			error = ENOSPC;
			break;
		}
		memset(&info, 0, sizeof(info));
		info.dtpi_pbn = dtp->dtp_pbn;
		info.dtpi_nargs = dtp->dtp_nargs;
		strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name,
		    sizeof(info.dtpi_prov));
		strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func));
		strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name));
		error = copyout(&info, dtpi, sizeof(*dtpi));
		if (error)
			break;
		size -= sizeof(*dtpi);
		dtpi++;
	}

	return error;
}
442
443int
444dt_ioctl_get_args(struct dt_softc *sc, struct dtioc_arg *dtar)
445{
446 struct dtioc_arg_info info, *dtai;
447 struct dt_probe *dtp;
448 size_t size, n, t;
449 uint32_t pbn;
450 int error = 0;
451
452 pbn = dtar->dtar_pbn;
453 if (pbn == 0 || pbn > dt_nprobes)
454 return EINVAL;
455
456 SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
457 if (pbn == dtp->dtp_pbn)
458 break;
459 }
460 if (dtp == NULL)
461 return EINVAL;
462
463 if (dtp->dtp_sysnum != 0) {
464 /* currently not supported for system calls */
465 dtar->dtar_size = 0;
466 return 0;
467 }
468
469 size = dtar->dtar_size;
470 dtar->dtar_size = dtp->dtp_nargs * sizeof(*dtar);
471 if (size == 0)
472 return 0;
473
474 t = 0;
475 dtai = dtar->dtar_args;
476 for (n = 0; n < dtp->dtp_nargs; n++) {
477 if (size < sizeof(*dtai)) {
478 error = ENOSPC;
479 break;
480 }
481 if (n >= DTMAXARGTYPES || dtp->dtp_argtype[n] == NULL)
482 continue;
483 memset(&info, 0, sizeof(info));
484 info.dtai_pbn = dtp->dtp_pbn;
485 info.dtai_argn = t++;
486 strlcpy(info.dtai_argtype, dtp->dtp_argtype[n],
487 sizeof(info.dtai_argtype));
488 error = copyout(&info, dtai, sizeof(*dtai));
489 if (error)
490 break;
491 size -= sizeof(*dtai);
492 dtai++;
493 }
494 dtar->dtar_size = t * sizeof(*dtar);
495
496 return error;
497}
498
499int
500dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst)
501{
502 struct dt_cpubuf *dc;
503 uint64_t readevt, dropevt, skiptick, recurevt;
504 int i;
505
506 readevt = dropevt = skiptick = 0;
507 for (i = 0; i < ncpusfound; i++) {
508 dc = &sc->ds_cpu[i];
509
510 membar_consumer();
511 dropevt += dc->dc_dropevt;
512 skiptick = dc->dc_skiptick;
513 recurevt = dc->dc_recurevt;
514 readevt += dc->dc_readevt;
515 }
516
517 dtst->dtst_readevt = readevt;
518 dtst->dtst_dropevt = dropevt;
519 dtst->dtst_skiptick = skiptick;
520 dtst->dtst_recurevt = recurevt;
521 return 0;
522}
523
/*
 * DTIOCRECORD(1): start recording on every enabled PCB.
 *
 * Under the dt_lock writer lock: publish each PCB on its probe's SMR
 * list, bump the per-probe and per-provider recording counts, and
 * arm a clock interrupt for interval-based PCBs (profile provider).
 * Returns EBUSY if already recording, ENOENT with no enabled PCB.
 */
int
dt_ioctl_record_start(struct dt_softc *sc)
{
	uint64_t now;
	struct dt_pcb *dp;
	int error = 0;

	rw_enter_write(&dt_lock);
	if (sc->ds_recording) {
		error = EBUSY;
		goto out;
	}

	/* ds_pcbs is protected by the kernel lock. */
	KERNEL_ASSERT_LOCKED();
	if (TAILQ_EMPTY(&sc->ds_pcbs)) {
		error = ENOENT;
		goto out;
	}

	/* Single timestamp so all interval probes start in phase. */
	now = nsecuptime();
	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
		struct dt_probe *dtp = dp->dp_dtp;

		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
		dtp->dtp_recording++;
		dtp->dtp_prov->dtpv_recording++;

		/* dp_nsecs != 0 marks an interval (profile-style) PCB. */
		if (dp->dp_nsecs != 0) {
			clockintr_bind(&dp->dp_clockintr, dp->dp_cpu, dt_clock,
			    dp);
			clockintr_schedule(&dp->dp_clockintr,
			    now + dp->dp_nsecs);
		}
	}
	sc->ds_recording = 1;
	dt_tracing++;

 out:
	rw_exit_write(&dt_lock);
	return error;
}
565
/*
 * DTIOCRECORD(0): stop recording; inverse of dt_ioctl_record_start().
 *
 * Idempotent: returns immediately when not recording.  After dropping
 * dt_lock, smr_barrier() guarantees no probe still holds a reference
 * to any of the PCBs removed from the SMR lists, so callers (e.g.
 * dtclose()) may free them afterwards.
 */
void
dt_ioctl_record_stop(struct dt_softc *sc)
{
	struct dt_pcb *dp;

	rw_enter_write(&dt_lock);
	if (!sc->ds_recording) {
		rw_exit_write(&dt_lock);
		return;
	}

	DPRINTF("dt%d: pid %d disable\n", sc->ds_unit, sc->ds_pid);

	dt_tracing--;
	sc->ds_recording = 0;
	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
		struct dt_probe *dtp = dp->dp_dtp;

		/*
		 * Set an execution barrier to ensure the shared
		 * reference to dp is inactive.
		 */
		if (dp->dp_nsecs != 0)
			clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);

		dtp->dtp_recording--;
		dtp->dtp_prov->dtpv_recording--;
		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
	}
	rw_exit_write(&dt_lock);

	/* Wait until readers cannot access the PCBs. */
	smr_barrier();
}
600
601int
602dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq)
603{
604 struct dt_pcb_list plist;
605 struct dt_probe *dtp;
606 struct dt_pcb *dp;
607 int error;
608
609 SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
610 if (dtp->dtp_pbn == dtrq->dtrq_pbn)
611 break;
612 }
613 if (dtp == NULL)
614 return ENOENT;
615
616 /* Only allow one probe of each type. */
617 TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
618 if (dp->dp_dtp->dtp_pbn == dtrq->dtrq_pbn)
619 return EEXIST;
620 }
621
622 TAILQ_INIT(&plist);
623 error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq);
624 if (error)
625 return error;
626
627 DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid,
628 dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS);
629
630 /* Append all PCBs to this instance */
631 TAILQ_CONCAT(&sc->ds_pcbs, &plist, dp_snext);
632
633 return 0;
634}
635
636int
637dt_ioctl_probe_disable(struct dt_softc *sc, struct dtioc_req *dtrq)
638{
639 struct dt_probe *dtp;
640 int error;
641
642 SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
643 if (dtp->dtp_pbn == dtrq->dtrq_pbn)
644 break;
645 }
646 if (dtp == NULL)
647 return ENOENT;
648
649 if (dtp->dtp_prov->dtpv_dealloc) {
650 error = dtp->dtp_prov->dtpv_dealloc(dtp, sc, dtrq);
651 if (error)
652 return error;
653 }
654
655 DPRINTF("dt%d: pid %d dealloc\n", sc->ds_unit, sc->ds_pid,
656 dtrq->dtrq_pbn);
657
658 return 0;
659}
660
/*
 * DTIOCRDVNODE: given a pid and a virtual address, find the vnode
 * backing the executable mapping at that address in the target
 * process and hand the caller a read-only file descriptor on it
 * (presumably so userland can read symbols from the mapped object —
 * verify against the dt(4) consumer).
 *
 * On success fills dtrv_fd/dtrv_len/dtrv_start/dtrv_offset.  Returns
 * ESRCH for an unknown pid, ENOENT when the address has no suitable
 * vnode-backed executable mapping, or an falloc/VOP_OPEN error.
 */
int
dt_ioctl_rd_vnode(struct dt_softc *sc, struct dtioc_rdvn *dtrv)
{
	struct process *ps;
	struct proc *p = curproc;
	boolean_t ok;
	struct vm_map_entry *e;
	int err = 0;
	int fd;
	struct uvm_vnode *uvn;
	struct vnode *vn;
	struct file *fp;

	if ((ps = prfind(dtrv->dtrv_pid)) == NULL)
		return ESRCH;

	vm_map_lock_read(&ps->ps_vmspace->vm_map);

	/* The mapping must be an executable, vnode-backed object. */
	ok = uvm_map_lookup_entry(&ps->ps_vmspace->vm_map,
	    (vaddr_t)dtrv->dtrv_va, &e);
	if (ok == 0 || (e->etype & UVM_ET_OBJ) == 0 ||
	    (e->protection & PROT_EXEC) == 0 ||
	    !UVM_OBJ_IS_VNODE(e->object.uvm_obj)) {
		err = ENOENT;
		vn = NULL;
		DPRINTF("%s no mapping for %p\n", __func__, dtrv->dtrv_va);
	} else {
		uvn = (struct uvm_vnode *)e->object.uvm_obj;
		vn = uvn->u_vnode;
		/* Hold a reference across the unlock below. */
		vref(vn);

		dtrv->dtrv_len = (size_t)uvn->u_size;
		dtrv->dtrv_start = (caddr_t)e->start;
		dtrv->dtrv_offset = (caddr_t)e->offset;
	}

	vm_map_unlock_read(&ps->ps_vmspace->vm_map);

	if (vn != NULL) {
		/* Wrap the vnode in a read-only file descriptor. */
		fdplock(p->p_fd);
		err = falloc(p, &fp, &fd);
		fdpunlock(p->p_fd);
		if (err != 0) {
			vrele(vn);
			DPRINTF("%s fdopen failed (%d)\n", __func__, err);
			return err;
		}
		err = VOP_OPEN(vn, O_RDONLY, p->p_p->ps_ucred, p);
		if (err == 0) {
			/* VOP_OPEN consumed our vref; publish the fd. */
			fp->f_flag = FREAD;
			fp->f_type = DTYPE_VNODE;
			fp->f_ops = &vnops;
			fp->f_data = vn;
			dtrv->dtrv_fd = fd;
			fdplock(p->p_fd);
			fdinsert(p->p_fd, fd, UF_EXCLOSE, fp);
			fdpunlock(p->p_fd);
			FRELE(fp, p);
		} else {
			/* Undo falloc() and drop our vnode reference. */
			DPRINTF("%s vopen() failed (%d)\n", __func__,
			    err);
			vrele(vn);
			fdplock(p->p_fd);
			fdremove(p->p_fd, fd);
			fdpunlock(p->p_fd);
			FRELE(fp, p);
		}
	}

	return err;
}
732
733struct dt_probe *
734dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv)
735{
736 struct dt_probe *dtp;
737
738 dtp = malloc(sizeof(*dtp), M_DT, M_NOWAIT|M_ZERO);
739 if (dtp == NULL)
740 return NULL;
741
742 SMR_SLIST_INIT(&dtp->dtp_pcbs);
743 dtp->dtp_prov = dtpv;
744 dtp->dtp_func = func;
745 dtp->dtp_name = name;
746 dtp->dtp_sysnum = -1;
747 dtp->dtp_ref = 0;
748
749 return dtp;
750}
751
752void
753dt_dev_register_probe(struct dt_probe *dtp)
754{
755 static uint64_t probe_nb;
756
757 dtp->dtp_pbn = ++probe_nb;
758 SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next);
759}
760
761struct dt_pcb *
762dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc)
763{
764 struct dt_pcb *dp;
765
766 dp = malloc(sizeof(*dp), M_DT, M_WAITOK|M_CANFAIL|M_ZERO);
767 if (dp == NULL)
768 return NULL;
769
770 dp->dp_sc = sc;
771 dp->dp_dtp = dtp;
772 return dp;
773}
774
/* Release a PCB previously obtained from dt_pcb_alloc(). */
void
dt_pcb_free(struct dt_pcb *dp)
{
	free(dp, M_DT, sizeof(*dp));
}
780
781void
782dt_pcb_purge(struct dt_pcb_list *plist)
783{
784 struct dt_pcb *dp;
785
786 while ((dp = TAILQ_FIRST(plist)) != NULL) {
787 TAILQ_REMOVE(plist, dp, dp_snext);
788 dt_pcb_free(dp);
789 }
790}
791
/*
 * Account `skip' missed clock ticks on the current CPU's buffer.
 * Runs in probe context; the membar pairs with the
 * membar_consumer() in dt_ioctl_get_stats().
 */
void
dt_pcb_ring_skiptick(struct dt_pcb *dp, unsigned int skip)
{
	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

	dc->dc_skiptick += skip;
	membar_producer();
}
800
/*
 * Get a reference to the next free event state from the ring.
 *
 * Runs in probe context on the current CPU.  Returns NULL (and bumps
 * the corresponding counter) when a probe fires recursively on this
 * CPU or when the ring is full because read(2) has not caught up.
 * On success the slot is pre-filled with the common event fields;
 * the caller completes it and must hand it to dt_pcb_ring_consume().
 */
struct dt_evt *
dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
	struct proc *p = curproc;
	struct dt_evt *dtev;
	int prod, cons, distance;
	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

	/* Recursion guard: a probe firing while recording an event. */
	if (dc->dc_inevt == 1) {
		dc->dc_recurevt++;
		membar_producer();
		return NULL;
	}

	dc->dc_inevt = 1;

	/* Pairs with the membar_producer() in dt_ring_copy(). */
	membar_consumer();
	prod = dc->dc_prod;
	cons = dc->dc_cons;
	/*
	 * One slot is kept unused so the writer never catches up with
	 * the reader; the second comparison covers index wrap-around.
	 */
	distance = prod - cons;
	if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) {
		/* read(2) isn't finished */
		dc->dc_dropevt++;
		membar_producer();

		dc->dc_inevt = 0;
		return NULL;
	}

	/*
	 * Save states in next free event slot.
	 */
	dtev = &dc->dc_ring[cons];
	memset(dtev, 0, sizeof(*dtev));

	dtev->dtev_pbn = dp->dp_dtp->dtp_pbn;
	dtev->dtev_cpu = cpu_number();
	dtev->dtev_pid = p->p_p->ps_pid;
	dtev->dtev_tid = p->p_tid + THREAD_PID_OFFSET;
	nanotime(&dtev->dtev_tsp);

	if (ISSET(dp->dp_evtflags, DTEVT_EXECNAME))
		strlcpy(dtev->dtev_comm, p->p_p->ps_comm, sizeof(dtev->dtev_comm));

	if (ISSET(dp->dp_evtflags, DTEVT_KSTACK)) {
		/* Skip count depends on how the probe was reached. */
		if (profiling)
			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
		else
			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
	}
	if (ISSET(dp->dp_evtflags, DTEVT_USTACK))
		stacktrace_save_utrace(&dtev->dtev_ustack);

	return dtev;
}
859
/*
 * Commit an event slot obtained from dt_pcb_ring_get(): advance the
 * write index, publish it (membar pairs with the membar_consumer()
 * in dt_ring_copy()), clear the recursion guard and wake any reader.
 */
void
dt_pcb_ring_consume(struct dt_pcb *dp, struct dt_evt *dtev)
{
	struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

	KASSERT(dtev == &dc->dc_ring[dc->dc_cons]);

	dc->dc_cons = (dc->dc_cons + 1) % DT_EVTRING_SIZE;
	membar_producer();

	atomic_inc_int(&dp->dp_sc->ds_evtcnt);
	dc->dc_inevt = 0;

	dt_wakeup(dp->dp_sc);
}
875
/*
 * Copy at most `max' events from `dc', producing the same amount
 * of free slots.
 *
 * Reader side of the per-CPU ring (note the inverted naming: read(2)
 * advances dc_prod, probes write at dc_cons).  Up to two uiomove(9)
 * calls handle the ring wrapping at DT_EVTRING_SIZE.  `*rcvd' is set
 * to the number of events copied; it stays 0 when the ring is empty.
 */
int
dt_ring_copy(struct dt_cpubuf *dc, struct uio *uio, size_t max, size_t *rcvd)
{
	size_t count, copied = 0;
	unsigned int cons, prod;
	int error = 0;

	KASSERT(max > 0);

	/* Pairs with the membar_producer() in dt_pcb_ring_consume(). */
	membar_consumer();
	cons = dc->dc_cons;
	prod = dc->dc_prod;

	/* First segment: up to the write index or the end of the ring. */
	if (cons < prod)
		count = DT_EVTRING_SIZE - prod;
	else
		count = cons - prod;

	if (count == 0)
		return 0;

	count = MIN(count, max);
	error = uiomove(&dc->dc_ring[prod], count * sizeof(struct dt_evt), uio);
	if (error)
		return error;
	copied += count;

	/* Produce */
	prod = (prod + count) % DT_EVTRING_SIZE;

	/* If the ring didn't wrap, stop here. */
	if (max == copied || prod != 0 || cons == 0)
		goto out;

	/* Second segment: from slot 0 up to the write index. */
	count = MIN(cons, (max - copied));
	error = uiomove(&dc->dc_ring[0], count * sizeof(struct dt_evt), uio);
	if (error)
		goto out;

	copied += count;
	prod += count;

out:
	dc->dc_readevt += copied;
	dc->dc_prod = prod;
	/* Publish the new read index to the probe side. */
	membar_producer();

	*rcvd = copied;
	return error;
}
930
/* Wake any thread sleeping in dtread(); safe to call from probe context. */
void
dt_wakeup(struct dt_softc *sc)
{
	/*
	 * It is not always safe or possible to call wakeup(9) and grab
	 * the SCHED_LOCK() from a given tracepoint. This is true for
	 * any tracepoint that might trigger inside the scheduler or at
	 * any IPL higher than IPL_SCHED. For this reason use a soft-
	 * interrupt to defer the wakeup.
	 */
	softintr_schedule(sc->ds_si);
}
943
/*
 * Soft-interrupt handler established in dtalloc(): performs the
 * wakeup(9) that dt_wakeup() could not do directly.  `arg' is the
 * descriptor, which is also the dtread() sleep channel.
 */
void
dt_deferred_wakeup(void *arg)
{
	struct dt_softc *sc = arg;

	wakeup(sc);
}