Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tracing/user_events: Use bits vs bytes for enabled status page data

User processes may require many events, and when they do, the cache
performance of a byte-indexed status check is worse than that of a
bit-indexed one. The previous per-page event limit was 4096; the new
limit is 32,768.

This change adds a bitwise index to the user_reg struct. Programs check
whether the bit at index status_bit is set within the status page(s).

Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

authored by

Beau Belgrave and committed by
Steven Rostedt (Google)
39d6d08b d401b724

+135 -38
+3 -12
include/linux/user_events.h
··· 20 20 #define USER_EVENTS_SYSTEM "user_events" 21 21 #define USER_EVENTS_PREFIX "u:" 22 22 23 - /* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ 24 - #define EVENT_BIT_FTRACE 0 25 - #define EVENT_BIT_PERF 1 26 - #define EVENT_BIT_OTHER 7 27 - 28 - #define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) 29 - #define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) 30 - #define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) 31 - 32 23 /* Create dynamic location entry within a 32-bit value */ 33 24 #define DYN_LOC(offset, size) ((size) << 16 | (offset)) 34 25 ··· 36 45 /* Input: Pointer to string with event name, description and flags */ 37 46 __u64 name_args; 38 47 39 - /* Output: Byte index of the event within the status page */ 40 - __u32 status_index; 48 + /* Output: Bitwise index of the event within the status page */ 49 + __u32 status_bit; 41 50 42 51 /* Output: Index of the event to use when writing data */ 43 52 __u32 write_index; 44 - }; 53 + } __attribute__((__packed__)); 45 54 46 55 #define DIAG_IOC_MAGIC '*' 47 56
+67 -8
kernel/trace/trace_events_user.c
··· 40 40 */ 41 41 #define MAX_PAGE_ORDER 0 42 42 #define MAX_PAGES (1 << MAX_PAGE_ORDER) 43 - #define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) 43 + #define MAX_BYTES (MAX_PAGES * PAGE_SIZE) 44 + #define MAX_EVENTS (MAX_BYTES * 8) 44 45 45 46 /* Limit how long of an event name plus args within the subsystem. */ 46 47 #define MAX_EVENT_DESC 512 47 48 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) 48 49 #define MAX_FIELD_ARRAY_SIZE 1024 49 50 51 + /* 52 + * The MAP_STATUS_* macros are used for taking a index and determining the 53 + * appropriate byte and the bit in the byte to set/reset for an event. 54 + * 55 + * The lower 3 bits of the index decide which bit to set. 56 + * The remaining upper bits of the index decide which byte to use for the bit. 57 + * 58 + * This is used when an event has a probe attached/removed to reflect live 59 + * status of the event wanting tracing or not to user-programs via shared 60 + * memory maps. 61 + */ 62 + #define MAP_STATUS_BYTE(index) ((index) >> 3) 63 + #define MAP_STATUS_MASK(index) BIT((index) & 7) 64 + 65 + /* 66 + * Internal bits (kernel side only) to keep track of connected probes: 67 + * These are used when status is requested in text form about an event. These 68 + * bits are compared against an internal byte on the event to determine which 69 + * probes to print out to the user. 70 + * 71 + * These do not reflect the mapped bytes between the user and kernel space. 
72 + */ 73 + #define EVENT_STATUS_FTRACE BIT(0) 74 + #define EVENT_STATUS_PERF BIT(1) 75 + #define EVENT_STATUS_OTHER BIT(7) 76 + 50 77 static char *register_page_data; 51 78 52 79 static DEFINE_MUTEX(reg_mutex); 53 - static DEFINE_HASHTABLE(register_table, 4); 80 + static DEFINE_HASHTABLE(register_table, 8); 54 81 static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); 55 82 56 83 /* ··· 99 72 int index; 100 73 int flags; 101 74 int min_size; 75 + char status; 102 76 }; 103 77 104 78 /* ··· 132 104 static u32 user_event_key(char *name) 133 105 { 134 106 return jhash(name, strlen(name), 0); 107 + } 108 + 109 + static __always_inline 110 + void user_event_register_set(struct user_event *user) 111 + { 112 + int i = user->index; 113 + 114 + register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); 115 + } 116 + 117 + static __always_inline 118 + void user_event_register_clear(struct user_event *user) 119 + { 120 + int i = user->index; 121 + 122 + register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); 135 123 } 136 124 137 125 static __always_inline __must_check ··· 692 648 693 649 dyn_event_remove(&user->devent); 694 650 695 - register_page_data[user->index] = 0; 651 + user_event_register_clear(user); 696 652 clear_bit(user->index, page_bitmap); 697 653 hash_del(&user->node); 698 654 ··· 871 827 rcu_read_unlock_sched(); 872 828 } 873 829 874 - register_page_data[user->index] = status; 830 + if (status) 831 + user_event_register_set(user); 832 + else 833 + user_event_register_clear(user); 834 + 835 + user->status = status; 875 836 } 876 837 877 838 /* ··· 1381 1332 if (size > PAGE_SIZE) 1382 1333 return -E2BIG; 1383 1334 1384 - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); 1335 + if (size < offsetofend(struct user_reg, write_index)) 1336 + return -EINVAL; 1337 + 1338 + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); 1339 + 1340 + if (ret) 1341 + return ret; 1342 + 1343 + kreg->size = size; 1344 + 1345 + return 0; 1385 1346 } 1386 1347 
1387 1348 /* ··· 1435 1376 return ret; 1436 1377 1437 1378 put_user((u32)ret, &ureg->write_index); 1438 - put_user(user->index, &ureg->status_index); 1379 + put_user(user->index, &ureg->status_bit); 1439 1380 1440 1381 return 0; 1441 1382 } ··· 1544 1485 { 1545 1486 unsigned long size = vma->vm_end - vma->vm_start; 1546 1487 1547 - if (size != MAX_EVENTS) 1488 + if (size != MAX_BYTES) 1548 1489 return -EINVAL; 1549 1490 1550 1491 return remap_pfn_range(vma, vma->vm_start, ··· 1579 1520 mutex_lock(&reg_mutex); 1580 1521 1581 1522 hash_for_each(register_table, i, user, node) { 1582 - status = register_page_data[user->index]; 1523 + status = user->status; 1583 1524 flags = user->flags; 1584 1525 1585 1526 seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+18 -7
samples/user_events/example.c
··· 12 12 #include <fcntl.h> 13 13 #include <stdio.h> 14 14 #include <unistd.h> 15 + #include <asm/bitsperlong.h> 16 + #include <endian.h> 15 17 #include <linux/user_events.h> 18 + 19 + #if __BITS_PER_LONG == 64 20 + #define endian_swap(x) htole64(x) 21 + #else 22 + #define endian_swap(x) htole32(x) 23 + #endif 16 24 17 25 /* Assumes debugfs is mounted */ 18 26 const char *data_file = "/sys/kernel/debug/tracing/user_events_data"; 19 27 const char *status_file = "/sys/kernel/debug/tracing/user_events_status"; 20 28 21 - static int event_status(char **status) 29 + static int event_status(long **status) 22 30 { 23 31 int fd = open(status_file, O_RDONLY); 24 32 ··· 41 33 return 0; 42 34 } 43 35 44 - static int event_reg(int fd, const char *command, int *status, int *write) 36 + static int event_reg(int fd, const char *command, long *index, long *mask, 37 + int *write) 45 38 { 46 39 struct user_reg reg = {0}; 47 40 ··· 52 43 if (ioctl(fd, DIAG_IOCSREG, &reg) == -1) 53 44 return -1; 54 45 55 - *status = reg.status_index; 46 + *index = reg.status_bit / __BITS_PER_LONG; 47 + *mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG)); 56 48 *write = reg.write_index; 57 49 58 50 return 0; ··· 61 51 62 52 int main(int argc, char **argv) 63 53 { 64 - int data_fd, status, write; 65 - char *status_page; 54 + int data_fd, write; 55 + long index, mask; 56 + long *status_page; 66 57 struct iovec io[2]; 67 58 __u32 count = 0; 68 59 ··· 72 61 73 62 data_fd = open(data_file, O_RDWR); 74 63 75 - if (event_reg(data_fd, "test u32 count", &status, &write) == -1) 64 + if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1) 76 65 return errno; 77 66 78 67 /* Setup iovec */ ··· 86 75 getchar(); 87 76 88 77 /* Check if anyone is listening */ 89 - if (status_page[status]) { 78 + if (status_page[index] & mask) { 90 79 /* Yep, trace out our data */ 91 80 writev(data_fd, (const struct iovec *)io, 2); 92 81
+39 -8
tools/testing/selftests/user_events/ftrace_test.c
··· 22 22 const char *trace_file = "/sys/kernel/debug/tracing/trace"; 23 23 const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format"; 24 24 25 + static inline int status_check(char *status_page, int status_bit) 26 + { 27 + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); 28 + } 29 + 25 30 static int trace_bytes(void) 26 31 { 27 32 int fd = open(trace_file, O_RDONLY); ··· 202 197 /* Register should work */ 203 198 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); 204 199 ASSERT_EQ(0, reg.write_index); 205 - ASSERT_NE(0, reg.status_index); 200 + ASSERT_NE(0, reg.status_bit); 206 201 207 202 /* Multiple registers should result in same index */ 208 203 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); 209 204 ASSERT_EQ(0, reg.write_index); 210 - ASSERT_NE(0, reg.status_index); 205 + ASSERT_NE(0, reg.status_bit); 211 206 212 207 /* Ensure disabled */ 213 208 self->enable_fd = open(enable_file, O_RDWR); ··· 217 212 /* MMAP should work and be zero'd */ 218 213 ASSERT_NE(MAP_FAILED, status_page); 219 214 ASSERT_NE(NULL, status_page); 220 - ASSERT_EQ(0, status_page[reg.status_index]); 215 + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); 221 216 222 217 /* Enable event and ensure bits updated in status */ 223 218 ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) 224 - ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); 219 + ASSERT_NE(0, status_check(status_page, reg.status_bit)); 225 220 226 221 /* Disable event and ensure bits updated in status */ 227 222 ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) 228 - ASSERT_EQ(0, status_page[reg.status_index]); 223 + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); 229 224 230 225 /* File still open should return -EBUSY for delete */ 231 226 ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); ··· 245 240 struct iovec io[3]; 246 241 __u32 field1, field2; 247 242 int before = 0, after = 0; 243 + int page_size = 
sysconf(_SC_PAGESIZE); 244 + char *status_page; 248 245 249 246 reg.size = sizeof(reg); 250 247 reg.name_args = (__u64)"__test_event u32 field1; u32 field2"; ··· 261 254 io[2].iov_base = &field2; 262 255 io[2].iov_len = sizeof(field2); 263 256 257 + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, 258 + self->status_fd, 0); 259 + 264 260 /* Register should work */ 265 261 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); 266 262 ASSERT_EQ(0, reg.write_index); 267 - ASSERT_NE(0, reg.status_index); 263 + ASSERT_NE(0, reg.status_bit); 264 + 265 + /* MMAP should work and be zero'd */ 266 + ASSERT_NE(MAP_FAILED, status_page); 267 + ASSERT_NE(NULL, status_page); 268 + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); 268 269 269 270 /* Write should fail on invalid slot with ENOENT */ 270 271 io[0].iov_base = &field2; ··· 285 270 /* Enable event */ 286 271 self->enable_fd = open(enable_file, O_RDWR); 287 272 ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) 273 + 274 + /* Event should now be enabled */ 275 + ASSERT_NE(0, status_check(status_page, reg.status_bit)); 288 276 289 277 /* Write should make it out to ftrace buffers */ 290 278 before = trace_bytes(); ··· 316 298 /* Register should work */ 317 299 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); 318 300 ASSERT_EQ(0, reg.write_index); 319 - ASSERT_NE(0, reg.status_index); 301 + ASSERT_NE(0, reg.status_bit); 320 302 321 303 /* Write should work normally */ 322 304 ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); ··· 333 315 int loc, bytes; 334 316 char data[8]; 335 317 int before = 0, after = 0; 318 + int page_size = sysconf(_SC_PAGESIZE); 319 + char *status_page; 320 + 321 + status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, 322 + self->status_fd, 0); 336 323 337 324 reg.size = sizeof(reg); 338 325 reg.name_args = (__u64)"__test_event __rel_loc char[] data"; ··· 345 322 /* Register should work */ 346 323 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, 
&reg)); 347 324 ASSERT_EQ(0, reg.write_index); 348 - ASSERT_NE(0, reg.status_index); 325 + ASSERT_NE(0, reg.status_bit); 326 + 327 + /* MMAP should work and be zero'd */ 328 + ASSERT_NE(MAP_FAILED, status_page); 329 + ASSERT_NE(NULL, status_page); 330 + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); 349 331 350 332 io[0].iov_base = &reg.write_index; 351 333 io[0].iov_len = sizeof(reg.write_index); ··· 367 339 /* Enable event */ 368 340 self->enable_fd = open(enable_file, O_RDWR); 369 341 ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) 342 + 343 + /* Event should now be enabled */ 344 + ASSERT_NE(0, status_check(status_page, reg.status_bit)); 370 345 371 346 /* Full in-bounds write should work */ 372 347 before = trace_bytes();
+8 -3
tools/testing/selftests/user_events/perf_test.c
··· 35 35 return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); 36 36 } 37 37 38 + static inline int status_check(char *status_page, int status_bit) 39 + { 40 + return status_page[status_bit >> 3] & (1 << (status_bit & 7)); 41 + } 42 + 38 43 static int get_id(void) 39 44 { 40 45 FILE *fp = fopen(id_file, "r"); ··· 125 120 /* Register should work */ 126 121 ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); 127 122 ASSERT_EQ(0, reg.write_index); 128 - ASSERT_NE(0, reg.status_index); 129 - ASSERT_EQ(0, status_page[reg.status_index]); 123 + ASSERT_NE(0, reg.status_bit); 124 + ASSERT_EQ(0, status_check(status_page, reg.status_bit)); 130 125 131 126 /* Id should be there */ 132 127 id = get_id(); ··· 149 144 ASSERT_NE(MAP_FAILED, perf_page); 150 145 151 146 /* Status should be updated */ 152 - ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); 147 + ASSERT_NE(0, status_check(status_page, reg.status_bit)); 153 148 154 149 event.index = reg.write_index; 155 150 event.field1 = 0xc001;