// Tracy Profiler — system-level trace capture (ETW on Windows, perf_event on Linux).
1#include "TracyDebug.hpp"
2#include "TracyStringHelpers.hpp"
3#include "TracySysTrace.hpp"
4#include "../common/TracySystem.hpp"
5
6#ifdef TRACY_HAS_SYSTEM_TRACING
7
8#ifndef TRACY_SAMPLING_HZ
9# if defined _WIN32
10# define TRACY_SAMPLING_HZ 8000
11# elif defined __linux__
12# define TRACY_SAMPLING_HZ 10000
13# endif
14#endif
15
16namespace tracy
17{
18
19static constexpr int GetSamplingFrequency()
20{
21#if defined _WIN32
22 return TRACY_SAMPLING_HZ > 8000 ? 8000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ );
23#else
24 return TRACY_SAMPLING_HZ > 1000000 ? 1000000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ );
25#endif
26}
27
// Sampling period in nanoseconds, derived from the clamped sampling frequency.
static constexpr int GetSamplingPeriod()
{
    return 1000000000 / GetSamplingFrequency();
}
32
33}
34
35# if defined _WIN32
36
37# ifndef NOMINMAX
38# define NOMINMAX
39# endif
40
41# define INITGUID
42# include <assert.h>
43# include <string.h>
44# include <windows.h>
45# include <dbghelp.h>
46# include <evntrace.h>
47# include <evntcons.h>
48# include <psapi.h>
49# include <winternl.h>
50
51# include "../common/TracyAlloc.hpp"
52# include "../common/TracySystem.hpp"
53# include "TracyProfiler.hpp"
54# include "TracyThread.hpp"
55
56namespace tracy
57{
58
59static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } };
60static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } };
61static const GUID ThreadV2Guid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } };
62
63
64static TRACEHANDLE s_traceHandle;
65static TRACEHANDLE s_traceHandle2;
66static EVENT_TRACE_PROPERTIES* s_prop;
67static DWORD s_pid;
68
69static EVENT_TRACE_PROPERTIES* s_propVsync;
70static TRACEHANDLE s_traceHandleVsync;
71static TRACEHANDLE s_traceHandleVsync2;
72Thread* s_threadVsync = nullptr;
73
// On-wire layout of the ETW CSwitch (context switch) event payload.
// Field order and sizes must match the OS-defined record exactly.
struct CSwitch
{
    uint32_t newThreadId;
    uint32_t oldThreadId;
    int8_t newThreadPriority;
    int8_t oldThreadPriority;
    uint8_t previousCState;
    int8_t spareByte;
    int8_t oldThreadWaitReason;
    int8_t oldThreadWaitMode;
    int8_t oldThreadState;
    int8_t oldThreadWaitIdealProcessor;
    uint32_t newThreadWaitTime;
    uint32_t reserved;
};
89
// On-wire layout of the ETW ReadyThread (thread wakeup) event payload.
struct ReadyThread
{
    uint32_t threadId;
    int8_t adjustReason;
    int8_t adjustIncrement;
    int8_t flag;
    int8_t reserverd;   // (sic) — name kept to avoid churn; field is padding
};
98
// On-wire layout of the ETW Thread_TypeGroup1 event payload; only processId
// and threadId are consumed here (see EventRecordCallback).
struct ThreadTrace
{
    uint32_t processId;
    uint32_t threadId;
    uint32_t stackBase;
    uint32_t stackLimit;
    uint32_t userStackBase;
    uint32_t userStackLimit;
    uint32_t startAddr;
    uint32_t win32StartAddr;
    uint32_t tebBase;
    uint32_t subProcessTag;
};
112
// On-wire layout of the ETW StackWalk event: a 16-byte header followed by up
// to 192 frame addresses. The actual frame count is derived from the record's
// UserDataLength, not from this array bound.
struct StackWalkEvent
{
    uint64_t eventTimeStamp;
    uint32_t stackProcess;
    uint32_t stackThread;
    uint64_t stack[192];
};
120
// On-wire layout of the DxgKrnl VSyncDPC_Info event payload; only
// vidPnTargetId (the display output id) is consumed here.
struct VSyncInfo
{
    void* dxgAdapter;
    uint32_t vidPnTargetId;
    uint64_t scannedPhysicalAddress;
    uint32_t vidPnSourceId;
    uint32_t frameNumber;
    int64_t frameQpcTime;
    void* hFlipDevice;
    uint32_t flipType;
    uint64_t flipFenceId;
};
133
// Signatures of OS entry points that are resolved dynamically:
// NtQueryInformationThread lives in ntdll, and the K32* variants of the psapi
// functions are used so no load-time dependency on psapi.dll is introduced.
extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG );
extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD );
extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD );
extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD );
extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* );

t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "NtQueryInformationThread" );
t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32EnumProcessModules" );
t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleInformation" );
t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleBaseNameA" );

// Resolved lazily in SysTraceStart — not available on older Windows versions.
static t_GetThreadDescription _GetThreadDescription = 0;
146
147
// ETW consumer callback for the NT kernel logger session. Decodes raw event
// payloads (which must match the struct layouts above) and forwards them to
// the profiler's lock-free queue.
void WINAPI EventRecordCallback( PEVENT_RECORD record )
{
#ifdef TRACY_ON_DEMAND
    // Drop events while no client is connected.
    if( !GetProfiler().IsConnected() ) return;
#endif

    const auto& hdr = record->EventHeader;
    // Dispatch on the first dword of the provider GUID only — cheaper than a
    // full GUID compare and sufficient for the providers enabled here.
    switch( hdr.ProviderId.Data1 )
    {
    case 0x3d6fa8d1:    // Thread Guid
        if( hdr.EventDescriptor.Opcode == 36 )
        {
            // CSwitch: a context switch happened on some core.
            const auto cswitch = (const CSwitch*)record->UserData;

            TracyLfqPrepare( QueueType::ContextSwitch );
            MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart );
            MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId );
            MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId );
            MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber );
            MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason );
            MemWrite( &item->contextSwitch.state, cswitch->oldThreadState );
            TracyLfqCommit;
        }
        else if( hdr.EventDescriptor.Opcode == 50 )
        {
            // ReadyThread: a thread became runnable.
            const auto rt = (const ReadyThread*)record->UserData;

            TracyLfqPrepare( QueueType::ThreadWakeup );
            MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart );
            MemWrite( &item->threadWakeup.thread, rt->threadId );
            TracyLfqCommit;
        }
        else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 )
        {
            // Thread start (1) / enumeration start (3): record tid -> pid mapping.
            const auto tt = (const ThreadTrace*)record->UserData;

            uint64_t tid = tt->threadId;
            if( tid == 0 ) return;
            uint64_t pid = tt->processId;
            TracyLfqPrepare( QueueType::TidToPid );
            MemWrite( &item->tidToPid.tid, tid );
            MemWrite( &item->tidToPid.pid, pid );
            TracyLfqCommit;
        }
        break;
    case 0xdef2fe46:    // StackWalk Guid
        if( hdr.EventDescriptor.Opcode == 32 )
        {
            const auto sw = (const StackWalkEvent*)record->UserData;
            // Only keep samples belonging to this process.
            if( sw->stackProcess == s_pid )
            {
                // Payload = 16-byte header + 8-byte frame addresses.
                const uint64_t sz = ( record->UserDataLength - 16 ) / 8;
                if( sz > 0 )
                {
                    // Build a fat item: [frame count][frames...]; the pointer
                    // is handed over through the queue.
                    auto trace = (uint64_t*)tracy_malloc( ( 1 + sz ) * sizeof( uint64_t ) );
                    memcpy( trace, &sz, sizeof( uint64_t ) );
                    memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz );
                    TracyLfqPrepare( QueueType::CallstackSample );
                    MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp );
                    MemWrite( &item->callstackSampleFat.thread, sw->stackThread );
                    MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
                    TracyLfqCommit;
                }
            }
        }
        break;
    default:
        break;
    }
}
218
// ETW consumer callback for the "TracyVsync" session. Only the DxgKrnl
// VSyncDPC_Info event (id 0x0011) should arrive, as filtered in SetupVsync().
void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record )
{
#ifdef TRACY_ON_DEMAND
    if( !GetProfiler().IsConnected() ) return;
#endif

    const auto& hdr = record->EventHeader;
    assert( hdr.ProviderId.Data1 == 0x802EC45A );
    assert( hdr.EventDescriptor.Id == 0x0011 );

    const auto vs = (const VSyncInfo*)record->UserData;

    TracyLfqPrepare( QueueType::FrameVsync );
    MemWrite( &item->frameVsync.time, hdr.TimeStamp.QuadPart );
    // vidPnTargetId identifies which display output the vblank belongs to.
    MemWrite( &item->frameVsync.id, vs->vidPnTargetId );
    TracyLfqCommit;
}
236
237static void SetupVsync()
238{
239#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE && !defined(__MINGW32__)
240 const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH;
241 s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz );
242 memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) );
243 s_propVsync->LogFileMode = EVENT_TRACE_REAL_TIME_MODE;
244 s_propVsync->Wnode.BufferSize = psz;
245#ifdef TRACY_TIMER_QPC
246 s_propVsync->Wnode.ClientContext = 1;
247#else
248 s_propVsync->Wnode.ClientContext = 3;
249#endif
250 s_propVsync->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES );
251 strcpy( ((char*)s_propVsync) + sizeof( EVENT_TRACE_PROPERTIES ), "TracyVsync" );
252
253 auto backup = tracy_malloc( psz );
254 memcpy( backup, s_propVsync, psz );
255
256 const auto controlStatus = ControlTraceA( 0, "TracyVsync", s_propVsync, EVENT_TRACE_CONTROL_STOP );
257 if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND )
258 {
259 tracy_free( backup );
260 tracy_free( s_propVsync );
261 return;
262 }
263
264 memcpy( s_propVsync, backup, psz );
265 tracy_free( backup );
266
267 const auto startStatus = StartTraceA( &s_traceHandleVsync, "TracyVsync", s_propVsync );
268 if( startStatus != ERROR_SUCCESS )
269 {
270 tracy_free( s_propVsync );
271 return;
272 }
273
274 EVENT_FILTER_EVENT_ID fe = {};
275 fe.FilterIn = TRUE;
276 fe.Count = 1;
277 fe.Events[0] = 0x0011; // VSyncDPC_Info
278
279 EVENT_FILTER_DESCRIPTOR desc = {};
280 desc.Ptr = (ULONGLONG)&fe;
281 desc.Size = sizeof( fe );
282 desc.Type = EVENT_FILTER_TYPE_EVENT_ID;
283
284 ENABLE_TRACE_PARAMETERS params = {};
285 params.Version = ENABLE_TRACE_PARAMETERS_VERSION_2;
286 params.EnableProperty = EVENT_ENABLE_PROPERTY_IGNORE_KEYWORD_0;
287 params.SourceId = s_propVsync->Wnode.Guid;
288 params.EnableFilterDesc = &desc;
289 params.FilterDescCount = 1;
290
291 uint64_t mask = 0x4000000000000001; // Microsoft_Windows_DxgKrnl_Performance | Base
292 if( EnableTraceEx2( s_traceHandleVsync, &DxgKrnlGuid, EVENT_CONTROL_CODE_ENABLE_PROVIDER, TRACE_LEVEL_INFORMATION, mask, mask, 0, ¶ms ) != ERROR_SUCCESS )
293 {
294 tracy_free( s_propVsync );
295 return;
296 }
297
298 char loggerName[MAX_PATH];
299 strcpy( loggerName, "TracyVsync" );
300
301 EVENT_TRACE_LOGFILEA log = {};
302 log.LoggerName = loggerName;
303 log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP;
304 log.EventRecordCallback = EventRecordCallbackVsync;
305
306 s_traceHandleVsync2 = OpenTraceA( &log );
307 if( s_traceHandleVsync2 == (TRACEHANDLE)INVALID_HANDLE_VALUE )
308 {
309 CloseTrace( s_traceHandleVsync );
310 tracy_free( s_propVsync );
311 return;
312 }
313
314 s_threadVsync = (Thread*)tracy_malloc( sizeof( Thread ) );
315 new(s_threadVsync) Thread( [] (void*) {
316 ThreadExitHandler threadExitHandler;
317 SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
318 SetThreadName( "Tracy Vsync" );
319 ProcessTrace( &s_traceHandleVsync2, 1, nullptr, nullptr );
320 }, nullptr );
321#endif
322}
323
// ETW expresses the profiling interval in 100 ns units, hence the division.
static constexpr int GetSamplingInterval()
{
    return GetSamplingPeriod() / 100;
}
328
// Starts the NT kernel logger ETW session with context-switch, thread and
// (on 64-bit OS) stack-sampling events enabled, and opens a consumer handle
// for SysTraceWorker. Returns false when privileges or session setup fail.
// samplingPeriod receives the sample period in ns (only set when sampling is
// configured).
bool SysTraceStart( int64_t& samplingPeriod )
{
    // GetThreadDescription is not present on older systems; resolve lazily.
    if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" );

    s_pid = GetCurrentProcessId();

#if defined _WIN64
    constexpr bool isOs64Bit = true;
#else
    // 32-bit build: the OS is 64-bit iff this process runs under WOW64.
    BOOL _iswow64;
    IsWow64Process( GetCurrentProcess(), &_iswow64 );
    const bool isOs64Bit = _iswow64;
#endif

    // Kernel tracing requires SeSystemProfilePrivilege on the process token.
    TOKEN_PRIVILEGES priv = {};
    priv.PrivilegeCount = 1;
    priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
    if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false;

    HANDLE pt;
    if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false;
    const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr );
    CloseHandle( pt );
    if( adjust == 0 ) return false;
    // AdjustTokenPrivileges can return success without granting the
    // privilege; GetLastError distinguishes the partial case.
    const auto status = GetLastError();
    if( status != ERROR_SUCCESS ) return false;

    if( isOs64Bit )
    {
        // Configure the sampled-profile interval (only attempted on 64-bit OS).
        TRACE_PROFILE_INTERVAL interval = {};
        interval.Interval = GetSamplingInterval();
        const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) );
        if( intervalStatus != ERROR_SUCCESS ) return false;
        samplingPeriod = GetSamplingPeriod();
    }

    const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME );
    s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz );
    memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) );
    ULONG flags = 0;
#ifndef TRACY_NO_CONTEXT_SWITCH
    flags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER | EVENT_TRACE_FLAG_THREAD;
#endif
#ifndef TRACY_NO_SAMPLING
    if( isOs64Bit ) flags |= EVENT_TRACE_FLAG_PROFILE;
#endif
    s_prop->EnableFlags = flags;
    s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE;
    s_prop->Wnode.BufferSize = psz;
    s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID;
#ifdef TRACY_TIMER_QPC
    s_prop->Wnode.ClientContext = 1;    // QPC timestamps
#else
    s_prop->Wnode.ClientContext = 3;    // CPU cycle counter timestamps
#endif
    s_prop->Wnode.Guid = SystemTraceControlGuid;
    s_prop->BufferSize = 1024;
    // Scale buffering with core count to reduce event loss under load.
    s_prop->MinimumBuffers = std::thread::hardware_concurrency() * 4;
    s_prop->MaximumBuffers = std::thread::hardware_concurrency() * 6;
    s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES );
    memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) );

    // ControlTrace modifies the properties block; keep a pristine copy.
    auto backup = tracy_malloc( psz );
    memcpy( backup, s_prop, psz );

    // Stop any stale kernel logger session from a previous run.
    const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP );
    if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND )
    {
        tracy_free( backup );
        tracy_free( s_prop );
        return false;
    }

    memcpy( s_prop, backup, psz );
    tracy_free( backup );

    const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, s_prop );
    if( startStatus != ERROR_SUCCESS )
    {
        tracy_free( s_prop );
        return false;
    }

#ifndef TRACY_NO_SAMPLING
    if( isOs64Bit )
    {
        // Request stack traces for profile samples (PerfInfo type 46) and
        // context switches (Thread type 36).
        CLASSIC_EVENT_ID stackId[2] = {};
        stackId[0].EventGuid = PerfInfoGuid;
        stackId[0].Type = 46;
        stackId[1].EventGuid = ThreadV2Guid;
        stackId[1].Type = 36;
        const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) );
        if( stackStatus != ERROR_SUCCESS )
        {
            tracy_free( s_prop );
            return false;
        }
    }
#endif

#ifdef UNICODE
    WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )];
#else
    char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )];
#endif
    memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) );
    EVENT_TRACE_LOGFILE log = {};
    log.LoggerName = KernelLoggerName;
    log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP;
    log.EventRecordCallback = EventRecordCallback;

    s_traceHandle2 = OpenTrace( &log );
    if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE )
    {
        CloseTrace( s_traceHandle );
        tracy_free( s_prop );
        return false;
    }

#ifndef TRACY_NO_VSYNC_CAPTURE
    SetupVsync();
#endif

    return true;
}
454
// Tears down the ETW sessions. Closing the consumer handles makes the blocked
// ProcessTrace calls (in SysTraceWorker and the vsync thread) return.
void SysTraceStop()
{
    if( s_threadVsync )
    {
        CloseTrace( s_traceHandleVsync2 );
        CloseTrace( s_traceHandleVsync );
        // Thread dtor presumably joins the pump thread — see TracyThread.hpp.
        s_threadVsync->~Thread();
        tracy_free( s_threadVsync );
    }

    CloseTrace( s_traceHandle2 );
    CloseTrace( s_traceHandle );
}
468
// Pumps the kernel logger session: ProcessTrace blocks, dispatching events to
// EventRecordCallback, until the session is closed; then stops the session
// and releases the properties block.
void SysTraceWorker( void* ptr )
{
    ThreadExitHandler threadExitHandler;
    SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
    SetThreadName( "Tracy SysTrace" );
    ProcessTrace( &s_traceHandle2, 1, 0, 0 );
    ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP );
    tracy_free( s_prop );
}
478
// Resolves a thread name and owning-process name for a thread that belongs to
// another process. Both out-params always receive a heap string ("???" when
// nothing better can be determined). Also reports the tid -> pid mapping.
void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name )
{
    bool threadSent = false;
    auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) );
    if( hnd == 0 )
    {
        // Fall back to the reduced-access right (sufficient for names/pid).
        hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) );
    }
    if( hnd != 0 )
    {
        if( _GetThreadDescription )
        {
            // Preferred: explicit description set via SetThreadDescription.
            PWSTR tmp;
            _GetThreadDescription( hnd, &tmp );
            char buf[256];
            if( tmp )
            {
                auto ret = wcstombs( buf, tmp, 256 );
                // NOTE(review): wcstombs returns (size_t)-1 on conversion
                // failure, which would pass this != 0 check — confirm inputs
                // are always convertible.
                if( ret != 0 )
                {
                    threadName = CopyString( buf, ret );
                    threadSent = true;
                }
            }
        }
        const auto pid = GetProcessIdOfThread( hnd );
        if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA )
        {
            // Fallback: name the thread after the module containing its start
            // address.
            void* ptr;
            ULONG retlen;
            auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen );
            if( status == 0 )
            {
                const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid );
                // NOTE(review): OpenProcess returns NULL on failure, not
                // INVALID_HANDLE_VALUE — this check looks ineffective; confirm.
                if( phnd != INVALID_HANDLE_VALUE )
                {
                    HMODULE modules[1024];
                    DWORD needed;
                    if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 )
                    {
                        const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) );
                        for( DWORD i=0; i<sz; i++ )
                        {
                            MODULEINFO info;
                            if( _GetModuleInformation( phnd, modules[i], &info, sizeof( info ) ) != 0 )
                            {
                                // Does the start address fall inside this module?
                                if( (uint64_t)ptr >= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage )
                                {
                                    char buf2[1024];
                                    const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 );
                                    if( modlen != 0 )
                                    {
                                        threadName = CopyString( buf2, modlen );
                                        threadSent = true;
                                    }
                                }
                            }
                        }
                    }
                    CloseHandle( phnd );
                }
            }
        }
        CloseHandle( hnd );
        if( !threadSent )
        {
            threadName = CopyString( "???", 3 );
            threadSent = true;
        }
        if( pid != 0 )
        {
            {
                // Forward the tid -> pid association to the profiler.
                uint64_t _pid = pid;
                TracyLfqPrepare( QueueType::TidToPid );
                MemWrite( &item->tidToPid.tid, thread );
                MemWrite( &item->tidToPid.pid, _pid );
                TracyLfqCommit;
            }
            if( pid == 4 )
            {
                // PID 4 is the Windows System process.
                name = CopyStringFast( "System", 6 );
                return;
            }
            else
            {
                const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid );
                if( phnd != INVALID_HANDLE_VALUE )
                {
                    char buf2[1024];
                    const auto sz = GetProcessImageFileNameA( phnd, buf2, 1024 );
                    CloseHandle( phnd );
                    if( sz != 0 )
                    {
                        // Strip the device path, keep only the basename.
                        auto ptr = buf2 + sz - 1;
                        while( ptr > buf2 && *ptr != '\\' ) ptr--;
                        if( *ptr == '\\' ) ptr++;
                        name = CopyStringFast( ptr );
                        return;
                    }
                }
            }
        }
    }

    if( !threadSent )
    {
        threadName = CopyString( "???", 3 );
    }
    name = CopyStringFast( "???", 3 );
}
589
590}
591
592# elif defined __linux__
593
594# include <sys/types.h>
595# include <sys/stat.h>
596# include <sys/wait.h>
597# include <fcntl.h>
598# include <inttypes.h>
599# include <limits>
600# include <poll.h>
601# include <stdio.h>
602# include <stdlib.h>
603# include <string.h>
604# include <unistd.h>
605# include <atomic>
606# include <thread>
607# include <linux/perf_event.h>
608# include <linux/version.h>
609# include <sys/mman.h>
610# include <sys/ioctl.h>
611# include <sys/syscall.h>
612
613# if defined __i386 || defined __x86_64__
614# include "TracyCpuid.hpp"
615# endif
616
617# include "TracyProfiler.hpp"
618# include "TracyRingBuffer.hpp"
619# include "TracyThread.hpp"
620
621namespace tracy
622{
623
624static std::atomic<bool> traceActive { false };
625static int s_numCpus = 0;
626static int s_numBuffers = 0;
627static int s_ctxBufferIdx = 0;
628
629static RingBuffer* s_ring = nullptr;
630
631static const int ThreadHashSize = 4 * 1024;
632static uint32_t s_threadHash[ThreadHashSize] = {};
633
634static bool CurrentProcOwnsThread( uint32_t tid )
635{
636 const auto hash = tid & ( ThreadHashSize-1 );
637 const auto hv = s_threadHash[hash];
638 if( hv == tid ) return true;
639 if( hv == -tid ) return false;
640
641 char path[256];
642 sprintf( path, "/proc/self/task/%d", tid );
643 struct stat st;
644 if( stat( path, &st ) == 0 )
645 {
646 s_threadHash[hash] = tid;
647 return true;
648 }
649 else
650 {
651 s_threadHash[hash] = -tid;
652 return false;
653 }
654}
655
// Thin wrapper: glibc exposes no perf_event_open() symbol, so issue the raw
// syscall directly.
static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags )
{
    return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
}
660
// Tags stored with each RingBuffer so the worker knows how to decode the
// records it drains from that buffer.
enum TraceEventId
{
    EventCallstack,
    EventCpuCycles,
    EventInstructionsRetired,
    EventCacheReference,
    EventCacheMiss,
    EventBranchRetired,
    EventBranchMiss,
    EventVsync,
    EventContextSwitch,
    EventWakeup,
};
674
675static void ProbePreciseIp( perf_event_attr& pe, unsigned long long config0, unsigned long long config1, pid_t pid )
676{
677 pe.config = config1;
678 pe.precise_ip = 3;
679 while( pe.precise_ip != 0 )
680 {
681 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
682 if( fd != -1 )
683 {
684 close( fd );
685 break;
686 }
687 pe.precise_ip--;
688 }
689 pe.config = config0;
690 while( pe.precise_ip != 0 )
691 {
692 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
693 if( fd != -1 )
694 {
695 close( fd );
696 break;
697 }
698 pe.precise_ip--;
699 }
700 TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip );
701}
702
703static void ProbePreciseIp( perf_event_attr& pe, pid_t pid )
704{
705 pe.precise_ip = 3;
706 while( pe.precise_ip != 0 )
707 {
708 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
709 if( fd != -1 )
710 {
711 close( fd );
712 break;
713 }
714 pe.precise_ip--;
715 }
716 TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip );
717}
718
719static bool IsGenuineIntel()
720{
721#if defined __i386 || defined __x86_64__
722 uint32_t regs[4] = {};
723 __get_cpuid( 0, regs, regs+1, regs+2, regs+3 );
724 char manufacturer[12];
725 memcpy( manufacturer, regs+1, 4 );
726 memcpy( manufacturer+4, regs+3, 4 );
727 memcpy( manufacturer+8, regs+2, 4 );
728 return memcmp( manufacturer, "GenuineIntel", 12 ) == 0;
729#else
730 return false;
731#endif
732}
733
// Reads up to 63 bytes from the given file into a static buffer and returns
// it NUL-terminated, or nullptr when the file cannot be opened or read.
// Not reentrant: every call reuses the same static storage.
static const char* ReadFile( const char* path )
{
    static char buf[64];
    const int fd = open( path, O_RDONLY );
    if( fd < 0 ) return nullptr;
    const auto len = read( fd, buf, sizeof( buf ) - 1 );
    close( fd );
    if( len < 0 ) return nullptr;
    buf[len] = '\0';
    return buf;
}
746
747bool SysTraceStart( int64_t& samplingPeriod )
748{
749#ifndef CLOCK_MONOTONIC_RAW
750 return false;
751#endif
752
753 const auto paranoidLevelStr = ReadFile( "/proc/sys/kernel/perf_event_paranoid" );
754 if( !paranoidLevelStr ) return false;
755#ifdef TRACY_VERBOSE
756 int paranoidLevel = 2;
757 paranoidLevel = atoi( paranoidLevelStr );
758 TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel );
759#endif
760
761 int switchId = -1, wakeupId = -1, vsyncId = -1;
762 const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" );
763 if( switchIdStr ) switchId = atoi( switchIdStr );
764 const auto wakeupIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" );
765 if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr );
766 const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" );
767 if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr );
768
769 TracyDebug( "sched_switch id: %i\n", switchId );
770 TracyDebug( "sched_wakeup id: %i\n", wakeupId );
771 TracyDebug( "drm_vblank_event id: %i\n", vsyncId );
772
773#ifdef TRACY_NO_SAMPLE_RETIREMENT
774 const bool noRetirement = true;
775#else
776 const char* noRetirementEnv = GetEnvVar( "TRACY_NO_SAMPLE_RETIREMENT" );
777 const bool noRetirement = noRetirementEnv && noRetirementEnv[0] == '1';
778#endif
779
780#ifdef TRACY_NO_SAMPLE_CACHE
781 const bool noCache = true;
782#else
783 const char* noCacheEnv = GetEnvVar( "TRACY_NO_SAMPLE_CACHE" );
784 const bool noCache = noCacheEnv && noCacheEnv[0] == '1';
785#endif
786
787#ifdef TRACY_NO_SAMPLE_BRANCH
788 const bool noBranch = true;
789#else
790 const char* noBranchEnv = GetEnvVar( "TRACY_NO_SAMPLE_BRANCH" );
791 const bool noBranch = noBranchEnv && noBranchEnv[0] == '1';
792#endif
793
794#ifdef TRACY_NO_CONTEXT_SWITCH
795 const bool noCtxSwitch = true;
796#else
797 const char* noCtxSwitchEnv = GetEnvVar( "TRACY_NO_CONTEXT_SWITCH" );
798 const bool noCtxSwitch = noCtxSwitchEnv && noCtxSwitchEnv[0] == '1';
799#endif
800
801#ifdef TRACY_NO_VSYNC_CAPTURE
802 const bool noVsync = true;
803#else
804 const char* noVsyncEnv = GetEnvVar( "TRACY_NO_VSYNC_CAPTURE" );
805 const bool noVsync = noVsyncEnv && noVsyncEnv[0] == '1';
806#endif
807
808 samplingPeriod = GetSamplingPeriod();
809 uint32_t currentPid = (uint32_t)getpid();
810
811 s_numCpus = (int)std::thread::hardware_concurrency();
812
813 const auto maxNumBuffers = s_numCpus * (
814 1 + // software sampling
815 2 + // CPU cycles + instructions retired
816 2 + // cache reference + miss
817 2 + // branch retired + miss
818 2 + // context switches + wakeups
819 1 // vsync
820 );
821 s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers );
822 s_numBuffers = 0;
823
824 // software sampling
825 perf_event_attr pe = {};
826 pe.type = PERF_TYPE_SOFTWARE;
827 pe.size = sizeof( perf_event_attr );
828 pe.config = PERF_COUNT_SW_CPU_CLOCK;
829 pe.sample_freq = GetSamplingFrequency();
830 pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN;
831#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
832 pe.sample_max_stack = 127;
833#endif
834 pe.disabled = 1;
835 pe.freq = 1;
836 pe.inherit = 1;
837#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
838 pe.use_clockid = 1;
839 pe.clockid = CLOCK_MONOTONIC_RAW;
840#endif
841
842 TracyDebug( "Setup software sampling\n" );
843 ProbePreciseIp( pe, currentPid );
844 for( int i=0; i<s_numCpus; i++ )
845 {
846 int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
847 if( fd == -1 )
848 {
849 pe.exclude_kernel = 1;
850 ProbePreciseIp( pe, currentPid );
851 fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
852 if( fd == -1 )
853 {
854 TracyDebug( " Failed to setup!\n");
855 break;
856 }
857 TracyDebug( " No access to kernel samples\n" );
858 }
859 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
860 if( s_ring[s_numBuffers].IsValid() )
861 {
862 s_numBuffers++;
863 TracyDebug( " Core %i ok\n", i );
864 }
865 }
866
867 // CPU cycles + instructions retired
868 pe = {};
869 pe.type = PERF_TYPE_HARDWARE;
870 pe.size = sizeof( perf_event_attr );
871 pe.sample_freq = 5000;
872 pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TIME;
873 pe.disabled = 1;
874 pe.exclude_kernel = 1;
875 pe.exclude_guest = 1;
876 pe.exclude_hv = 1;
877 pe.freq = 1;
878 pe.inherit = 1;
879#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
880 pe.use_clockid = 1;
881 pe.clockid = CLOCK_MONOTONIC_RAW;
882#endif
883
884 if( !noRetirement )
885 {
886 TracyDebug( "Setup sampling cycles + retirement\n" );
887 ProbePreciseIp( pe, PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, currentPid );
888 for( int i=0; i<s_numCpus; i++ )
889 {
890 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
891 if( fd != -1 )
892 {
893 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCpuCycles );
894 if( s_ring[s_numBuffers].IsValid() )
895 {
896 s_numBuffers++;
897 TracyDebug( " Core %i ok\n", i );
898 }
899 }
900 }
901
902 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
903 for( int i=0; i<s_numCpus; i++ )
904 {
905 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
906 if( fd != -1 )
907 {
908 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventInstructionsRetired );
909 if( s_ring[s_numBuffers].IsValid() )
910 {
911 s_numBuffers++;
912 TracyDebug( " Core %i ok\n", i );
913 }
914 }
915 }
916 }
917
918 // cache reference + miss
919 if( !noCache )
920 {
921 TracyDebug( "Setup sampling CPU cache references + misses\n" );
922 ProbePreciseIp( pe, PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES, currentPid );
923 if( IsGenuineIntel() )
924 {
925 pe.precise_ip = 0;
926 TracyDebug( " CPU is GenuineIntel, forcing precise_ip down to 0\n" );
927 }
928 for( int i=0; i<s_numCpus; i++ )
929 {
930 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
931 if( fd != -1 )
932 {
933 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheReference );
934 if( s_ring[s_numBuffers].IsValid() )
935 {
936 s_numBuffers++;
937 TracyDebug( " Core %i ok\n", i );
938 }
939 }
940 }
941
942 pe.config = PERF_COUNT_HW_CACHE_MISSES;
943 for( int i=0; i<s_numCpus; i++ )
944 {
945 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
946 if( fd != -1 )
947 {
948 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheMiss );
949 if( s_ring[s_numBuffers].IsValid() )
950 {
951 s_numBuffers++;
952 TracyDebug( " Core %i ok\n", i );
953 }
954 }
955 }
956 }
957
958 // branch retired + miss
959 if( !noBranch )
960 {
961 TracyDebug( "Setup sampling CPU branch retirements + misses\n" );
962 ProbePreciseIp( pe, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, currentPid );
963 for( int i=0; i<s_numCpus; i++ )
964 {
965 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
966 if( fd != -1 )
967 {
968 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchRetired );
969 if( s_ring[s_numBuffers].IsValid() )
970 {
971 s_numBuffers++;
972 TracyDebug( " Core %i ok\n", i );
973 }
974 }
975 }
976
977 pe.config = PERF_COUNT_HW_BRANCH_MISSES;
978 for( int i=0; i<s_numCpus; i++ )
979 {
980 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
981 if( fd != -1 )
982 {
983 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchMiss );
984 if( s_ring[s_numBuffers].IsValid() )
985 {
986 s_numBuffers++;
987 TracyDebug( " Core %i ok\n", i );
988 }
989 }
990 }
991 }
992
993 s_ctxBufferIdx = s_numBuffers;
994
995 // vsync
996 if( !noVsync && vsyncId != -1 )
997 {
998 pe = {};
999 pe.type = PERF_TYPE_TRACEPOINT;
1000 pe.size = sizeof( perf_event_attr );
1001 pe.sample_period = 1;
1002 pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW;
1003 pe.disabled = 1;
1004 pe.config = vsyncId;
1005#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
1006 pe.use_clockid = 1;
1007 pe.clockid = CLOCK_MONOTONIC_RAW;
1008#endif
1009
1010 TracyDebug( "Setup vsync capture\n" );
1011 for( int i=0; i<s_numCpus; i++ )
1012 {
1013 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
1014 if( fd != -1 )
1015 {
1016 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventVsync, i );
1017 if( s_ring[s_numBuffers].IsValid() )
1018 {
1019 s_numBuffers++;
1020 TracyDebug( " Core %i ok\n", i );
1021 }
1022 }
1023 }
1024 }
1025
1026 // context switches
1027 if( !noCtxSwitch && switchId != -1 )
1028 {
1029 pe = {};
1030 pe.type = PERF_TYPE_TRACEPOINT;
1031 pe.size = sizeof( perf_event_attr );
1032 pe.sample_period = 1;
1033 pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
1034#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
1035 pe.sample_max_stack = 127;
1036#endif
1037 pe.disabled = 1;
1038 pe.inherit = 1;
1039 pe.config = switchId;
1040#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
1041 pe.use_clockid = 1;
1042 pe.clockid = CLOCK_MONOTONIC_RAW;
1043#endif
1044
1045 TracyDebug( "Setup context switch capture\n" );
1046 for( int i=0; i<s_numCpus; i++ )
1047 {
1048 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
1049 if( fd != -1 )
1050 {
1051 new( s_ring+s_numBuffers ) RingBuffer( 256*1024, fd, EventContextSwitch, i );
1052 if( s_ring[s_numBuffers].IsValid() )
1053 {
1054 s_numBuffers++;
1055 TracyDebug( " Core %i ok\n", i );
1056 }
1057 }
1058 }
1059
1060 if( wakeupId != -1 )
1061 {
1062 pe.config = wakeupId;
1063 pe.config &= ~PERF_SAMPLE_CALLCHAIN;
1064
1065 TracyDebug( "Setup wakeup capture\n" );
1066 for( int i=0; i<s_numCpus; i++ )
1067 {
1068 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
1069 if( fd != -1 )
1070 {
1071 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventWakeup, i );
1072 if( s_ring[s_numBuffers].IsValid() )
1073 {
1074 s_numBuffers++;
1075 TracyDebug( " Core %i ok\n", i );
1076 }
1077 }
1078 }
1079 }
1080 }
1081
1082 TracyDebug( "Ringbuffers in use: %i\n", s_numBuffers );
1083
1084 traceActive.store( true, std::memory_order_relaxed );
1085 return true;
1086}
1087
// Signals the sampling worker to shut down. SysTraceWorker polls traceActive
// at several points in its loop and, on seeing false, breaks out, destroys
// its ring buffers and frees the buffer array. Relaxed ordering is
// sufficient: this is a lone flag with no dependent data to publish.
void SysTraceStop()
{
    traceActive.store( false, std::memory_order_relaxed );
}
1092
1093static uint64_t* GetCallstackBlock( uint64_t cnt, RingBuffer& ring, uint64_t offset )
1094{
1095 auto trace = (uint64_t*)tracy_malloc_fast( ( 1 + cnt ) * sizeof( uint64_t ) );
1096 ring.Read( trace+1, offset, sizeof( uint64_t ) * cnt );
1097
1098#if defined __x86_64__ || defined _M_X64
1099 // remove non-canonical pointers
1100 do
1101 {
1102 const auto test = (int64_t)trace[cnt];
1103 const auto m1 = test >> 63;
1104 const auto m2 = test >> 47;
1105 if( m1 == m2 ) break;
1106 }
1107 while( --cnt > 0 );
1108 for( uint64_t j=1; j<cnt; j++ )
1109 {
1110 const auto test = (int64_t)trace[j];
1111 const auto m1 = test >> 63;
1112 const auto m2 = test >> 47;
1113 if( m1 != m2 ) trace[j] = 0;
1114 }
1115#endif
1116
1117 for( uint64_t j=1; j<=cnt; j++ )
1118 {
1119 if( trace[j] >= (uint64_t)-4095 ) // PERF_CONTEXT_MAX
1120 {
1121 memmove( trace+j, trace+j+1, sizeof( uint64_t ) * ( cnt - j ) );
1122 cnt--;
1123 }
1124 }
1125
1126 memcpy( trace, &cnt, sizeof( uint64_t ) );
1127 return trace;
1128}
1129
// Main body of the sampling thread. Drains the per-CPU perf ring buffers set
// up earlier and forwards decoded events to the profiler queue. Buffers in
// [0, ctxBufferIdx) carry callstack and hardware-counter samples and are
// drained independently per buffer; buffers in [ctxBufferIdx, numBuffers)
// carry context-switch, wakeup and vsync events, which are merged across
// CPUs by timestamp so they are emitted in global time order. Runs until
// SysTraceStop() clears traceActive, then tears down all ring buffers.
void SysTraceWorker( void* ptr )
{
    ThreadExitHandler threadExitHandler;
    SetThreadName( "Tracy Sampling" );
    InitRpmalloc();
    // Ask for real-time FIFO scheduling so the kernel ring buffers are
    // drained before they can overflow. Failure is non-fatal.
    sched_param sp = { 99 };
    if( pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ) != 0 ) {
        TracyDebug( "Failed to increase SysTraceWorker thread priority!\n" );
    }
    auto ctxBufferIdx = s_ctxBufferIdx;
    auto ringArray = s_ring;
    auto numBuffers = s_numBuffers;
    for( int i=0; i<numBuffers; i++ ) ringArray[i].Enable();
    for(;;)
    {
#ifdef TRACY_ON_DEMAND
        // No client connected: throw away everything the kernel produced so
        // the buffers do not fill up, then idle for a while.
        if( !GetProfiler().IsConnected() )
        {
            if( !traceActive.load( std::memory_order_relaxed ) ) break;
            for( int i=0; i<numBuffers; i++ )
            {
                auto& ring = ringArray[i];
                const auto head = ring.LoadHead();
                const auto tail = ring.GetTail();
                if( head != tail )
                {
                    const auto end = head - tail;
                    ring.Advance( end );
                }
            }
            if( !traceActive.load( std::memory_order_relaxed ) ) break;
            std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
            continue;
        }
#endif

        bool hadData = false;
        // First pass: sampling buffers (callstacks and hw counters). Each
        // buffer is consumed fully and independently.
        for( int i=0; i<ctxBufferIdx; i++ )
        {
            if( !traceActive.load( std::memory_order_relaxed ) ) break;
            auto& ring = ringArray[i];
            const auto head = ring.LoadHead();
            const auto tail = ring.GetTail();
            if( head == tail ) continue;
            assert( head > tail );
            hadData = true;

            const auto id = ring.GetId();
            assert( id != EventContextSwitch );
            const auto end = head - tail;
            uint64_t pos = 0;
            if( id == EventCallstack )
            {
                // Records produced by the software cpu-clock sampler.
                while( pos < end )
                {
                    perf_event_header hdr;
                    ring.Read( &hdr, pos, sizeof( perf_event_header ) );
                    if( hdr.type == PERF_RECORD_SAMPLE )
                    {
                        auto offset = pos + sizeof( perf_event_header );

                        // Layout:
                        //   u32 pid, tid
                        //   u64 time
                        //   u64 cnt
                        //   u64 ip[cnt]

                        uint32_t tid;
                        uint64_t t0;
                        uint64_t cnt;

                        offset += sizeof( uint32_t );   // skip pid
                        ring.Read( &tid, offset, sizeof( uint32_t ) );
                        offset += sizeof( uint32_t );
                        ring.Read( &t0, offset, sizeof( uint64_t ) );
                        offset += sizeof( uint64_t );
                        ring.Read( &cnt, offset, sizeof( uint64_t ) );
                        offset += sizeof( uint64_t );

                        if( cnt > 0 )
                        {
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
                            // Samples were timestamped with the kernel clock;
                            // convert to TSC to match the rest of the capture.
                            t0 = ring.ConvertTimeToTsc( t0 );
#endif
                            // Ownership of the trace block passes to the queue
                            // consumer via the fat pointer.
                            auto trace = GetCallstackBlock( cnt, ring, offset );

                            TracyLfqPrepare( QueueType::CallstackSample );
                            MemWrite( &item->callstackSampleFat.time, t0 );
                            MemWrite( &item->callstackSampleFat.thread, tid );
                            MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
                            TracyLfqCommit;
                        }
                    }
                    pos += hdr.size;
                }
            }
            else
            {
                // Hardware performance counter samples (ip + time only).
                while( pos < end )
                {
                    perf_event_header hdr;
                    ring.Read( &hdr, pos, sizeof( perf_event_header ) );
                    if( hdr.type == PERF_RECORD_SAMPLE )
                    {
                        auto offset = pos + sizeof( perf_event_header );

                        // Layout:
                        //   u64 ip
                        //   u64 time

                        uint64_t ip, t0;
                        ring.Read( &ip, offset, sizeof( uint64_t ) );
                        offset += sizeof( uint64_t );
                        ring.Read( &t0, offset, sizeof( uint64_t ) );

#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
                        t0 = ring.ConvertTimeToTsc( t0 );
#endif
                        // Map the buffer's event id to the matching queue item
                        // type; any other id here is a setup bug.
                        QueueType type;
                        switch( id )
                        {
                        case EventCpuCycles:
                            type = QueueType::HwSampleCpuCycle;
                            break;
                        case EventInstructionsRetired:
                            type = QueueType::HwSampleInstructionRetired;
                            break;
                        case EventCacheReference:
                            type = QueueType::HwSampleCacheReference;
                            break;
                        case EventCacheMiss:
                            type = QueueType::HwSampleCacheMiss;
                            break;
                        case EventBranchRetired:
                            type = QueueType::HwSampleBranchRetired;
                            break;
                        case EventBranchMiss:
                            type = QueueType::HwSampleBranchMiss;
                            break;
                        default:
                            abort();
                        }

                        TracyLfqPrepare( type );
                        MemWrite( &item->hwSample.ip, ip );
                        MemWrite( &item->hwSample.time, t0 );
                        TracyLfqCommit;
                    }
                    pos += hdr.size;
                }
            }
            assert( pos == end );
            ring.Advance( end );
        }
        if( !traceActive.load( std::memory_order_relaxed ) ) break;

        // Second pass: context switch / wakeup / vsync buffers. These must be
        // emitted in timestamp order across CPUs, so an N-way merge is done:
        // repeatedly pick the buffer whose next sample has the lowest time.
        if( ctxBufferIdx != numBuffers )
        {
            const auto ctxBufNum = numBuffers - ctxBufferIdx;

            // Snapshot each buffer's available byte range up front; only data
            // visible at this point is consumed in this round.
            int activeNum = 0;
            uint16_t active[512];
            uint32_t end[512];
            uint32_t pos[512];
            for( int i=0; i<ctxBufNum; i++ )
            {
                const auto rbIdx = ctxBufferIdx + i;
                const auto rbHead = ringArray[rbIdx].LoadHead();
                const auto rbTail = ringArray[rbIdx].GetTail();
                const auto rbActive = rbHead != rbTail;

                if( rbActive )
                {
                    active[activeNum] = (uint16_t)i;
                    activeNum++;
                    end[i] = rbHead - rbTail;
                    pos[i] = 0;
                }
                else
                {
                    end[i] = 0;
                }
            }
            if( activeNum > 0 )
            {
                hadData = true;
                while( activeNum > 0 )
                {
                    // Scan all still-active buffers for the earliest sample.
                    // Non-sample records are skipped in place; a buffer whose
                    // snapshot is exhausted is removed from the active set.
                    int sel = -1;
                    int selPos;
                    int64_t t0 = std::numeric_limits<int64_t>::max();
                    for( int i=0; i<activeNum; i++ )
                    {
                        auto idx = active[i];
                        auto rbPos = pos[idx];
                        assert( rbPos < end[idx] );
                        const auto rbIdx = ctxBufferIdx + idx;
                        perf_event_header hdr;
                        ringArray[rbIdx].Read( &hdr, rbPos, sizeof( perf_event_header ) );
                        if( hdr.type == PERF_RECORD_SAMPLE )
                        {
                            // Time is the first field of every sample record
                            // in these buffers.
                            int64_t rbTime;
                            ringArray[rbIdx].Read( &rbTime, rbPos + sizeof( perf_event_header ), sizeof( int64_t ) );
                            if( rbTime < t0 )
                            {
                                t0 = rbTime;
                                sel = idx;
                                selPos = i;
                            }
                        }
                        else
                        {
                            rbPos += hdr.size;
                            if( rbPos == end[idx] )
                            {
                                memmove( active+i, active+i+1, sizeof(*active) * ( activeNum - i - 1 ) );
                                activeNum--;
                                i--;
                            }
                            else
                            {
                                pos[idx] = rbPos;
                            }
                        }
                    }
                    if( sel >= 0 )
                    {
                        // Decode and emit the selected (earliest) sample.
                        auto& ring = ringArray[ctxBufferIdx + sel];
                        auto rbPos = pos[sel];
                        auto offset = rbPos;
                        perf_event_header hdr;
                        ring.Read( &hdr, offset, sizeof( perf_event_header ) );

#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
                        t0 = ring.ConvertTimeToTsc( t0 );
#endif

                        const auto rid = ring.GetId();
                        if( rid == EventContextSwitch )
                        {
                            // Layout:
                            //   u64 time
                            //   u64 cnt
                            //   u64 ip[cnt]
                            //   u32 size
                            //   u8  data[size]
                            // Data (not ABI stable, but has not changed since it was added, in 2009):
                            //   u8  hdr[8]
                            //   u8  prev_comm[16]
                            //   u32 prev_pid
                            //   u32 prev_prio
                            //   lng prev_state
                            //   u8  next_comm[16]
                            //   u32 next_pid
                            //   u32 next_prio

                            offset += sizeof( perf_event_header ) + sizeof( uint64_t );

                            uint64_t cnt;
                            ring.Read( &cnt, offset, sizeof( uint64_t ) );
                            offset += sizeof( uint64_t );
                            const auto traceOffset = offset;
                            // Jump over the callstack, the size field, the
                            // tracepoint header and prev_comm.
                            offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16;

                            uint32_t prev_pid, next_pid;
                            long prev_state;

                            ring.Read( &prev_pid, offset, sizeof( uint32_t ) );
                            offset += sizeof( uint32_t ) + sizeof( uint32_t );  // skip prev_prio
                            ring.Read( &prev_state, offset, sizeof( long ) );
                            offset += sizeof( long ) + 16;                      // skip next_comm
                            ring.Read( &next_pid, offset, sizeof( uint32_t ) );

                            // Translate the kernel task-state bits into Tracy's
                            // on-wire thread state / reason codes (the 100-range
                            // values are decoded by the server). NOTE(review):
                            // bit meanings assumed to follow linux/sched.h task
                            // state flags — confirm against the kernel headers.
                            uint8_t reason = 100;
                            uint8_t state;

                            if( prev_state & 0x0001 ) state = 104;
                            else if( prev_state & 0x0002 ) state = 101;
                            else if( prev_state & 0x0004 ) state = 105;
                            else if( prev_state & 0x0008 ) state = 106;
                            else if( prev_state & 0x0010 ) state = 108;
                            else if( prev_state & 0x0020 ) state = 109;
                            else if( prev_state & 0x0040 ) state = 110;
                            else if( prev_state & 0x0080 ) state = 102;
                            else state = 103;

                            TracyLfqPrepare( QueueType::ContextSwitch );
                            MemWrite( &item->contextSwitch.time, t0 );
                            MemWrite( &item->contextSwitch.oldThread, prev_pid );
                            MemWrite( &item->contextSwitch.newThread, next_pid );
                            MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) );
                            MemWrite( &item->contextSwitch.reason, reason );
                            MemWrite( &item->contextSwitch.state, state );
                            TracyLfqCommit;

                            // Also emit the callstack of the thread being
                            // switched out, but only for threads of this
                            // process — foreign stacks cannot be symbolized.
                            if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) )
                            {
                                auto trace = GetCallstackBlock( cnt, ring, traceOffset );

                                TracyLfqPrepare( QueueType::CallstackSampleContextSwitch );
                                MemWrite( &item->callstackSampleFat.time, t0 );
                                MemWrite( &item->callstackSampleFat.thread, prev_pid );
                                MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
                                TracyLfqCommit;
                            }
                        }
                        else if( rid == EventWakeup )
                        {
                            // Layout:
                            //   u64 time
                            //   u32 size
                            //   u8  data[size]
                            // Data:
                            //   u8  hdr[8]
                            //   u8  comm[16]
                            //   u32 pid
                            //   u32 prio
                            //   u64 target_cpu

                            offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16;

                            uint32_t pid;
                            ring.Read( &pid, offset, sizeof( uint32_t ) );

                            TracyLfqPrepare( QueueType::ThreadWakeup );
                            MemWrite( &item->threadWakeup.time, t0 );
                            MemWrite( &item->threadWakeup.thread, pid );
                            TracyLfqCommit;
                        }
                        else
                        {
                            assert( rid == EventVsync );
                            // Layout:
                            //   u64 time
                            //   u32 size
                            //   u8  data[size]
                            // Data (not ABI stable):
                            //   u8  hdr[8]
                            //   i32 crtc
                            //   u32 seq
                            //   i64 ktime
                            //   u8  high precision

                            offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8;

                            int32_t crtc;
                            ring.Read( &crtc, offset, sizeof( int32_t ) );

                            // Note: The timestamp value t0 might be off by a number of microseconds from the
                            // true hardware vblank event. The ktime value should be used instead, but it is
                            // measured in CLOCK_MONOTONIC time. Tracy only supports the timestamp counter
                            // register (TSC) or CLOCK_MONOTONIC_RAW clock.
#if 0
                            offset += sizeof( uint32_t ) * 2;
                            int64_t ktime;
                            ring.Read( &ktime, offset, sizeof( int64_t ) );
#endif

                            TracyLfqPrepare( QueueType::FrameVsync );
                            MemWrite( &item->frameVsync.id, crtc );
                            MemWrite( &item->frameVsync.time, t0 );
                            TracyLfqCommit;
                        }

                        // Advance the consumed buffer past this record; drop
                        // it from the merge set if its snapshot is exhausted.
                        rbPos += hdr.size;
                        if( rbPos == end[sel] )
                        {
                            memmove( active+selPos, active+selPos+1, sizeof(*active) * ( activeNum - selPos - 1 ) );
                            activeNum--;
                        }
                        else
                        {
                            pos[sel] = rbPos;
                        }
                    }
                }
                // Release the consumed byte ranges back to the kernel.
                for( int i=0; i<ctxBufNum; i++ )
                {
                    if( end[i] != 0 ) ringArray[ctxBufferIdx + i].Advance( end[i] );
                }
            }
        }
        if( !traceActive.load( std::memory_order_relaxed ) ) break;
        if( !hadData )
        {
            // Nothing was pending anywhere; back off briefly.
            std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) );
        }
    }

    // Shutdown: destroy all ring buffers (closing their perf fds) and free
    // the placement-new'd array.
    for( int i=0; i<numBuffers; i++ ) ringArray[i].~RingBuffer();
    tracy_free_fast( ringArray );
}
1522
1523void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name )
1524{
1525 FILE* f;
1526 char fn[256];
1527 sprintf( fn, "/proc/%" PRIu64 "/comm", thread );
1528 f = fopen( fn, "rb" );
1529 if( f )
1530 {
1531 char buf[256];
1532 const auto sz = fread( buf, 1, 256, f );
1533 if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0';
1534 threadName = CopyString( buf );
1535 fclose( f );
1536 }
1537 else
1538 {
1539 threadName = CopyString( "???", 3 );
1540 }
1541
1542 sprintf( fn, "/proc/%" PRIu64 "/status", thread );
1543 f = fopen( fn, "rb" );
1544 if( f )
1545 {
1546 char* tmp = (char*)tracy_malloc_fast( 8*1024 );
1547 const auto fsz = (ptrdiff_t)fread( tmp, 1, 8*1024, f );
1548 fclose( f );
1549
1550 int pid = -1;
1551 auto line = tmp;
1552 for(;;)
1553 {
1554 if( memcmp( "Tgid:\t", line, 6 ) == 0 )
1555 {
1556 pid = atoi( line + 6 );
1557 break;
1558 }
1559 while( line - tmp < fsz && *line != '\n' ) line++;
1560 if( *line != '\n' ) break;
1561 line++;
1562 }
1563 tracy_free_fast( tmp );
1564
1565 if( pid >= 0 )
1566 {
1567 {
1568 uint64_t _pid = pid;
1569 TracyLfqPrepare( QueueType::TidToPid );
1570 MemWrite( &item->tidToPid.tid, thread );
1571 MemWrite( &item->tidToPid.pid, _pid );
1572 TracyLfqCommit;
1573 }
1574 sprintf( fn, "/proc/%i/comm", pid );
1575 f = fopen( fn, "rb" );
1576 if( f )
1577 {
1578 char buf[256];
1579 const auto sz = fread( buf, 1, 256, f );
1580 if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0';
1581 name = CopyStringFast( buf );
1582 fclose( f );
1583 return;
1584 }
1585 }
1586 }
1587 name = CopyStringFast( "???", 3 );
1588}
1589
1590}
1591
1592# endif
1593
1594#endif