The open source OpenXR runtime
at main 1594 lines 55 kB view raw
1#include "TracyDebug.hpp" 2#include "TracyStringHelpers.hpp" 3#include "TracySysTrace.hpp" 4#include "../common/TracySystem.hpp" 5 6#ifdef TRACY_HAS_SYSTEM_TRACING 7 8#ifndef TRACY_SAMPLING_HZ 9# if defined _WIN32 10# define TRACY_SAMPLING_HZ 8000 11# elif defined __linux__ 12# define TRACY_SAMPLING_HZ 10000 13# endif 14#endif 15 16namespace tracy 17{ 18 19static constexpr int GetSamplingFrequency() 20{ 21#if defined _WIN32 22 return TRACY_SAMPLING_HZ > 8000 ? 8000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); 23#else 24 return TRACY_SAMPLING_HZ > 1000000 ? 1000000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ ); 25#endif 26} 27 28static constexpr int GetSamplingPeriod() 29{ 30 return 1000000000 / GetSamplingFrequency(); 31} 32 33} 34 35# if defined _WIN32 36 37# ifndef NOMINMAX 38# define NOMINMAX 39# endif 40 41# define INITGUID 42# include <assert.h> 43# include <string.h> 44# include <windows.h> 45# include <dbghelp.h> 46# include <evntrace.h> 47# include <evntcons.h> 48# include <psapi.h> 49# include <winternl.h> 50 51# include "../common/TracyAlloc.hpp" 52# include "../common/TracySystem.hpp" 53# include "TracyProfiler.hpp" 54# include "TracyThread.hpp" 55 56namespace tracy 57{ 58 59static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } }; 60static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } }; 61static const GUID ThreadV2Guid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } }; 62 63 64static TRACEHANDLE s_traceHandle; 65static TRACEHANDLE s_traceHandle2; 66static EVENT_TRACE_PROPERTIES* s_prop; 67static DWORD s_pid; 68 69static EVENT_TRACE_PROPERTIES* s_propVsync; 70static TRACEHANDLE s_traceHandleVsync; 71static TRACEHANDLE s_traceHandleVsync2; 72Thread* s_threadVsync = nullptr; 73 74struct CSwitch 75{ 76 uint32_t newThreadId; 77 uint32_t oldThreadId; 78 int8_t newThreadPriority; 79 int8_t oldThreadPriority; 80 uint8_t previousCState; 81 int8_t spareByte; 82 int8_t oldThreadWaitReason; 83 int8_t oldThreadWaitMode; 84 int8_t oldThreadState; 85 int8_t oldThreadWaitIdealProcessor; 86 uint32_t newThreadWaitTime; 87 uint32_t reserved; 88}; 89 90struct ReadyThread 91{ 92 uint32_t threadId; 93 int8_t adjustReason; 94 int8_t adjustIncrement; 95 int8_t flag; 96 int8_t reserverd; 97}; 98 99struct ThreadTrace 100{ 101 uint32_t processId; 102 uint32_t threadId; 103 uint32_t stackBase; 104 uint32_t stackLimit; 105 uint32_t userStackBase; 106 uint32_t userStackLimit; 107 uint32_t startAddr; 108 uint32_t win32StartAddr; 109 uint32_t tebBase; 110 uint32_t subProcessTag; 111}; 112 113struct StackWalkEvent 114{ 115 uint64_t eventTimeStamp; 116 uint32_t stackProcess; 117 uint32_t stackThread; 118 uint64_t stack[192]; 119}; 120 121struct VSyncInfo 122{ 123 void* dxgAdapter; 124 uint32_t vidPnTargetId; 125 uint64_t scannedPhysicalAddress; 126 uint32_t vidPnSourceId; 127 uint32_t frameNumber; 128 int64_t frameQpcTime; 129 void* hFlipDevice; 130 uint32_t flipType; 131 uint64_t flipFenceId; 132}; 133 134extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG ); 135extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD ); 136extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD ); 137extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD ); 138extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* ); 139 140t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "NtQueryInformationThread" ); 141t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32EnumProcessModules" ); 142t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleInformation" ); 143t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleBaseNameA" ); 144 145static t_GetThreadDescription _GetThreadDescription = 0; 146 147 148void WINAPI EventRecordCallback( PEVENT_RECORD record ) 149{ 150#ifdef TRACY_ON_DEMAND 151 if( !GetProfiler().IsConnected() ) return; 152#endif 153 154 const auto& hdr = record->EventHeader; 155 switch( hdr.ProviderId.Data1 ) 156 { 157 case 0x3d6fa8d1: // Thread Guid 158 if( hdr.EventDescriptor.Opcode == 36 ) 159 { 160 const auto cswitch = (const CSwitch*)record->UserData; 161 162 TracyLfqPrepare( QueueType::ContextSwitch ); 163 MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart ); 164 MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId ); 165 MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId ); 166 MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); 167 MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); 168 MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); 169 TracyLfqCommit; 170 } 171 else if( hdr.EventDescriptor.Opcode == 50 ) 172 { 173 const auto rt = (const ReadyThread*)record->UserData; 174 175 TracyLfqPrepare( QueueType::ThreadWakeup ); 176 MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); 177 MemWrite( &item->threadWakeup.thread, rt->threadId ); 178 TracyLfqCommit; 179 } 180 else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) 181 { 182 const auto tt = (const ThreadTrace*)record->UserData; 183 184 uint64_t tid = tt->threadId; 185 if( tid == 0 ) return; 186 uint64_t pid = tt->processId; 187 TracyLfqPrepare( QueueType::TidToPid ); 188 MemWrite( &item->tidToPid.tid, tid ); 189 MemWrite( &item->tidToPid.pid, pid ); 190 TracyLfqCommit; 191 } 192 break; 193 case 0xdef2fe46: // StackWalk Guid 194 if( hdr.EventDescriptor.Opcode == 32 ) 195 { 196 const auto sw = (const StackWalkEvent*)record->UserData; 197 if( sw->stackProcess == s_pid ) 198 { 199 const uint64_t sz = ( record->UserDataLength - 16 ) / 8; 200 if( sz > 0 ) 201 { 202 auto trace = (uint64_t*)tracy_malloc( ( 1 + sz ) * sizeof( uint64_t ) ); 203 memcpy( trace, &sz, sizeof( uint64_t ) ); 204 memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz ); 205 TracyLfqPrepare( QueueType::CallstackSample ); 206 MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp ); 207 MemWrite( &item->callstackSampleFat.thread, sw->stackThread ); 208 MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); 209 TracyLfqCommit; 210 } 211 } 212 } 213 break; 214 default: 215 break; 216 } 217} 218 219void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record ) 220{ 221#ifdef TRACY_ON_DEMAND 222 if( !GetProfiler().IsConnected() ) return; 223#endif 224 225 const auto& hdr = record->EventHeader; 226 assert( hdr.ProviderId.Data1 == 0x802EC45A ); 227 assert( hdr.EventDescriptor.Id == 0x0011 ); 228 229 const auto vs = (const VSyncInfo*)record->UserData; 230 231 TracyLfqPrepare( QueueType::FrameVsync ); 232 MemWrite( &item->frameVsync.time, hdr.TimeStamp.QuadPart ); 233 MemWrite( &item->frameVsync.id, vs->vidPnTargetId ); 234 TracyLfqCommit; 235} 236 237static void SetupVsync() 238{ 239#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE && !defined(__MINGW32__) 240 const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH; 241 s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); 242 memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); 243 s_propVsync->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; 244 s_propVsync->Wnode.BufferSize = psz; 245#ifdef TRACY_TIMER_QPC 246 s_propVsync->Wnode.ClientContext = 1; 247#else 248 s_propVsync->Wnode.ClientContext = 3; 249#endif 250 s_propVsync->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); 251 strcpy( ((char*)s_propVsync) + sizeof( EVENT_TRACE_PROPERTIES ), "TracyVsync" ); 252 253 auto backup = tracy_malloc( psz ); 254 memcpy( backup, s_propVsync, psz ); 255 256 const auto controlStatus = ControlTraceA( 0, "TracyVsync", s_propVsync, EVENT_TRACE_CONTROL_STOP ); 257 if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) 258 { 259 tracy_free( backup ); 260 tracy_free( s_propVsync ); 261 return; 262 } 263 264 memcpy( s_propVsync, backup, psz ); 265 tracy_free( backup ); 266 267 const auto startStatus = StartTraceA( &s_traceHandleVsync, "TracyVsync", s_propVsync ); 268 if( startStatus != ERROR_SUCCESS ) 269 { 270 tracy_free( s_propVsync ); 271 return; 272 } 273 274 EVENT_FILTER_EVENT_ID fe = {}; 275 fe.FilterIn = TRUE; 276 fe.Count = 1; 277 fe.Events[0] = 0x0011; // VSyncDPC_Info 278 279 EVENT_FILTER_DESCRIPTOR desc = {}; 280 desc.Ptr = (ULONGLONG)&fe; 281 desc.Size = sizeof( fe ); 282 desc.Type = EVENT_FILTER_TYPE_EVENT_ID; 283 284 ENABLE_TRACE_PARAMETERS params = {}; 285 params.Version = ENABLE_TRACE_PARAMETERS_VERSION_2; 286 params.EnableProperty = EVENT_ENABLE_PROPERTY_IGNORE_KEYWORD_0; 287 params.SourceId = s_propVsync->Wnode.Guid; 288 params.EnableFilterDesc = &desc; 289 params.FilterDescCount = 1; 290 291 uint64_t mask = 0x4000000000000001; // Microsoft_Windows_DxgKrnl_Performance | Base 292 if( EnableTraceEx2( s_traceHandleVsync, &DxgKrnlGuid, EVENT_CONTROL_CODE_ENABLE_PROVIDER, TRACE_LEVEL_INFORMATION, mask, mask, 0, &params ) != ERROR_SUCCESS ) 293 { 294 tracy_free( s_propVsync ); 295 return; 296 } 297 298 char loggerName[MAX_PATH]; 299 strcpy( loggerName, "TracyVsync" ); 300 301 EVENT_TRACE_LOGFILEA log = {}; 302 log.LoggerName = loggerName; 303 log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; 304 log.EventRecordCallback = EventRecordCallbackVsync; 305 306 s_traceHandleVsync2 = OpenTraceA( &log ); 307 if( s_traceHandleVsync2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) 308 { 309 CloseTrace( s_traceHandleVsync ); 310 tracy_free( s_propVsync ); 311 return; 312 } 313 314 s_threadVsync = (Thread*)tracy_malloc( sizeof( Thread ) ); 315 new(s_threadVsync) Thread( [] (void*) { 316 ThreadExitHandler threadExitHandler; 317 SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); 318 SetThreadName( "Tracy Vsync" ); 319 ProcessTrace( &s_traceHandleVsync2, 1, nullptr, nullptr ); 320 }, nullptr ); 321#endif 322} 323 324static constexpr int GetSamplingInterval() 325{ 326 return GetSamplingPeriod() / 100; 327} 328 329bool SysTraceStart( int64_t& samplingPeriod ) 330{ 331 if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); 332 333 s_pid = GetCurrentProcessId(); 334 335#if defined _WIN64 336 constexpr bool isOs64Bit = true; 337#else 338 BOOL _iswow64; 339 IsWow64Process( GetCurrentProcess(), &_iswow64 ); 340 const bool isOs64Bit = _iswow64; 341#endif 342 343 TOKEN_PRIVILEGES priv = {}; 344 priv.PrivilegeCount = 1; 345 priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; 346 if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false; 347 348 HANDLE pt; 349 if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false; 350 const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr ); 351 CloseHandle( pt ); 352 if( adjust == 0 ) return false; 353 const auto status = GetLastError(); 354 if( status != ERROR_SUCCESS ) return false; 355 356 if( isOs64Bit ) 357 { 358 TRACE_PROFILE_INTERVAL interval = {}; 359 interval.Interval = GetSamplingInterval(); 360 const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) ); 361 if( intervalStatus != ERROR_SUCCESS ) return false; 362 samplingPeriod = GetSamplingPeriod(); 363 } 364 365 const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME ); 366 s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz ); 367 memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) ); 368 ULONG flags = 0; 369#ifndef TRACY_NO_CONTEXT_SWITCH 370 flags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER | EVENT_TRACE_FLAG_THREAD; 371#endif 372#ifndef TRACY_NO_SAMPLING 373 if( isOs64Bit ) flags |= EVENT_TRACE_FLAG_PROFILE; 374#endif 375 s_prop->EnableFlags = flags; 376 s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE; 377 s_prop->Wnode.BufferSize = psz; 378 s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID; 379#ifdef TRACY_TIMER_QPC 380 s_prop->Wnode.ClientContext = 1; 381#else 382 s_prop->Wnode.ClientContext = 3; 383#endif 384 s_prop->Wnode.Guid = SystemTraceControlGuid; 385 s_prop->BufferSize = 1024; 386 s_prop->MinimumBuffers = std::thread::hardware_concurrency() * 4; 387 s_prop->MaximumBuffers = std::thread::hardware_concurrency() * 6; 388 s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES ); 389 memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); 390 391 auto backup = tracy_malloc( psz ); 392 memcpy( backup, s_prop, psz ); 393 394 const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); 395 if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND ) 396 { 397 tracy_free( backup ); 398 tracy_free( s_prop ); 399 return false; 400 } 401 402 memcpy( s_prop, backup, psz ); 403 tracy_free( backup ); 404 405 const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, s_prop ); 406 if( startStatus != ERROR_SUCCESS ) 407 { 408 tracy_free( s_prop ); 409 return false; 410 } 411 412#ifndef TRACY_NO_SAMPLING 413 if( isOs64Bit ) 414 { 415 CLASSIC_EVENT_ID stackId[2] = {}; 416 stackId[0].EventGuid = PerfInfoGuid; 417 stackId[0].Type = 46; 418 stackId[1].EventGuid = ThreadV2Guid; 419 stackId[1].Type = 36; 420 const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) ); 421 if( stackStatus != ERROR_SUCCESS ) 422 { 423 tracy_free( s_prop ); 424 return false; 425 } 426 } 427#endif 428 429#ifdef UNICODE 430 WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; 431#else 432 char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )]; 433#endif 434 memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) ); 435 EVENT_TRACE_LOGFILE log = {}; 436 log.LoggerName = KernelLoggerName; 437 log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP; 438 log.EventRecordCallback = EventRecordCallback; 439 440 s_traceHandle2 = OpenTrace( &log ); 441 if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE ) 442 { 443 CloseTrace( s_traceHandle ); 444 tracy_free( s_prop ); 445 return false; 446 } 447 448#ifndef TRACY_NO_VSYNC_CAPTURE 449 SetupVsync(); 450#endif 451 452 return true; 453} 454 455void SysTraceStop() 456{ 457 if( s_threadVsync ) 458 { 459 CloseTrace( s_traceHandleVsync2 ); 460 CloseTrace( s_traceHandleVsync ); 461 s_threadVsync->~Thread(); 462 tracy_free( s_threadVsync ); 463 } 464 465 CloseTrace( s_traceHandle2 ); 466 CloseTrace( s_traceHandle ); 467} 468 469void SysTraceWorker( void* ptr ) 470{ 471 ThreadExitHandler threadExitHandler; 472 SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL ); 473 SetThreadName( "Tracy SysTrace" ); 474 ProcessTrace( &s_traceHandle2, 1, 0, 0 ); 475 ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP ); 476 tracy_free( s_prop ); 477} 478 479void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ) 480{ 481 bool threadSent = false; 482 auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) ); 483 if( hnd == 0 ) 484 { 485 hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) ); 486 } 487 if( hnd != 0 ) 488 { 489 if( _GetThreadDescription ) 490 { 491 PWSTR tmp; 492 _GetThreadDescription( hnd, &tmp ); 493 char buf[256]; 494 if( tmp ) 495 { 496 auto ret = wcstombs( buf, tmp, 256 ); 497 if( ret != 0 ) 498 { 499 threadName = CopyString( buf, ret ); 500 threadSent = true; 501 } 502 } 503 } 504 const auto pid = GetProcessIdOfThread( hnd ); 505 if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA ) 506 { 507 void* ptr; 508 ULONG retlen; 509 auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen ); 510 if( status == 0 ) 511 { 512 const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); 513 if( phnd != INVALID_HANDLE_VALUE ) 514 { 515 HMODULE modules[1024]; 516 DWORD needed; 517 if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) 518 { 519 const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); 520 for( DWORD i=0; i<sz; i++ ) 521 { 522 MODULEINFO info; 523 if( _GetModuleInformation( phnd, modules[i], &info, sizeof( info ) ) != 0 ) 524 { 525 if( (uint64_t)ptr >= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) 526 { 527 char buf2[1024]; 528 const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); 529 if( modlen != 0 ) 530 { 531 threadName = CopyString( buf2, modlen ); 532 threadSent = true; 533 } 534 } 535 } 536 } 537 } 538 CloseHandle( phnd ); 539 } 540 } 541 } 542 CloseHandle( hnd ); 543 if( !threadSent ) 544 { 545 threadName = CopyString( "???", 3 ); 546 threadSent = true; 547 } 548 if( pid != 0 ) 549 { 550 { 551 uint64_t _pid = pid; 552 TracyLfqPrepare( QueueType::TidToPid ); 553 MemWrite( &item->tidToPid.tid, thread ); 554 MemWrite( &item->tidToPid.pid, _pid ); 555 TracyLfqCommit; 556 } 557 if( pid == 4 ) 558 { 559 name = CopyStringFast( "System", 6 ); 560 return; 561 } 562 else 563 { 564 const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid ); 565 if( phnd != INVALID_HANDLE_VALUE ) 566 { 567 char buf2[1024]; 568 const auto sz = GetProcessImageFileNameA( phnd, buf2, 1024 ); 569 CloseHandle( phnd ); 570 if( sz != 0 ) 571 { 572 auto ptr = buf2 + sz - 1; 573 while( ptr > buf2 && *ptr != '\\' ) ptr--; 574 if( *ptr == '\\' ) ptr++; 575 name = CopyStringFast( ptr ); 576 return; 577 } 578 } 579 } 580 } 581 } 582 583 if( !threadSent ) 584 { 585 threadName = CopyString( "???", 3 ); 586 } 587 name = CopyStringFast( "???", 3 ); 588} 589 590} 591 592# elif defined __linux__ 593 594# include <sys/types.h> 595# include <sys/stat.h> 596# include <sys/wait.h> 597# include <fcntl.h> 598# include <inttypes.h> 599# include <limits> 600# include <poll.h> 601# include <stdio.h> 602# include <stdlib.h> 603# include <string.h> 604# include <unistd.h> 605# include <atomic> 606# include <thread> 607# include <linux/perf_event.h> 608# include <linux/version.h> 609# include <sys/mman.h> 610# include <sys/ioctl.h> 611# include <sys/syscall.h> 612 613# if defined __i386 || defined __x86_64__ 614# include "TracyCpuid.hpp" 615# endif 616 617# include "TracyProfiler.hpp" 618# include "TracyRingBuffer.hpp" 619# include "TracyThread.hpp" 620 621namespace tracy 622{ 623 624static std::atomic<bool> traceActive { false }; 625static int s_numCpus = 0; 626static int s_numBuffers = 0; 627static int s_ctxBufferIdx = 0; 628 629static RingBuffer* s_ring = nullptr; 630 631static const int ThreadHashSize = 4 * 1024; 632static uint32_t s_threadHash[ThreadHashSize] = {}; 633 634static bool CurrentProcOwnsThread( uint32_t tid ) 635{ 636 const auto hash = tid & ( ThreadHashSize-1 ); 637 const auto hv = s_threadHash[hash]; 638 if( hv == tid ) return true; 639 if( hv == -tid ) return false; 640 641 char path[256]; 642 sprintf( path, "/proc/self/task/%d", tid ); 643 struct stat st; 644 if( stat( path, &st ) == 0 ) 645 { 646 s_threadHash[hash] = tid; 647 return true; 648 } 649 else 650 { 651 s_threadHash[hash] = -tid; 652 return false; 653 } 654} 655 656static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) 657{ 658 return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); 659} 660 661enum TraceEventId 662{ 663 EventCallstack, 664 EventCpuCycles, 665 EventInstructionsRetired, 666 EventCacheReference, 667 EventCacheMiss, 668 EventBranchRetired, 669 EventBranchMiss, 670 EventVsync, 671 EventContextSwitch, 672 EventWakeup, 673}; 674 675static void ProbePreciseIp( perf_event_attr& pe, unsigned long long config0, unsigned long long config1, pid_t pid ) 676{ 677 pe.config = config1; 678 pe.precise_ip = 3; 679 while( pe.precise_ip != 0 ) 680 { 681 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); 682 if( fd != -1 ) 683 { 684 close( fd ); 685 break; 686 } 687 pe.precise_ip--; 688 } 689 pe.config = config0; 690 while( pe.precise_ip != 0 ) 691 { 692 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); 693 if( fd != -1 ) 694 { 695 close( fd ); 696 break; 697 } 698 pe.precise_ip--; 699 } 700 TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); 701} 702 703static void ProbePreciseIp( perf_event_attr& pe, pid_t pid ) 704{ 705 pe.precise_ip = 3; 706 while( pe.precise_ip != 0 ) 707 { 708 const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC ); 709 if( fd != -1 ) 710 { 711 close( fd ); 712 break; 713 } 714 pe.precise_ip--; 715 } 716 TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip ); 717} 718 719static bool IsGenuineIntel() 720{ 721#if defined __i386 || defined __x86_64__ 722 uint32_t regs[4] = {}; 723 __get_cpuid( 0, regs, regs+1, regs+2, regs+3 ); 724 char manufacturer[12]; 725 memcpy( manufacturer, regs+1, 4 ); 726 memcpy( manufacturer+4, regs+3, 4 ); 727 memcpy( manufacturer+8, regs+2, 4 ); 728 return memcmp( manufacturer, "GenuineIntel", 12 ) == 0; 729#else 730 return false; 731#endif 732} 733 734static const char* ReadFile( const char* path ) 735{ 736 int fd = open( path, O_RDONLY ); 737 if( fd < 0 ) return nullptr; 738 739 static char tmp[64]; 740 const auto cnt = read( fd, tmp, 63 ); 741 close( fd ); 742 if( cnt < 0 ) return nullptr; 743 tmp[cnt] = '\0'; 744 return tmp; 745} 746 747bool SysTraceStart( int64_t& samplingPeriod ) 748{ 749#ifndef CLOCK_MONOTONIC_RAW 750 return false; 751#endif 752 753 const auto paranoidLevelStr = ReadFile( "/proc/sys/kernel/perf_event_paranoid" ); 754 if( !paranoidLevelStr ) return false; 755#ifdef TRACY_VERBOSE 756 int paranoidLevel = 2; 757 paranoidLevel = atoi( paranoidLevelStr ); 758 TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel ); 759#endif 760 761 int switchId = -1, wakeupId = -1, vsyncId = -1; 762 const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" ); 763 if( switchIdStr ) switchId = atoi( switchIdStr ); 764 const auto wakeupIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" ); 765 if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr ); 766 const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" ); 767 if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr ); 768 769 TracyDebug( "sched_switch id: %i\n", switchId ); 770 TracyDebug( "sched_wakeup id: %i\n", wakeupId ); 771 TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); 772 773#ifdef TRACY_NO_SAMPLE_RETIREMENT 774 const bool noRetirement = true; 775#else 776 const char* noRetirementEnv = GetEnvVar( "TRACY_NO_SAMPLE_RETIREMENT" ); 777 const bool noRetirement = noRetirementEnv && noRetirementEnv[0] == '1'; 778#endif 779 780#ifdef TRACY_NO_SAMPLE_CACHE 781 const bool noCache = true; 782#else 783 const char* noCacheEnv = GetEnvVar( "TRACY_NO_SAMPLE_CACHE" ); 784 const bool noCache = noCacheEnv && noCacheEnv[0] == '1'; 785#endif 786 787#ifdef TRACY_NO_SAMPLE_BRANCH 788 const bool noBranch = true; 789#else 790 const char* noBranchEnv = GetEnvVar( "TRACY_NO_SAMPLE_BRANCH" ); 791 const bool noBranch = noBranchEnv && noBranchEnv[0] == '1'; 792#endif 793 794#ifdef TRACY_NO_CONTEXT_SWITCH 795 const bool noCtxSwitch = true; 796#else 797 const char* noCtxSwitchEnv = GetEnvVar( "TRACY_NO_CONTEXT_SWITCH" ); 798 const bool noCtxSwitch = noCtxSwitchEnv && noCtxSwitchEnv[0] == '1'; 799#endif 800 801#ifdef TRACY_NO_VSYNC_CAPTURE 802 const bool noVsync = true; 803#else 804 const char* noVsyncEnv = GetEnvVar( "TRACY_NO_VSYNC_CAPTURE" ); 805 const bool noVsync = noVsyncEnv && noVsyncEnv[0] == '1'; 806#endif 807 808 samplingPeriod = GetSamplingPeriod(); 809 uint32_t currentPid = (uint32_t)getpid(); 810 811 s_numCpus = (int)std::thread::hardware_concurrency(); 812 813 const auto maxNumBuffers = s_numCpus * ( 814 1 + // software sampling 815 2 + // CPU cycles + instructions retired 816 2 + // cache reference + miss 817 2 + // branch retired + miss 818 2 + // context switches + wakeups 819 1 // vsync 820 ); 821 s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers ); 822 s_numBuffers = 0; 823 824 // software sampling 825 perf_event_attr pe = {}; 826 pe.type = PERF_TYPE_SOFTWARE; 827 pe.size = sizeof( perf_event_attr ); 828 pe.config = PERF_COUNT_SW_CPU_CLOCK; 829 pe.sample_freq = GetSamplingFrequency(); 830 pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; 831#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) 832 pe.sample_max_stack = 127; 833#endif 834 pe.disabled = 1; 835 pe.freq = 1; 836 pe.inherit = 1; 837#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 838 pe.use_clockid = 1; 839 pe.clockid = CLOCK_MONOTONIC_RAW; 840#endif 841 842 TracyDebug( "Setup software sampling\n" ); 843 ProbePreciseIp( pe, currentPid ); 844 for( int i=0; i<s_numCpus; i++ ) 845 { 846 int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 847 if( fd == -1 ) 848 { 849 pe.exclude_kernel = 1; 850 ProbePreciseIp( pe, currentPid ); 851 fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 852 if( fd == -1 ) 853 { 854 TracyDebug( " Failed to setup!\n"); 855 break; 856 } 857 TracyDebug( " No access to kernel samples\n" ); 858 } 859 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack ); 860 if( s_ring[s_numBuffers].IsValid() ) 861 { 862 s_numBuffers++; 863 TracyDebug( " Core %i ok\n", i ); 864 } 865 } 866 867 // CPU cycles + instructions retired 868 pe = {}; 869 pe.type = PERF_TYPE_HARDWARE; 870 pe.size = sizeof( perf_event_attr ); 871 pe.sample_freq = 5000; 872 pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TIME; 873 pe.disabled = 1; 874 pe.exclude_kernel = 1; 875 pe.exclude_guest = 1; 876 pe.exclude_hv = 1; 877 pe.freq = 1; 878 pe.inherit = 1; 879#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 880 pe.use_clockid = 1; 881 pe.clockid = CLOCK_MONOTONIC_RAW; 882#endif 883 884 if( !noRetirement ) 885 { 886 TracyDebug( "Setup sampling cycles + retirement\n" ); 887 ProbePreciseIp( pe, PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, currentPid ); 888 for( int i=0; i<s_numCpus; i++ ) 889 { 890 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 891 if( fd != -1 ) 892 { 893 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCpuCycles ); 894 if( s_ring[s_numBuffers].IsValid() ) 895 { 896 s_numBuffers++; 897 TracyDebug( " Core %i ok\n", i ); 898 } 899 } 900 } 901 902 pe.config = PERF_COUNT_HW_INSTRUCTIONS; 903 for( int i=0; i<s_numCpus; i++ ) 904 { 905 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 906 if( fd != -1 ) 907 { 908 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventInstructionsRetired ); 909 if( s_ring[s_numBuffers].IsValid() ) 910 { 911 s_numBuffers++; 912 TracyDebug( " Core %i ok\n", i ); 913 } 914 } 915 } 916 } 917 918 // cache reference + miss 919 if( !noCache ) 920 { 921 TracyDebug( "Setup sampling CPU cache references + misses\n" ); 922 ProbePreciseIp( pe, PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES, currentPid ); 923 if( IsGenuineIntel() ) 924 { 925 pe.precise_ip = 0; 926 TracyDebug( " CPU is GenuineIntel, forcing precise_ip down to 0\n" ); 927 } 928 for( int i=0; i<s_numCpus; i++ ) 929 { 930 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 931 if( fd != -1 ) 932 { 933 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheReference ); 934 if( s_ring[s_numBuffers].IsValid() ) 935 { 936 s_numBuffers++; 937 TracyDebug( " Core %i ok\n", i ); 938 } 939 } 940 } 941 942 pe.config = PERF_COUNT_HW_CACHE_MISSES; 943 for( int i=0; i<s_numCpus; i++ ) 944 { 945 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 946 if( fd != -1 ) 947 { 948 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheMiss ); 949 if( s_ring[s_numBuffers].IsValid() ) 950 { 951 s_numBuffers++; 952 TracyDebug( " Core %i ok\n", i ); 953 } 954 } 955 } 956 } 957 958 // branch retired + miss 959 if( !noBranch ) 960 { 961 TracyDebug( "Setup sampling CPU branch retirements + misses\n" ); 962 ProbePreciseIp( pe, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, currentPid ); 963 for( int i=0; i<s_numCpus; i++ ) 964 { 965 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 966 if( fd != -1 ) 967 { 968 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchRetired ); 969 if( s_ring[s_numBuffers].IsValid() ) 970 { 971 s_numBuffers++; 972 TracyDebug( " Core %i ok\n", i ); 973 } 974 } 975 } 976 977 pe.config = PERF_COUNT_HW_BRANCH_MISSES; 978 for( int i=0; i<s_numCpus; i++ ) 979 { 980 const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); 981 if( fd != -1 ) 982 { 983 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchMiss ); 984 if( s_ring[s_numBuffers].IsValid() ) 985 { 986 s_numBuffers++; 987 TracyDebug( " Core %i ok\n", i ); 988 } 989 } 990 } 991 } 992 993 s_ctxBufferIdx = s_numBuffers; 994 995 // vsync 996 if( !noVsync && vsyncId != -1 ) 997 { 998 pe = {}; 999 pe.type = PERF_TYPE_TRACEPOINT; 1000 pe.size = sizeof( perf_event_attr ); 1001 pe.sample_period = 1; 1002 pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW; 1003 pe.disabled = 1; 1004 pe.config = vsyncId; 1005#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 1006 pe.use_clockid = 1; 1007 pe.clockid = CLOCK_MONOTONIC_RAW; 1008#endif 1009 1010 TracyDebug( "Setup vsync capture\n" ); 1011 for( int i=0; i<s_numCpus; i++ ) 1012 { 1013 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC ); 1014 if( fd != -1 ) 1015 { 1016 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventVsync, i ); 1017 if( s_ring[s_numBuffers].IsValid() ) 1018 { 1019 s_numBuffers++; 1020 TracyDebug( " Core %i ok\n", i ); 1021 } 1022 } 1023 } 1024 } 1025 1026 // context switches 1027 if( !noCtxSwitch && switchId != -1 ) 1028 { 1029 pe = {}; 1030 pe.type = PERF_TYPE_TRACEPOINT; 1031 pe.size = sizeof( perf_event_attr ); 1032 pe.sample_period = 1; 1033 pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN; 1034#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) 1035 pe.sample_max_stack = 127; 1036#endif 1037 pe.disabled = 1; 1038 pe.inherit = 1; 1039 pe.config = switchId; 1040#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 1041 pe.use_clockid = 1; 1042 pe.clockid = CLOCK_MONOTONIC_RAW; 1043#endif 1044 1045 TracyDebug( "Setup context switch capture\n" ); 1046 for( int i=0; i<s_numCpus; i++ ) 1047 { 1048 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC ); 1049 if( fd != -1 ) 1050 { 1051 new( s_ring+s_numBuffers ) RingBuffer( 256*1024, fd, EventContextSwitch, i ); 1052 if( s_ring[s_numBuffers].IsValid() ) 1053 { 1054 s_numBuffers++; 1055 TracyDebug( " Core %i ok\n", i ); 1056 } 1057 } 1058 } 1059 1060 if( wakeupId != -1 ) 1061 { 1062 pe.config = wakeupId; 1063 pe.config &= ~PERF_SAMPLE_CALLCHAIN; 1064 1065 TracyDebug( "Setup wakeup capture\n" ); 1066 for( int i=0; i<s_numCpus; i++ ) 1067 { 1068 const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC ); 1069 if( fd != -1 ) 1070 { 1071 new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventWakeup, i ); 1072 if( s_ring[s_numBuffers].IsValid() ) 1073 { 1074 s_numBuffers++; 1075 TracyDebug( " Core %i ok\n", i ); 1076 } 1077 } 1078 } 1079 } 1080 } 1081 1082 TracyDebug( "Ringbuffers in use: %i\n", s_numBuffers ); 1083 1084 traceActive.store( true, std::memory_order_relaxed ); 1085 return true; 1086} 1087 1088void SysTraceStop() 1089{ 1090 traceActive.store( false, std::memory_order_relaxed ); 1091} 1092 1093static uint64_t* GetCallstackBlock( uint64_t cnt, RingBuffer& ring, uint64_t offset ) 1094{ 1095 auto trace = (uint64_t*)tracy_malloc_fast( ( 1 + cnt ) * sizeof( uint64_t ) ); 1096 ring.Read( trace+1, offset, sizeof( uint64_t ) * cnt ); 1097 1098#if defined __x86_64__ || defined _M_X64 1099 // remove non-canonical pointers 1100 do 1101 { 1102 const auto test = (int64_t)trace[cnt]; 1103 const auto m1 = test >> 63; 1104 const auto m2 = test >> 47; 1105 if( m1 == m2 ) break; 1106 } 1107 while( --cnt > 0 ); 1108 for( uint64_t j=1; j<cnt; j++ ) 1109 { 1110 const auto test = (int64_t)trace[j]; 1111 const auto m1 = test >> 63; 1112 const auto m2 = test >> 47; 1113 if( m1 != m2 ) trace[j] = 0; 1114 } 1115#endif 1116 1117 for( uint64_t j=1; j<=cnt; j++ ) 1118 { 1119 if( trace[j] >= (uint64_t)-4095 ) // PERF_CONTEXT_MAX 1120 { 1121 memmove( trace+j, trace+j+1, sizeof( uint64_t ) * ( cnt - j ) ); 1122 cnt--; 1123 } 1124 } 1125 1126 memcpy( trace, &cnt, sizeof( uint64_t ) ); 1127 return trace; 1128} 1129 1130void SysTraceWorker( void* ptr ) 1131{ 1132 ThreadExitHandler threadExitHandler; 1133 SetThreadName( "Tracy Sampling" ); 1134 InitRpmalloc(); 1135 sched_param sp = { 99 }; 1136 if( pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp ) != 0 ) { 1137 TracyDebug( "Failed to increase SysTraceWorker thread priority!\n" ); 1138 } 1139 auto ctxBufferIdx = s_ctxBufferIdx; 1140 auto ringArray = s_ring; 1141 auto numBuffers = s_numBuffers; 1142 for( int i=0; i<numBuffers; i++ ) ringArray[i].Enable(); 1143 for(;;) 1144 { 1145#ifdef TRACY_ON_DEMAND 1146 if( !GetProfiler().IsConnected() ) 1147 { 1148 if( !traceActive.load( std::memory_order_relaxed ) ) break; 1149 for( int i=0; i<numBuffers; i++ ) 1150 { 1151 auto& ring = ringArray[i]; 1152 const auto head = ring.LoadHead(); 1153 const auto tail = ring.GetTail(); 1154 if( head != tail ) 1155 { 1156 const auto end = head - tail; 1157 ring.Advance( end ); 1158 } 1159 } 1160 if( !traceActive.load( std::memory_order_relaxed ) ) break; 1161 std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) ); 1162 continue; 1163 } 1164#endif 1165 1166 bool hadData = false; 1167 for( int i=0; i<ctxBufferIdx; i++ ) 1168 { 1169 if( !traceActive.load( std::memory_order_relaxed ) ) break; 1170 auto& ring = ringArray[i]; 1171 const auto head = ring.LoadHead(); 1172 const auto tail = ring.GetTail(); 1173 if( head == tail ) continue; 1174 assert( head > tail ); 1175 hadData = true; 1176 1177 const auto id = ring.GetId(); 1178 assert( id != EventContextSwitch ); 1179 const auto end = head - tail; 1180 uint64_t pos = 0; 1181 if( id == EventCallstack ) 1182 { 1183 while( pos < end ) 1184 { 1185 perf_event_header hdr; 1186 ring.Read( &hdr, pos, sizeof( perf_event_header ) ); 1187 if( hdr.type == PERF_RECORD_SAMPLE ) 1188 { 1189 auto offset = pos + sizeof( perf_event_header ); 1190 1191 // Layout: 1192 // u32 pid, tid 1193 // u64 time 1194 // u64 cnt 1195 // u64 ip[cnt] 1196 1197 uint32_t tid; 1198 uint64_t t0; 1199 uint64_t cnt; 1200 1201 offset += sizeof( uint32_t ); 1202 ring.Read( &tid, offset, sizeof( uint32_t ) ); 1203 offset += sizeof( uint32_t ); 1204 ring.Read( &t0, offset, sizeof( uint64_t ) ); 1205 offset += sizeof( uint64_t ); 1206 ring.Read( &cnt, offset, sizeof( uint64_t ) ); 1207 offset += sizeof( uint64_t ); 1208 1209 if( cnt > 0 ) 1210 { 1211#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 1212 t0 = ring.ConvertTimeToTsc( t0 ); 1213#endif 1214 auto trace = GetCallstackBlock( cnt, ring, offset ); 1215 1216 TracyLfqPrepare( QueueType::CallstackSample ); 1217 MemWrite( &item->callstackSampleFat.time, t0 ); 1218 MemWrite( &item->callstackSampleFat.thread, tid ); 1219 MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); 1220 TracyLfqCommit; 1221 } 1222 } 1223 pos += hdr.size; 1224 } 1225 } 1226 else 1227 { 1228 while( pos < end ) 1229 { 1230 perf_event_header hdr; 1231 ring.Read( &hdr, pos, sizeof( perf_event_header ) ); 1232 if( hdr.type == PERF_RECORD_SAMPLE ) 1233 { 1234 auto offset = pos + sizeof( perf_event_header ); 1235 1236 // Layout: 1237 // u64 ip 1238 // u64 time 1239 1240 uint64_t ip, t0; 1241 ring.Read( &ip, offset, sizeof( uint64_t ) ); 1242 offset += sizeof( uint64_t ); 1243 ring.Read( &t0, offset, sizeof( uint64_t ) ); 1244 1245#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 1246 t0 = ring.ConvertTimeToTsc( t0 ); 1247#endif 1248 QueueType type; 1249 switch( id ) 1250 { 1251 case EventCpuCycles: 1252 type = QueueType::HwSampleCpuCycle; 1253 break; 1254 case EventInstructionsRetired: 1255 type = QueueType::HwSampleInstructionRetired; 1256 break; 1257 case EventCacheReference: 1258 type = QueueType::HwSampleCacheReference; 1259 break; 1260 case EventCacheMiss: 1261 type = QueueType::HwSampleCacheMiss; 1262 break; 1263 case EventBranchRetired: 1264 type = QueueType::HwSampleBranchRetired; 1265 break; 1266 case EventBranchMiss: 1267 type = QueueType::HwSampleBranchMiss; 1268 break; 1269 default: 1270 abort(); 1271 } 1272 1273 TracyLfqPrepare( type ); 1274 MemWrite( &item->hwSample.ip, ip ); 1275 MemWrite( &item->hwSample.time, t0 ); 1276 TracyLfqCommit; 1277 } 1278 pos += hdr.size; 1279 } 1280 } 1281 assert( pos == end ); 1282 ring.Advance( end ); 1283 } 1284 if( !traceActive.load( std::memory_order_relaxed ) ) break; 1285 1286 if( ctxBufferIdx != numBuffers ) 1287 { 1288 const auto ctxBufNum = numBuffers - ctxBufferIdx; 1289 1290 int activeNum = 0; 1291 uint16_t active[512]; 1292 uint32_t end[512]; 1293 uint32_t pos[512]; 1294 for( int i=0; i<ctxBufNum; i++ ) 1295 { 1296 const auto rbIdx = ctxBufferIdx + i; 1297 const auto rbHead = ringArray[rbIdx].LoadHead(); 1298 const auto rbTail = ringArray[rbIdx].GetTail(); 1299 const auto rbActive = rbHead != rbTail; 1300 1301 if( rbActive ) 1302 { 1303 active[activeNum] = (uint16_t)i; 1304 activeNum++; 1305 end[i] = rbHead - rbTail; 1306 pos[i] = 0; 1307 } 1308 else 1309 { 1310 end[i] = 0; 1311 } 1312 } 1313 if( activeNum > 0 ) 1314 { 1315 hadData = true; 1316 while( activeNum > 0 ) 1317 { 1318 int sel = -1; 1319 int selPos; 1320 int64_t t0 = std::numeric_limits<int64_t>::max(); 1321 for( int i=0; i<activeNum; i++ ) 1322 { 1323 auto idx = active[i]; 1324 auto rbPos = pos[idx]; 1325 assert( rbPos < end[idx] ); 1326 const auto rbIdx = ctxBufferIdx + idx; 1327 perf_event_header hdr; 1328 ringArray[rbIdx].Read( &hdr, rbPos, sizeof( perf_event_header ) ); 1329 if( hdr.type == PERF_RECORD_SAMPLE ) 1330 { 1331 int64_t rbTime; 1332 ringArray[rbIdx].Read( &rbTime, rbPos + sizeof( perf_event_header ), sizeof( int64_t ) ); 1333 if( rbTime < t0 ) 1334 { 1335 t0 = rbTime; 1336 sel = idx; 1337 selPos = i; 1338 } 1339 } 1340 else 1341 { 1342 rbPos += hdr.size; 1343 if( rbPos == end[idx] ) 1344 { 1345 memmove( active+i, active+i+1, sizeof(*active) * ( activeNum - i - 1 ) ); 1346 activeNum--; 1347 i--; 1348 } 1349 else 1350 { 1351 pos[idx] = rbPos; 1352 } 1353 } 1354 } 1355 if( sel >= 0 ) 1356 { 1357 auto& ring = ringArray[ctxBufferIdx + sel]; 1358 auto rbPos = pos[sel]; 1359 auto offset = rbPos; 1360 perf_event_header hdr; 1361 ring.Read( &hdr, offset, sizeof( perf_event_header ) ); 1362 1363#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) 1364 t0 = ring.ConvertTimeToTsc( t0 ); 1365#endif 1366 1367 const auto rid = ring.GetId(); 1368 if( rid == EventContextSwitch ) 1369 { 1370 // Layout: 1371 // u64 time 1372 // u64 cnt 1373 // u64 ip[cnt] 1374 // u32 size 1375 // u8 data[size] 1376 // Data (not ABI stable, but has not changed since it was added, in 2009): 1377 // u8 hdr[8] 1378 // u8 prev_comm[16] 1379 // u32 prev_pid 1380 // u32 prev_prio 1381 // lng prev_state 1382 // u8 next_comm[16] 1383 // u32 next_pid 1384 // u32 next_prio 1385 1386 offset += sizeof( perf_event_header ) + sizeof( uint64_t ); 1387 1388 uint64_t cnt; 1389 ring.Read( &cnt, offset, sizeof( uint64_t ) ); 1390 offset += sizeof( uint64_t ); 1391 const auto traceOffset = offset; 1392 offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16; 1393 1394 uint32_t prev_pid, next_pid; 1395 long prev_state; 1396 1397 ring.Read( &prev_pid, offset, sizeof( uint32_t ) ); 1398 offset += sizeof( uint32_t ) + sizeof( uint32_t ); 1399 ring.Read( &prev_state, offset, sizeof( long ) ); 1400 offset += sizeof( long ) + 16; 1401 ring.Read( &next_pid, offset, sizeof( uint32_t ) ); 1402 1403 uint8_t reason = 100; 1404 uint8_t state; 1405 1406 if( prev_state & 0x0001 ) state = 104; 1407 else if( prev_state & 0x0002 ) state = 101; 1408 else if( prev_state & 0x0004 ) state = 105; 1409 else if( prev_state & 0x0008 ) state = 106; 1410 else if( prev_state & 0x0010 ) state = 108; 1411 else if( prev_state & 0x0020 ) state = 109; 1412 else if( prev_state & 0x0040 ) state = 110; 1413 else if( prev_state & 0x0080 ) state = 102; 1414 else state = 103; 1415 1416 TracyLfqPrepare( QueueType::ContextSwitch ); 1417 MemWrite( &item->contextSwitch.time, t0 ); 1418 MemWrite( &item->contextSwitch.oldThread, prev_pid ); 1419 MemWrite( &item->contextSwitch.newThread, next_pid ); 1420 MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) ); 1421 MemWrite( &item->contextSwitch.reason, reason ); 1422 MemWrite( &item->contextSwitch.state, state ); 1423 TracyLfqCommit; 1424 1425 if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) ) 1426 { 1427 auto trace = GetCallstackBlock( cnt, ring, traceOffset ); 1428 1429 TracyLfqPrepare( QueueType::CallstackSampleContextSwitch ); 1430 MemWrite( &item->callstackSampleFat.time, t0 ); 1431 MemWrite( &item->callstackSampleFat.thread, prev_pid ); 1432 MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); 1433 TracyLfqCommit; 1434 } 1435 } 1436 else if( rid == EventWakeup ) 1437 { 1438 // Layout: 1439 // u64 time 1440 // u32 size 1441 // u8 data[size] 1442 // Data: 1443 // u8 hdr[8] 1444 // u8 comm[16] 1445 // u32 pid 1446 // u32 prio 1447 // u64 target_cpu 1448 1449 offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16; 1450 1451 uint32_t pid; 1452 ring.Read( &pid, offset, sizeof( uint32_t ) ); 1453 1454 TracyLfqPrepare( QueueType::ThreadWakeup ); 1455 MemWrite( &item->threadWakeup.time, t0 ); 1456 MemWrite( &item->threadWakeup.thread, pid ); 1457 TracyLfqCommit; 1458 } 1459 else 1460 { 1461 assert( rid == EventVsync ); 1462 // Layout: 1463 // u64 time 1464 // u32 size 1465 // u8 data[size] 1466 // Data (not ABI stable): 1467 // u8 hdr[8] 1468 // i32 crtc 1469 // u32 seq 1470 // i64 ktime 1471 // u8 high precision 1472 1473 offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8; 1474 1475 int32_t crtc; 1476 ring.Read( &crtc, offset, sizeof( int32_t ) ); 1477 1478 // Note: The timestamp value t0 might be off by a number of microseconds from the 1479 // true hardware vblank event. The ktime value should be used instead, but it is 1480 // measured in CLOCK_MONOTONIC time. Tracy only supports the timestamp counter 1481 // register (TSC) or CLOCK_MONOTONIC_RAW clock. 1482#if 0 1483 offset += sizeof( uint32_t ) * 2; 1484 int64_t ktime; 1485 ring.Read( &ktime, offset, sizeof( int64_t ) ); 1486#endif 1487 1488 TracyLfqPrepare( QueueType::FrameVsync ); 1489 MemWrite( &item->frameVsync.id, crtc ); 1490 MemWrite( &item->frameVsync.time, t0 ); 1491 TracyLfqCommit; 1492 } 1493 1494 rbPos += hdr.size; 1495 if( rbPos == end[sel] ) 1496 { 1497 memmove( active+selPos, active+selPos+1, sizeof(*active) * ( activeNum - selPos - 1 ) ); 1498 activeNum--; 1499 } 1500 else 1501 { 1502 pos[sel] = rbPos; 1503 } 1504 } 1505 } 1506 for( int i=0; i<ctxBufNum; i++ ) 1507 { 1508 if( end[i] != 0 ) ringArray[ctxBufferIdx + i].Advance( end[i] ); 1509 } 1510 } 1511 } 1512 if( !traceActive.load( std::memory_order_relaxed ) ) break; 1513 if( !hadData ) 1514 { 1515 std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); 1516 } 1517 } 1518 1519 for( int i=0; i<numBuffers; i++ ) ringArray[i].~RingBuffer(); 1520 tracy_free_fast( ringArray ); 1521} 1522 1523void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name ) 1524{ 1525 FILE* f; 1526 char fn[256]; 1527 sprintf( fn, "/proc/%" PRIu64 "/comm", thread ); 1528 f = fopen( fn, "rb" ); 1529 if( f ) 1530 { 1531 char buf[256]; 1532 const auto sz = fread( buf, 1, 256, f ); 1533 if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; 1534 threadName = CopyString( buf ); 1535 fclose( f ); 1536 } 1537 else 1538 { 1539 threadName = CopyString( "???", 3 ); 1540 } 1541 1542 sprintf( fn, "/proc/%" PRIu64 "/status", thread ); 1543 f = fopen( fn, "rb" ); 1544 if( f ) 1545 { 1546 char* tmp = (char*)tracy_malloc_fast( 8*1024 ); 1547 const auto fsz = (ptrdiff_t)fread( tmp, 1, 8*1024, f ); 1548 fclose( f ); 1549 1550 int pid = -1; 1551 auto line = tmp; 1552 for(;;) 1553 { 1554 if( memcmp( "Tgid:\t", line, 6 ) == 0 ) 1555 { 1556 pid = atoi( line + 6 ); 1557 break; 1558 } 1559 while( line - tmp < fsz && *line != '\n' ) line++; 1560 if( *line != '\n' ) break; 1561 line++; 1562 } 1563 tracy_free_fast( tmp ); 1564 1565 if( pid >= 0 ) 1566 { 1567 { 1568 uint64_t _pid = pid; 1569 TracyLfqPrepare( QueueType::TidToPid ); 1570 MemWrite( &item->tidToPid.tid, thread ); 1571 MemWrite( &item->tidToPid.pid, _pid ); 1572 TracyLfqCommit; 1573 } 1574 sprintf( fn, "/proc/%i/comm", pid ); 1575 f = fopen( fn, "rb" ); 1576 if( f ) 1577 { 1578 char buf[256]; 1579 const auto sz = fread( buf, 1, 256, f ); 1580 if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0'; 1581 name = CopyStringFast( buf ); 1582 fclose( f ); 1583 return; 1584 } 1585 } 1586 } 1587 name = CopyStringFast( "???", 3 ); 1588} 1589 1590} 1591 1592# endif 1593 1594#endif