mirror of https://github.com/wolfpld/tracy.git
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
557 lines
24 KiB
C++
557 lines
24 KiB
C++
#include "../server/tracy_robin_hood.h"
#include "TracyProfiler.hpp"
#include "TracyThread.hpp"
#include "tracy/TracyC.h"
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>

#include <atomic>
#include <iostream>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <sstream>
#include <time.h>
#include <unordered_map>
#include <vector>
|
|
|
|
// Evaluates a rocprofiler API call exactly once and checks its status.
//   result - expression yielding a rocprofiler_status_t
//   msg    - short description of the attempted operation, streamed into the diagnostics
// On any status other than ROCPROFILER_STATUS_SUCCESS the macro prints a diagnostic to
// std::cerr and throws std::runtime_error carrying the call text, location and status string.
// The body is wrapped in do { } while( 0 ) so that `ROCPROFILER_CALL( ... );` forms a single
// statement and remains valid inside an unbraced if/else.
#define ROCPROFILER_CALL( result, msg ) \
    do \
    { \
        rocprofiler_status_t CHECKSTATUS = ( result ); \
        if( CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS ) \
        { \
            std::string status_msg = rocprofiler_get_status_string( CHECKSTATUS ); \
            std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \
                      << " failed with error code " << CHECKSTATUS << ": " << status_msg << std::endl; \
            std::stringstream errmsg{}; \
            errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg << " failure (" \
                   << status_msg << ")"; \
            throw std::runtime_error( errmsg.str() ); \
        } \
    } while( 0 )
|
|
|
|
namespace
|
|
{
|
|
|
|
// Shorthand for the payload delivered with kernel-symbol register callbacks; copies of it
// are stored per kernel id in ToolData::client_kernels.
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
|
|
|
// Per-dispatch bookkeeping, stored in ToolData::dispatch_data keyed by dispatch id.
// All members default to zero so a default-initialized instance never holds
// indeterminate values.
struct DispatchData
{
    int64_t launch_start = 0;   // Tracy CPU time sampled when the enqueue callback enters
    int64_t launch_end = 0;     // Tracy CPU time sampled when the enqueue callback exits
    uint32_t thread_id = 0;     // Tracy thread handle of the thread that recorded the zone
    uint16_t query_id = 0;      // GPU query id assigned to the zone in record_interval
};
|
|
|
|
struct ToolData
|
|
{
|
|
uint32_t version;
|
|
const char* runtime_version;
|
|
uint32_t priority;
|
|
rocprofiler_client_id_t client_id;
|
|
uint8_t context_id;
|
|
bool init;
|
|
uint64_t query_id;
|
|
int64_t previous_cpu_time;
|
|
tracy::unordered_map<rocprofiler_kernel_id_t, kernel_symbol_data_t> client_kernels;
|
|
tracy::unordered_map<rocprofiler_dispatch_id_t, DispatchData> dispatch_data;
|
|
tracy::unordered_set<std::string> counter_names = { "SQ_WAVES", "GL2C_MISS", "GL2C_HIT" };
|
|
std::unique_ptr<tracy::Thread> cal_thread;
|
|
std::mutex mut{};
|
|
};
|
|
|
|
// Lets the code below name tracy entities without qualification
// (e.g. Profiler::GetTime, GetEnvVar).
using namespace tracy;
|
|
|
|
// Accessor for the single rocprofiler context used by this tool. The context lives in a
// function-local static, starts out as { 0 }, and is returned by reference so callers can
// both fill it in and pass it on.
rocprofiler_context_id_t& get_client_ctx()
{
    static auto client_context = rocprofiler_context_id_t{ 0 };
    return client_context;
}
|
|
|
|
// Name reported to Tracy for the GPU context; never reassigned, so the pointer is const too.
const char* const CTX_NAME = "rocprofv3";
|
|
|
|
// Registers a new GPU context with Tracy and returns its id.
//
// Samples CLOCK_BOOTTIME (sent as the GPU-side timestamp) and Tracy's CPU clock, stores
// the CPU sample in data->previous_cpu_time, then queues a GpuNewContext record followed
// by a GpuContextName record carrying CTX_NAME.
//
// @param data  tool state; only previous_cpu_time is written here
// @return      the context id taken from Tracy's GPU context counter
uint8_t gpu_context_allocate( ToolData* data )
{
    timespec ts;
    clock_gettime( CLOCK_BOOTTIME, &ts );
    const uint64_t cpu_timestamp = Profiler::GetTime();
    const uint64_t gpu_timestamp = ( (uint64_t)ts.tv_sec * 1000000000 ) + ts.tv_nsec;
    const float timestamp_period = 1.0f;
    data->previous_cpu_time = cpu_timestamp;

    // Allocate the process-unique GPU context ID. There's a max of 255 available;
    // if we are recreating devices a lot we may exceed that. Don't do that, or
    // wrap around and get weird (but probably still usable) numbers.
    uint8_t context_id = tracy::GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed );
    if( context_id >= 255 )
    {
        context_id %= 255;
    }

    uint8_t context_flags = 0;
#ifdef TRACY_ROCPROF_CALIBRATION
    // Tell tracy we'll be passing calibrated timestamps and not to mess with
    // the times. We'll periodically send GpuCalibration events in case the
    // times drift.
    context_flags |= tracy::GpuContextCalibration;
#endif
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext );
        tracy::MemWrite( &item->gpuNewContext.cpuTime, cpu_timestamp );
        tracy::MemWrite( &item->gpuNewContext.gpuTime, gpu_timestamp );
        memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
        tracy::MemWrite( &item->gpuNewContext.period, timestamp_period );
        tracy::MemWrite( &item->gpuNewContext.context, context_id );
        tracy::MemWrite( &item->gpuNewContext.flags, context_flags );
        tracy::MemWrite( &item->gpuNewContext.type, tracy::GpuContextType::Rocprof );
        tracy::Profiler::QueueSerialFinish();
    }

    // Send the name of the context along.
    // NOTE: Tracy will unconditionally free the name so we must clone it here.
    // Since internally Tracy will use its own rpmalloc implementation we must
    // make sure we allocate from the same source.
    // The clone is not NUL-terminated; its length travels in the record instead.
    const size_t name_length = strlen( CTX_NAME );
    char* cloned_name = (char*)tracy::tracy_malloc( name_length );
    memcpy( cloned_name, CTX_NAME, name_length );
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName );
        tracy::MemWrite( &item->gpuContextNameFat.context, context_id );
        tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)cloned_name );
        // Narrow explicitly instead of handing MemWrite a size_t.
        // NOTE(review): assumes the record's size field is 16 bits wide and that MemWrite
        // copies sizeof( value ) bytes - confirm against Tracy's queue definitions.
        tracy::MemWrite( &item->gpuContextNameFat.size, static_cast<uint16_t>( name_length ) );
        tracy::Profiler::QueueSerialFinish();
    }

    return context_id;
}
|
|
|
|
// Allocates a Tracy source location named after the kernel with the given id.
//
// @param data       tool state; client_kernels is read under data->mut
// @param kernel_id  rocprofiler kernel id to look up
// @return           the value returned by Profiler::AllocSourceLocation, or 0 when the
//                   kernel id is not present in client_kernels
uint64_t kernel_src_loc( ToolData* data, uint64_t kernel_id )
{
    // The lock stays held while the name is read so the map entry cannot be erased
    // by the UNLOAD callback in the meantime.
    auto _lk = std::unique_lock{ data->mut };
    const rocprofiler_kernel_id_t kid = kernel_id;

    // Single lookup instead of count() followed by operator[].
    const auto it = data->client_kernels.find( kid );
    if( it == data->client_kernels.end() ) return 0;

    const char* name = it->second.kernel_name;
    const size_t name_len = strlen( name );
    const uint32_t line = 0;
    return tracy::Profiler::AllocSourceLocation( line, nullptr, 0, name, name_len, nullptr, 0 );
}
|
|
|
|
// Emits one complete GPU zone to Tracy: zone begin, GPU start time, zone end, GPU end time.
//
// @param data             tool state; query_id and dispatch_data are updated under data->mut
// @param start_timestamp  GPU-side start of the interval
// @param end_timestamp    GPU-side end of the interval
// @param src_loc          source location from Profiler::AllocSourceLocation, or 0 to use a
//                         static empty source location
// @param dispatch_id      dispatch whose recorded launch times give the CPU-side zone
//                         bounds, or UINT64_MAX when there is none (memory copies); in
//                         that case the current CPU time is used for both bounds
void record_interval( ToolData* data, rocprofiler_timestamp_t start_timestamp, rocprofiler_timestamp_t end_timestamp,
                      uint64_t src_loc, rocprofiler_dispatch_id_t dispatch_id )
{
    const uint8_t context_id = data->context_id;
    const auto thread = tracy::GetThreadHandle();
    const bool has_dispatch = dispatch_id != UINT64_MAX;

    uint16_t query_id = 0;
    uint64_t cpu_start_time = 0, cpu_end_time = 0;
    {
        // One critical section covers both the query id allocation and the dispatch
        // entry access, so the entry is looked up only once.
        auto _lk = std::unique_lock{ data->mut };
        // Query ids are 16 bits wide; the 64-bit running counter wraps by truncation.
        query_id = static_cast<uint16_t>( data->query_id );
        data->query_id++;
        if( has_dispatch )
        {
            DispatchData& entry = data->dispatch_data[dispatch_id];
            // Remembered so record_callback can attach counter values to this zone.
            entry.query_id = query_id;
            entry.thread_id = thread;
            cpu_start_time = entry.launch_start;
            cpu_end_time = entry.launch_end;
        }
    }
    if( !has_dispatch )
    {
        cpu_start_time = tracy::Profiler::GetTime();
        cpu_end_time = tracy::Profiler::GetTime();
    }

    // Used when the caller supplied no allocated source location.
    static const ___tracy_source_location_data empty_srcloc = { nullptr, nullptr, nullptr, 0, 0 };
    const bool has_alloc_srcloc = src_loc != 0;
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, has_alloc_srcloc ? tracy::QueueType::GpuZoneBeginAllocSrcLocSerial
                                                           : tracy::QueueType::GpuZoneBeginSerial );
        tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time );
        tracy::MemWrite( &item->gpuZoneBegin.srcloc, has_alloc_srcloc ? src_loc : (uint64_t)&empty_srcloc );
        tracy::MemWrite( &item->gpuZoneBegin.thread, thread );
        tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id );
        tracy::MemWrite( &item->gpuZoneBegin.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    }

    // Queues a GpuTime record for this zone's query id.
    const auto emit_gpu_time = [query_id, context_id]( rocprofiler_timestamp_t gpu_time )
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime );
        tracy::MemWrite( &item->gpuTime.gpuTime, gpu_time );
        tracy::MemWrite( &item->gpuTime.queryId, query_id );
        tracy::MemWrite( &item->gpuTime.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    };

    emit_gpu_time( start_timestamp );

    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneEndSerial );
        tracy::MemWrite( &item->gpuZoneEnd.cpuTime, cpu_end_time );
        tracy::MemWrite( &item->gpuZoneEnd.thread, thread );
        tracy::MemWrite( &item->gpuZoneEnd.queryId, query_id );
        tracy::MemWrite( &item->gpuZoneEnd.context, context_id );
        tracy::Profiler::QueueSerialFinish();
    }

    emit_gpu_time( end_timestamp );
}
|
|
|
|
// Counter-record callback of the dispatch counting service.
//
// Sums the record values per counter id and queues one GpuZoneAnnotation per counter,
// addressed to the zone that record_interval emitted for the same dispatch.
//
// @param dispatch_data  describes the dispatch the records belong to
// @param record_data    array of counter records
// @param record_count   number of entries in record_data
// @param callback_data  the ToolData instance registered in tool_init
// Throws std::runtime_error (via ROCPROFILER_CALL) when a counter id query fails.
void record_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data,
                      rocprofiler_record_counter_t* record_data, size_t record_count,
                      rocprofiler_user_data_t /*user_data*/, void* callback_data )
{
    assert( callback_data != nullptr );
    ToolData* data = static_cast<ToolData*>( callback_data );
    if( !data->init ) return;

    // A counter may be reported through several records; accumulate them per counter id.
    std::unordered_map<rocprofiler_counter_instance_id_t, double> sums;
    for( size_t i = 0; i < record_count; ++i )
    {
        auto _counter_id = rocprofiler_counter_id_t{};
        ROCPROFILER_CALL( rocprofiler_query_record_counter_id( record_data[i].id, &_counter_id ),
                          "query record counter id" );
        sums[_counter_id.handle] += record_data[i].counter_value;
    }

    uint16_t query_id = 0;
    uint32_t thread_id = 0;
    {
        auto _lk = std::unique_lock{ data->mut };
        // An assumption is made here that the counter values are supplied after the dispatch
        // complete callback.
        const auto it = data->dispatch_data.find( dispatch_data.dispatch_info.dispatch_id );
        assert( it != data->dispatch_data.end() );
        // With assertions disabled, drop the values rather than inserting an empty entry
        // and annotating whichever zone happens to own query id 0.
        if( it == data->dispatch_data.end() ) return;
        query_id = it->second.query_id;
        thread_id = it->second.thread_id;
        // NOTE(review): nothing in this file erases dispatch_data entries, so the map
        // grows with every dispatch - consider erasing here once the ordering assumption
        // above is confirmed.
    }

    for( const auto& p : sums )
    {
        auto* item = tracy::Profiler::QueueSerial();
        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneAnnotation );
        tracy::MemWrite( &item->zoneAnnotation.noteId, p.first );
        tracy::MemWrite( &item->zoneAnnotation.queryId, query_id );
        tracy::MemWrite( &item->zoneAnnotation.thread, thread_id );
        tracy::MemWrite( &item->zoneAnnotation.value, p.second );
        tracy::MemWrite( &item->zoneAnnotation.context, data->context_id );
        tracy::Profiler::QueueSerialFinish();
    }
}
|
|
|
|
/**
|
|
* Callback from rocprofiler when an kernel dispatch is enqueued into the HSA queue.
|
|
* rocprofiler_counter_config_id_t* is a return to specify what counters to collect
|
|
* for this dispatch (dispatch_packet).
|
|
*/
|
|
void dispatch_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data,
|
|
rocprofiler_profile_config_id_t* config, rocprofiler_user_data_t* /*user_data*/,
|
|
void* callback_data )
|
|
{
|
|
assert( callback_data != nullptr );
|
|
ToolData* data = static_cast<ToolData*>( callback_data );
|
|
if( !data->init ) return;
|
|
|
|
/**
|
|
* This simple example uses the same profile counter set for all agents.
|
|
* We store this in a cache to prevent constructing many identical profile counter
|
|
* sets. We first check the cache to see if we have already constructed a counter"
|
|
* set for the agent. If we have, return it. Otherwise, construct a new profile counter
|
|
* set.
|
|
*/
|
|
static std::shared_mutex m_mutex = {};
|
|
static std::unordered_map<uint64_t, rocprofiler_profile_config_id_t> profile_cache = {};
|
|
|
|
auto search_cache = [&]()
|
|
{
|
|
if( auto pos = profile_cache.find( dispatch_data.dispatch_info.agent_id.handle ); pos != profile_cache.end() )
|
|
{
|
|
*config = pos->second;
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
{
|
|
auto rlock = std::shared_lock{ m_mutex };
|
|
if( search_cache() ) return;
|
|
}
|
|
|
|
auto wlock = std::unique_lock{ m_mutex };
|
|
if( search_cache() ) return;
|
|
|
|
// GPU Counter IDs
|
|
std::vector<rocprofiler_counter_id_t> gpu_counters;
|
|
|
|
// Iterate through the agents and get the counters available on that agent
|
|
ROCPROFILER_CALL(
|
|
rocprofiler_iterate_agent_supported_counters(
|
|
dispatch_data.dispatch_info.agent_id,
|
|
[]( rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, void* user_data )
|
|
{
|
|
std::vector<rocprofiler_counter_id_t>* vec =
|
|
static_cast<std::vector<rocprofiler_counter_id_t>*>( user_data );
|
|
for( size_t i = 0; i < num_counters; i++ )
|
|
{
|
|
vec->push_back( counters[i] );
|
|
}
|
|
return ROCPROFILER_STATUS_SUCCESS;
|
|
},
|
|
static_cast<void*>( &gpu_counters ) ),
|
|
"Could not fetch supported counters" );
|
|
|
|
std::vector<rocprofiler_counter_id_t> collect_counters;
|
|
collect_counters.reserve( data->counter_names.size() );
|
|
// Look for the counters contained in counters_to_collect in gpu_counters
|
|
for( auto& counter : gpu_counters )
|
|
{
|
|
rocprofiler_counter_info_v0_t info;
|
|
ROCPROFILER_CALL(
|
|
rocprofiler_query_counter_info( counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast<void*>( &info ) ),
|
|
"Could not query info" );
|
|
if( data->counter_names.count( std::string( info.name ) ) > 0 )
|
|
{
|
|
collect_counters.push_back( counter );
|
|
|
|
size_t name_length = strlen( info.name );
|
|
char* cloned_name = (char*)tracy::tracy_malloc( name_length );
|
|
memcpy( cloned_name, info.name, name_length );
|
|
{
|
|
auto* item = tracy::Profiler::QueueSerial();
|
|
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuAnnotationName );
|
|
tracy::MemWrite( &item->gpuAnnotationNameFat.context, data->context_id );
|
|
tracy::MemWrite( &item->gpuAnnotationNameFat.noteId, counter.handle );
|
|
tracy::MemWrite( &item->gpuAnnotationNameFat.ptr, (uint64_t)cloned_name );
|
|
tracy::MemWrite( &item->gpuAnnotationNameFat.size, name_length );
|
|
tracy::Profiler::QueueSerialFinish();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create a colleciton profile for the counters
|
|
rocprofiler_profile_config_id_t profile = { .handle = 0 };
|
|
ROCPROFILER_CALL( rocprofiler_create_profile_config( dispatch_data.dispatch_info.agent_id, collect_counters.data(),
|
|
collect_counters.size(), &profile ),
|
|
"Could not construct profile cfg" );
|
|
|
|
profile_cache.emplace( dispatch_data.dispatch_info.agent_id.handle, profile );
|
|
// Return the profile to collect those counters for this dispatch
|
|
*config = profile;
|
|
}
|
|
|
|
// Callback shared by all callback-tracing services configured in tool_init.
//
// - Kernel symbol register: LOAD stores the symbol data by kernel id, UNLOAD erases it.
// - Kernel dispatch enqueue: ENTER/EXIT record the CPU launch start/end times.
// - Kernel dispatch complete: emits the GPU zone for the kernel via record_interval.
// - Memory copy (EXIT phase): emits a GPU zone named after the copy direction.
//
// @param record         the tracing record; kind, operation, phase and payload are read
// @param user_data      unused
// @param callback_data  the ToolData instance registered in tool_init
void tool_callback_tracing_callback( rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data,
                                     void* callback_data )
{
    assert( callback_data != nullptr );
    ToolData* data = static_cast<ToolData*>( callback_data );
    if( !data->init ) return;

    if( record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
        record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER )
    {
        auto* sym_data = static_cast<kernel_symbol_data_t*>( record.payload );

        if( record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD )
        {
            auto _lk = std::unique_lock{ data->mut };
            data->client_kernels.emplace( sym_data->kernel_id, *sym_data );
        }
        else if( record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD )
        {
            auto _lk = std::unique_lock{ data->mut };
            data->client_kernels.erase( sym_data->kernel_id );
        }
    }
    else if( record.kind == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH )
    {
        auto* rdata = static_cast<rocprofiler_callback_tracing_kernel_dispatch_data_t*>( record.payload );
        if( record.operation == ROCPROFILER_KERNEL_DISPATCH_ENQUEUE )
        {
            // The enqueue ENTER/EXIT pair brackets the launch on the CPU; these times
            // become the CPU-side bounds of the zone emitted on completion.
            if( record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER )
            {
                auto _lk = std::unique_lock{ data->mut };
                data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_start = tracy::Profiler::GetTime();
            }
            else if( record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT )
            {
                auto _lk = std::unique_lock{ data->mut };
                data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_end = tracy::Profiler::GetTime();
            }
        }
        else if( record.operation == ROCPROFILER_KERNEL_DISPATCH_COMPLETE )
        {
            uint64_t src_loc = kernel_src_loc( data, rdata->dispatch_info.kernel_id );
            record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc,
                             rdata->dispatch_info.dispatch_id );
        }
    }
    else if( record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY &&
             record.operation != ROCPROFILER_MEMORY_COPY_NONE && record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT )
    {
        auto* rdata = static_cast<rocprofiler_callback_tracing_memory_copy_data_t*>( record.payload );
        const char* name = nullptr;
        switch( record.operation )
        {
        case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE:
            name = "DeviceToDeviceCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST:
            name = "DeviceToHostCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE:
            name = "HostToDeviceCopy";
            break;
        case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST:
            name = "HostToHostCopy";
            break;
        default:
            // An operation not listed above would otherwise leave `name` null and make
            // the strlen below undefined behavior; record it under a generic name.
            name = "MemoryCopy";
            break;
        }
        size_t name_len = strlen( name );
        uint64_t src_loc = tracy::Profiler::AllocSourceLocation( 0, nullptr, 0, name, name_len, nullptr, 0 );
        // Memory copies have no dispatch entry; UINT64_MAX tells record_interval to use
        // the current CPU time for the zone bounds.
        record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc, UINT64_MAX );
    }
}
|
|
|
|
void calibration_thread( void* ptr )
|
|
{
|
|
while( !TracyIsStarted )
|
|
;
|
|
ToolData* data = static_cast<ToolData*>( ptr );
|
|
data->context_id = gpu_context_allocate( data );
|
|
const char* user_counters = GetEnvVar( "TRACY_ROCPROF_COUNTERS" );
|
|
if( user_counters )
|
|
{
|
|
data->counter_names.clear();
|
|
std::stringstream ss( user_counters );
|
|
std::string counter;
|
|
while( std::getline( ss, counter, ',' ) ) data->counter_names.insert( counter );
|
|
}
|
|
data->init = true;
|
|
|
|
#ifdef TRACY_ROCPROF_CALIBRATION
|
|
while( data->init )
|
|
{
|
|
sleep( 1 );
|
|
|
|
timespec ts;
|
|
// HSA performs a linear interpolation of GPU time to CLOCK_BOOTTIME. However, this is
|
|
// subject to network time updates and can drift relative to tracy's clock.
|
|
clock_gettime( CLOCK_BOOTTIME, &ts );
|
|
int64_t cpu_timestamp = Profiler::GetTime();
|
|
int64_t gpu_timestamp = ts.tv_nsec + ts.tv_sec * 1e9L;
|
|
|
|
if( cpu_timestamp > data->previous_cpu_time )
|
|
{
|
|
auto* item = tracy::Profiler::QueueSerial();
|
|
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration );
|
|
tracy::MemWrite( &item->gpuCalibration.gpuTime, gpu_timestamp );
|
|
tracy::MemWrite( &item->gpuCalibration.cpuTime, cpu_timestamp );
|
|
tracy::MemWrite( &item->gpuCalibration.cpuDelta, cpu_timestamp - data->previous_cpu_time );
|
|
tracy::MemWrite( &item->gpuCalibration.context, data->context_id );
|
|
tracy::Profiler::QueueSerialFinish();
|
|
data->previous_cpu_time = cpu_timestamp;
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Initialization hook handed to rocprofiler through rocprofiler_configure.
//
// Starts the calibration thread, creates the client context, registers the dispatch
// counting service and the three callback-tracing services (code object symbols, kernel
// dispatch, memory copy) on it, and finally starts the context.
//
// @param fini_func  unused
// @param user_data  the ToolData instance; also forwarded as callback data
// @return           0 on success; any rocprofiler failure throws via ROCPROFILER_CALL
int tool_init( rocprofiler_client_finalize_t fini_func, void* user_data )
{
    auto* tool = static_cast<ToolData*>( user_data );
    tool->cal_thread = std::make_unique<tracy::Thread>( calibration_thread, tool );

    ROCPROFILER_CALL( rocprofiler_create_context( &get_client_ctx() ), "context creation failed" );

    // Counter collection: dispatch_callback picks the counters, record_callback
    // receives the values.
    ROCPROFILER_CALL( rocprofiler_configure_callback_dispatch_counting_service( get_client_ctx(), dispatch_callback,
                                                                                user_data, record_callback, user_data ),
                      "Could not setup counting service" );

    // Kernel symbol registration.
    rocprofiler_tracing_operation_t ops[] = { ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER };
    ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(),
                                                                      ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ops, 1,
                                                                      tool_callback_tracing_callback, user_data ),
                      "callback tracing service failed to configure" );

    // Kernel dispatch enqueue and completion.
    rocprofiler_tracing_operation_t ops2[] = { ROCPROFILER_KERNEL_DISPATCH_COMPLETE,
                                               ROCPROFILER_KERNEL_DISPATCH_ENQUEUE };
    ROCPROFILER_CALL(
        rocprofiler_configure_callback_tracing_service( get_client_ctx(), ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH,
                                                        ops2, 2, tool_callback_tracing_callback, user_data ),
        "callback tracing service failed to configure" );

    // Memory copies; no operation list is passed for this kind.
    ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(),
                                                                      ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, nullptr,
                                                                      0, tool_callback_tracing_callback, user_data ),
                      "callback tracing service failed to configure" );

    ROCPROFILER_CALL( rocprofiler_start_context( get_client_ctx() ), "start context" );
    return 0;
}
|
|
|
|
void tool_fini( void* tool_data_v )
|
|
{
|
|
rocprofiler_stop_context( get_client_ctx() );
|
|
|
|
ToolData* data = static_cast<ToolData*>( tool_data_v );
|
|
data->init = false;
|
|
data->cal_thread.reset();
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
{
|
|
rocprofiler_tool_configure_result_t* rocprofiler_configure( uint32_t version, const char* runtime_version,
|
|
uint32_t priority, rocprofiler_client_id_t* client_id )
|
|
{
|
|
// If not the first tool to register, indicate that the tool doesn't want to do anything
|
|
if( priority > 0 ) return nullptr;
|
|
|
|
// (optional) Provide a name for this tool to rocprofiler
|
|
client_id->name = "Tracy";
|
|
|
|
// (optional) create configure data
|
|
static ToolData data = ToolData{ version, runtime_version, priority, *client_id, 0, false, 0, 0 };
|
|
|
|
// construct configure result
|
|
static auto cfg = rocprofiler_tool_configure_result_t{ sizeof( rocprofiler_tool_configure_result_t ),
|
|
&tool_init, &tool_fini, static_cast<void*>( &data ) };
|
|
|
|
return &cfg;
|
|
}
|
|
}
|