Skip to content

Commit

Permalink
Add Stat type for Source Stack to show in trace viewer
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 682500639
  • Loading branch information
Google-ML-Automation committed Oct 5, 2024
1 parent 5da861b commit 1bbbee5
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 144 deletions.
286 changes: 143 additions & 143 deletions xla/tsl/profiler/utils/xplane_schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,149 +201,149 @@ const HostEventTypeMap& GetHostEventTypeMap() {
}

// Returns a reference to a lazily constructed, function-local static
// StatTypeMap that pairs stat name strings with StatType enumerators.
// The map is allocated with `new` and is not deleted anywhere in the visible
// code.
//
// NOTE(review): this span is a rendered diff hunk without +/- markers. The
// first initializer (ending at the first `});`) is the pre-change side; the
// second initializer is the post-change side, which is re-indented and adds
// the {"source_stack", kSourceStack} entry. Only one of the two declarations
// exists in the real file at any revision.
const StatTypeMap& GetStatTypeMap() {
// Pre-change side of the diff.
static auto* stat_type_map = new StatTypeMap({
{"UnknownStatType", kUnknownStatType},
// TraceMe arguments.
{"id", kStepId},
{"device_ordinal", kDeviceOrdinal},
{"chip_ordinal", kChipOrdinal},
{"node_ordinal", kNodeOrdinal},
{"model_id", kModelId},
{"queue_addr", kQueueAddr},
{"queue_id", kQueueId},
{"request_id", kRequestId},
{"run_id", kRunId},
{"replica_id", kReplicaId},
{"graph_type", kGraphType},
{"step_num", kStepNum},
{"iter_num", kIterNum},
{"index_on_host", kIndexOnHost},
{"allocator_name", kAllocatorName},
{"bytes_reserved", kBytesReserved},
{"bytes_allocated", kBytesAllocated},
{"bytes_available", kBytesAvailable},
{"fragmentation", kFragmentation},
{"peak_bytes_in_use", kPeakBytesInUse},
{"requested_bytes", kRequestedBytes},
{"allocation_bytes", kAllocationBytes},
{"addr", kAddress},
{"region_type", kRegionType},
{"data_type", kDataType},
{"shape", kTensorShapes},
{"layout", kTensorLayout},
{"kpi_name", kKpiName},
{"kpi_value", kKpiValue},
{"element_id", kElementId},
{"parent_id", kParentId},
{"core_type", kCoreType},
// XPlane semantics related.
{"_pt", kProducerType},
{"_ct", kConsumerType},
{"_p", kProducerId},
{"_c", kConsumerId},
{"_r", kIsRoot},
{"_a", kIsAsync},
// Device trace arguments.
{"device_id", kDeviceId},
{"device_type_string", kDeviceTypeString},
{"context_id", kContextId},
{"correlation_id", kCorrelationId},
{"memcpy_details", kMemcpyDetails},
{"memalloc_details", kMemallocDetails},
{"MemFree_details", kMemFreeDetails},
{"Memset_details", kMemsetDetails},
{"MemoryResidency_details", kMemoryResidencyDetails},
{"kernel_details", kKernelDetails},
{"nvtx_range", kNVTXRange},
{"stream", kStream},
// Stats added when processing traces.
{"group_id", kGroupId},
{"flow", kFlow},
{"step_name", kStepName},
{"tf_op", kTfOp},
{"hlo_op", kHloOp},
{"deduplicated_name", kDeduplicatedName},
{"hlo_category", kHloCategory},
{"hlo_module", kHloModule},
{"program_id", kProgramId},
{"equation", kEquation},
{"is_eager", kIsEager},
{"is_func", kIsFunc},
{"tf_function_call", kTfFunctionCall},
{"tracing_count", kTfFunctionTracingCount},
{"flops", kFlops},
{"model_flops", kModelFlops},
{"bytes_accessed", kBytesAccessed},
{"memory_access_breakdown", kMemoryAccessBreakdown},
{"source", kSourceInfo},
{"model_name", kModelName},
{"model_version", kModelVersion},
{"bytes_transferred", kBytesTransferred},
{"queue", kDmaQueue},
{"dcn_collective_info", kDcnCollectiveInfo},
// Performance counter related.
{"Raw Value", kRawValue},
{"Scaled Value", kScaledValue},
{"Thread Id", kThreadId},
{"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent},
// XLA metadata map related.
{"Hlo Proto", kHloProto},
{"EdgeTPU Model information", kEdgeTpuModelInfo},
{"EdgeTPU Model Profile information", kEdgeTpuModelProfileInfo},
{"EdgeTPU MLIR", kEdgeTpuMlir},
// Device capability related.
{"clock_rate", kDevCapClockRateKHz},
{"core_count", kDevCapCoreCount},
{"memory_bandwidth", kDevCapMemoryBandwidth},
{"memory_size", kDevCapMemorySize},
{"compute_cap_major", kDevCapComputeCapMajor},
{"compute_cap_minor", kDevCapComputeCapMinor},
{"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond},
{"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond},
{"peak_sram_rd_bw_gigabytes_per_second",
kDevCapPeakSramRdBwGigabytesPerSecond},
{"peak_sram_wr_bw_gigabytes_per_second",
kDevCapPeakSramWrBwGigabytesPerSecond},
{"device_vendor", kDevVendor},
// Batching related.
{"batch_size_after_padding", kBatchSizeAfterPadding},
{"padding_amount", kPaddingAmount},
{"batching_input_task_size", kBatchingInputTaskSize},
// GPU related metrics.
{"theoretical_occupancy_pct", kTheoreticalOccupancyPct},
{"occupancy_min_grid_size", kOccupancyMinGridSize},
{"occupancy_suggested_block_size", kOccupancySuggestedBlockSize},
// Aggregated Stat
{"self_duration_ps", kSelfDurationPs},
{"min_duration_ps", kMinDurationPs},
{"total_profile_duration_ps", kTotalProfileDurationPs},
{"max_iteration_num", kMaxIterationNum},
{"device_type", kDeviceType},
{"uses_megacore", kUsesMegaCore},
{"symbol_id", kSymbolId},
// NOTE(review): "hlo_category" is also listed above under "Stats added when
// processing traces" with the same value; presumably one of the two entries
// is redundant if StatTypeMap keeps unique keys -- confirm.
{"hlo_category", kHloCategory},
{"tf_op_name", kTfOpName},
{"dma_stall_duration_ps", kDmaStallDurationPs},
{"key", kKey},
{"payload_size_bytes", kPayloadSizeBytes},
{"duration_us", kDuration},
{"buffer_size", kBufferSize},
{"transfers", kTransfers},
// Dcn message Stats
{"dcn_label", kDcnLabel},
{"dcn_source_slice_id", kDcnSourceSliceId},
{"dcn_source_per_slice_device_id", kDcnSourcePerSliceDeviceId},
{"dcn_destination_slice_id", kDcnDestinationSliceId},
{"dcn_destination_per_slice_device_id", kDcnDestinationPerSliceDeviceId},
{"dcn_chunk", kDcnChunk},
{"dcn_loop_index", kDcnLoopIndex},
{"dropped_traces", kDroppedTraces},
{"cuda_graph_id", kCudaGraphId},
{"cuda_graph_exec_id", kCudaGraphExecId},
{"cuda_graph_orig_id", kCudaGraphOrigId},
{"step_idle_time_ps", kStepIdleTimePs},
{"gpu_device_name", kGpuDeviceName},
});
// Post-change side of the diff: same entries, re-indented, plus
// "source_stack" as the final entry.
static auto* stat_type_map = new StatTypeMap(
{{"UnknownStatType", kUnknownStatType},
// TraceMe arguments.
{"id", kStepId},
{"device_ordinal", kDeviceOrdinal},
{"chip_ordinal", kChipOrdinal},
{"node_ordinal", kNodeOrdinal},
{"model_id", kModelId},
{"queue_addr", kQueueAddr},
{"queue_id", kQueueId},
{"request_id", kRequestId},
{"run_id", kRunId},
{"replica_id", kReplicaId},
{"graph_type", kGraphType},
{"step_num", kStepNum},
{"iter_num", kIterNum},
{"index_on_host", kIndexOnHost},
{"allocator_name", kAllocatorName},
{"bytes_reserved", kBytesReserved},
{"bytes_allocated", kBytesAllocated},
{"bytes_available", kBytesAvailable},
{"fragmentation", kFragmentation},
{"peak_bytes_in_use", kPeakBytesInUse},
{"requested_bytes", kRequestedBytes},
{"allocation_bytes", kAllocationBytes},
{"addr", kAddress},
{"region_type", kRegionType},
{"data_type", kDataType},
{"shape", kTensorShapes},
{"layout", kTensorLayout},
{"kpi_name", kKpiName},
{"kpi_value", kKpiValue},
{"element_id", kElementId},
{"parent_id", kParentId},
{"core_type", kCoreType},
// XPlane semantics related.
{"_pt", kProducerType},
{"_ct", kConsumerType},
{"_p", kProducerId},
{"_c", kConsumerId},
{"_r", kIsRoot},
{"_a", kIsAsync},
// Device trace arguments.
{"device_id", kDeviceId},
{"device_type_string", kDeviceTypeString},
{"context_id", kContextId},
{"correlation_id", kCorrelationId},
{"memcpy_details", kMemcpyDetails},
{"memalloc_details", kMemallocDetails},
{"MemFree_details", kMemFreeDetails},
{"Memset_details", kMemsetDetails},
{"MemoryResidency_details", kMemoryResidencyDetails},
{"kernel_details", kKernelDetails},
{"nvtx_range", kNVTXRange},
{"stream", kStream},
// Stats added when processing traces.
{"group_id", kGroupId},
{"flow", kFlow},
{"step_name", kStepName},
{"tf_op", kTfOp},
{"hlo_op", kHloOp},
{"deduplicated_name", kDeduplicatedName},
{"hlo_category", kHloCategory},
{"hlo_module", kHloModule},
{"program_id", kProgramId},
{"equation", kEquation},
{"is_eager", kIsEager},
{"is_func", kIsFunc},
{"tf_function_call", kTfFunctionCall},
{"tracing_count", kTfFunctionTracingCount},
{"flops", kFlops},
{"model_flops", kModelFlops},
{"bytes_accessed", kBytesAccessed},
{"memory_access_breakdown", kMemoryAccessBreakdown},
{"source", kSourceInfo},
{"model_name", kModelName},
{"model_version", kModelVersion},
{"bytes_transferred", kBytesTransferred},
{"queue", kDmaQueue},
{"dcn_collective_info", kDcnCollectiveInfo},
// Performance counter related.
{"Raw Value", kRawValue},
{"Scaled Value", kScaledValue},
{"Thread Id", kThreadId},
{"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent},
// XLA metadata map related.
{"Hlo Proto", kHloProto},
{"EdgeTPU Model information", kEdgeTpuModelInfo},
{"EdgeTPU Model Profile information", kEdgeTpuModelProfileInfo},
{"EdgeTPU MLIR", kEdgeTpuMlir},
// Device capability related.
{"clock_rate", kDevCapClockRateKHz},
{"core_count", kDevCapCoreCount},
{"memory_bandwidth", kDevCapMemoryBandwidth},
{"memory_size", kDevCapMemorySize},
{"compute_cap_major", kDevCapComputeCapMajor},
{"compute_cap_minor", kDevCapComputeCapMinor},
{"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond},
{"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond},
{"peak_sram_rd_bw_gigabytes_per_second",
kDevCapPeakSramRdBwGigabytesPerSecond},
{"peak_sram_wr_bw_gigabytes_per_second",
kDevCapPeakSramWrBwGigabytesPerSecond},
{"device_vendor", kDevVendor},
// Batching related.
{"batch_size_after_padding", kBatchSizeAfterPadding},
{"padding_amount", kPaddingAmount},
{"batching_input_task_size", kBatchingInputTaskSize},
// GPU related metrics.
{"theoretical_occupancy_pct", kTheoreticalOccupancyPct},
{"occupancy_min_grid_size", kOccupancyMinGridSize},
{"occupancy_suggested_block_size", kOccupancySuggestedBlockSize},
// Aggregated Stat
{"self_duration_ps", kSelfDurationPs},
{"min_duration_ps", kMinDurationPs},
{"total_profile_duration_ps", kTotalProfileDurationPs},
{"max_iteration_num", kMaxIterationNum},
{"device_type", kDeviceType},
{"uses_megacore", kUsesMegaCore},
{"symbol_id", kSymbolId},
// NOTE(review): "hlo_category" is also listed above under "Stats added when
// processing traces" with the same value; presumably one of the two entries
// is redundant if StatTypeMap keeps unique keys -- confirm.
{"hlo_category", kHloCategory},
{"tf_op_name", kTfOpName},
{"dma_stall_duration_ps", kDmaStallDurationPs},
{"key", kKey},
{"payload_size_bytes", kPayloadSizeBytes},
{"duration_us", kDuration},
{"buffer_size", kBufferSize},
{"transfers", kTransfers},
// Dcn message Stats
{"dcn_label", kDcnLabel},
{"dcn_source_slice_id", kDcnSourceSliceId},
{"dcn_source_per_slice_device_id", kDcnSourcePerSliceDeviceId},
{"dcn_destination_slice_id", kDcnDestinationSliceId},
{"dcn_destination_per_slice_device_id", kDcnDestinationPerSliceDeviceId},
{"dcn_chunk", kDcnChunk},
{"dcn_loop_index", kDcnLoopIndex},
{"dropped_traces", kDroppedTraces},
{"cuda_graph_id", kCudaGraphId},
{"cuda_graph_exec_id", kCudaGraphExecId},
{"cuda_graph_orig_id", kCudaGraphOrigId},
{"step_idle_time_ps", kStepIdleTimePs},
{"gpu_device_name", kGpuDeviceName},
{"source_stack", kSourceStack}});
// Checks that the number of map entries equals kNumStatTypes (defined outside
// this view; presumably derived from the StatType enum -- confirm). A stat
// added to the enum without a matching entry here would trip this check.
DCHECK_EQ(stat_type_map->size(), kNumStatTypes);
return *stat_type_map;
}
Expand Down
3 changes: 2 additions & 1 deletion xla/tsl/profiler/utils/xplane_schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,8 @@ enum StatType {
kCudaGraphOrigId,
kStepIdleTimePs,
kGpuDeviceName,
kLastStatType = kGpuDeviceName,
kSourceStack,
kLastStatType = kSourceStack,
};

enum MegaScaleStatType : uint8_t {
Expand Down

0 comments on commit 1bbbee5

Please sign in to comment.