diff --git a/xla/tsl/profiler/utils/xplane_schema.cc b/xla/tsl/profiler/utils/xplane_schema.cc index d1ea0faf889d0..a3b0e6d68aeb9 100644 --- a/xla/tsl/profiler/utils/xplane_schema.cc +++ b/xla/tsl/profiler/utils/xplane_schema.cc @@ -201,149 +201,149 @@ const HostEventTypeMap& GetHostEventTypeMap() { } const StatTypeMap& GetStatTypeMap() { - static auto* stat_type_map = new StatTypeMap({ - {"UnknownStatType", kUnknownStatType}, - // TraceMe arguments. - {"id", kStepId}, - {"device_ordinal", kDeviceOrdinal}, - {"chip_ordinal", kChipOrdinal}, - {"node_ordinal", kNodeOrdinal}, - {"model_id", kModelId}, - {"queue_addr", kQueueAddr}, - {"queue_id", kQueueId}, - {"request_id", kRequestId}, - {"run_id", kRunId}, - {"replica_id", kReplicaId}, - {"graph_type", kGraphType}, - {"step_num", kStepNum}, - {"iter_num", kIterNum}, - {"index_on_host", kIndexOnHost}, - {"allocator_name", kAllocatorName}, - {"bytes_reserved", kBytesReserved}, - {"bytes_allocated", kBytesAllocated}, - {"bytes_available", kBytesAvailable}, - {"fragmentation", kFragmentation}, - {"peak_bytes_in_use", kPeakBytesInUse}, - {"requested_bytes", kRequestedBytes}, - {"allocation_bytes", kAllocationBytes}, - {"addr", kAddress}, - {"region_type", kRegionType}, - {"data_type", kDataType}, - {"shape", kTensorShapes}, - {"layout", kTensorLayout}, - {"kpi_name", kKpiName}, - {"kpi_value", kKpiValue}, - {"element_id", kElementId}, - {"parent_id", kParentId}, - {"core_type", kCoreType}, - // XPlane semantics related. - {"_pt", kProducerType}, - {"_ct", kConsumerType}, - {"_p", kProducerId}, - {"_c", kConsumerId}, - {"_r", kIsRoot}, - {"_a", kIsAsync}, - // Device trace arguments. - {"device_id", kDeviceId}, - {"device_type_string", kDeviceTypeString}, - {"context_id", kContextId}, - {"correlation_id", kCorrelationId}, - {"memcpy_details", kMemcpyDetails}, - {"memalloc_details", kMemallocDetails}, - {"MemFree_details", kMemFreeDetails}, - {"Memset_details", kMemsetDetails}, - {"MemoryResidency_details", kMemoryResidencyDetails}, - {"kernel_details", kKernelDetails}, - {"nvtx_range", kNVTXRange}, - {"stream", kStream}, - // Stats added when processing traces. - {"group_id", kGroupId}, - {"flow", kFlow}, - {"step_name", kStepName}, - {"tf_op", kTfOp}, - {"hlo_op", kHloOp}, - {"deduplicated_name", kDeduplicatedName}, - {"hlo_category", kHloCategory}, - {"hlo_module", kHloModule}, - {"program_id", kProgramId}, - {"equation", kEquation}, - {"is_eager", kIsEager}, - {"is_func", kIsFunc}, - {"tf_function_call", kTfFunctionCall}, - {"tracing_count", kTfFunctionTracingCount}, - {"flops", kFlops}, - {"model_flops", kModelFlops}, - {"bytes_accessed", kBytesAccessed}, - {"memory_access_breakdown", kMemoryAccessBreakdown}, - {"source", kSourceInfo}, - {"model_name", kModelName}, - {"model_version", kModelVersion}, - {"bytes_transferred", kBytesTransferred}, - {"queue", kDmaQueue}, - {"dcn_collective_info", kDcnCollectiveInfo}, - // Performance counter related. - {"Raw Value", kRawValue}, - {"Scaled Value", kScaledValue}, - {"Thread Id", kThreadId}, - {"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent}, - // XLA metadata map related. - {"Hlo Proto", kHloProto}, - {"EdgeTPU Model information", kEdgeTpuModelInfo}, - {"EdgeTPU Model Profile information", kEdgeTpuModelProfileInfo}, - {"EdgeTPU MLIR", kEdgeTpuMlir}, - // Device capability related. - {"clock_rate", kDevCapClockRateKHz}, - {"core_count", kDevCapCoreCount}, - {"memory_bandwidth", kDevCapMemoryBandwidth}, - {"memory_size", kDevCapMemorySize}, - {"compute_cap_major", kDevCapComputeCapMajor}, - {"compute_cap_minor", kDevCapComputeCapMinor}, - {"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond}, - {"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond}, - {"peak_sram_rd_bw_gigabytes_per_second", - kDevCapPeakSramRdBwGigabytesPerSecond}, - {"peak_sram_wr_bw_gigabytes_per_second", - kDevCapPeakSramWrBwGigabytesPerSecond}, - {"device_vendor", kDevVendor}, - // Batching related. - {"batch_size_after_padding", kBatchSizeAfterPadding}, - {"padding_amount", kPaddingAmount}, - {"batching_input_task_size", kBatchingInputTaskSize}, - // GPU related metrics. - {"theoretical_occupancy_pct", kTheoreticalOccupancyPct}, - {"occupancy_min_grid_size", kOccupancyMinGridSize}, - {"occupancy_suggested_block_size", kOccupancySuggestedBlockSize}, - // Aggregated Stat - {"self_duration_ps", kSelfDurationPs}, - {"min_duration_ps", kMinDurationPs}, - {"total_profile_duration_ps", kTotalProfileDurationPs}, - {"max_iteration_num", kMaxIterationNum}, - {"device_type", kDeviceType}, - {"uses_megacore", kUsesMegaCore}, - {"symbol_id", kSymbolId}, - {"hlo_category", kHloCategory}, - {"tf_op_name", kTfOpName}, - {"dma_stall_duration_ps", kDmaStallDurationPs}, - {"key", kKey}, - {"payload_size_bytes", kPayloadSizeBytes}, - {"duration_us", kDuration}, - {"buffer_size", kBufferSize}, - {"transfers", kTransfers}, - // Dcn message Stats - {"dcn_label", kDcnLabel}, - {"dcn_source_slice_id", kDcnSourceSliceId}, - {"dcn_source_per_slice_device_id", kDcnSourcePerSliceDeviceId}, - {"dcn_destination_slice_id", kDcnDestinationSliceId}, - {"dcn_destination_per_slice_device_id", kDcnDestinationPerSliceDeviceId}, - {"dcn_chunk", kDcnChunk}, - {"dcn_loop_index", kDcnLoopIndex}, - {"dropped_traces", kDroppedTraces}, - {"cuda_graph_id", kCudaGraphId}, - {"cuda_graph_exec_id", kCudaGraphExecId}, - {"cuda_graph_orig_id", kCudaGraphOrigId}, - {"step_idle_time_ps", kStepIdleTimePs}, - {"gpu_device_name", kGpuDeviceName}, - }); + static auto* stat_type_map = new StatTypeMap( + {{"UnknownStatType", kUnknownStatType}, + // TraceMe arguments. + {"id", kStepId}, + {"device_ordinal", kDeviceOrdinal}, + {"chip_ordinal", kChipOrdinal}, + {"node_ordinal", kNodeOrdinal}, + {"model_id", kModelId}, + {"queue_addr", kQueueAddr}, + {"queue_id", kQueueId}, + {"request_id", kRequestId}, + {"run_id", kRunId}, + {"replica_id", kReplicaId}, + {"graph_type", kGraphType}, + {"step_num", kStepNum}, + {"iter_num", kIterNum}, + {"index_on_host", kIndexOnHost}, + {"allocator_name", kAllocatorName}, + {"bytes_reserved", kBytesReserved}, + {"bytes_allocated", kBytesAllocated}, + {"bytes_available", kBytesAvailable}, + {"fragmentation", kFragmentation}, + {"peak_bytes_in_use", kPeakBytesInUse}, + {"requested_bytes", kRequestedBytes}, + {"allocation_bytes", kAllocationBytes}, + {"addr", kAddress}, + {"region_type", kRegionType}, + {"data_type", kDataType}, + {"shape", kTensorShapes}, + {"layout", kTensorLayout}, + {"kpi_name", kKpiName}, + {"kpi_value", kKpiValue}, + {"element_id", kElementId}, + {"parent_id", kParentId}, + {"core_type", kCoreType}, + // XPlane semantics related. + {"_pt", kProducerType}, + {"_ct", kConsumerType}, + {"_p", kProducerId}, + {"_c", kConsumerId}, + {"_r", kIsRoot}, + {"_a", kIsAsync}, + // Device trace arguments. + {"device_id", kDeviceId}, + {"device_type_string", kDeviceTypeString}, + {"context_id", kContextId}, + {"correlation_id", kCorrelationId}, + {"memcpy_details", kMemcpyDetails}, + {"memalloc_details", kMemallocDetails}, + {"MemFree_details", kMemFreeDetails}, + {"Memset_details", kMemsetDetails}, + {"MemoryResidency_details", kMemoryResidencyDetails}, + {"kernel_details", kKernelDetails}, + {"nvtx_range", kNVTXRange}, + {"stream", kStream}, + // Stats added when processing traces. + {"group_id", kGroupId}, + {"flow", kFlow}, + {"step_name", kStepName}, + {"tf_op", kTfOp}, + {"hlo_op", kHloOp}, + {"deduplicated_name", kDeduplicatedName}, + {"hlo_category", kHloCategory}, + {"hlo_module", kHloModule}, + {"program_id", kProgramId}, + {"equation", kEquation}, + {"is_eager", kIsEager}, + {"is_func", kIsFunc}, + {"tf_function_call", kTfFunctionCall}, + {"tracing_count", kTfFunctionTracingCount}, + {"flops", kFlops}, + {"model_flops", kModelFlops}, + {"bytes_accessed", kBytesAccessed}, + {"memory_access_breakdown", kMemoryAccessBreakdown}, + {"source", kSourceInfo}, + {"model_name", kModelName}, + {"model_version", kModelVersion}, + {"bytes_transferred", kBytesTransferred}, + {"queue", kDmaQueue}, + {"dcn_collective_info", kDcnCollectiveInfo}, + // Performance counter related. + {"Raw Value", kRawValue}, + {"Scaled Value", kScaledValue}, + {"Thread Id", kThreadId}, + {"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent}, + // XLA metadata map related. + {"Hlo Proto", kHloProto}, + {"EdgeTPU Model information", kEdgeTpuModelInfo}, + {"EdgeTPU Model Profile information", kEdgeTpuModelProfileInfo}, + {"EdgeTPU MLIR", kEdgeTpuMlir}, + // Device capability related. + {"clock_rate", kDevCapClockRateKHz}, + {"core_count", kDevCapCoreCount}, + {"memory_bandwidth", kDevCapMemoryBandwidth}, + {"memory_size", kDevCapMemorySize}, + {"compute_cap_major", kDevCapComputeCapMajor}, + {"compute_cap_minor", kDevCapComputeCapMinor}, + {"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond}, + {"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond}, + {"peak_sram_rd_bw_gigabytes_per_second", + kDevCapPeakSramRdBwGigabytesPerSecond}, + {"peak_sram_wr_bw_gigabytes_per_second", + kDevCapPeakSramWrBwGigabytesPerSecond}, + {"device_vendor", kDevVendor}, + // Batching related. + {"batch_size_after_padding", kBatchSizeAfterPadding}, + {"padding_amount", kPaddingAmount}, + {"batching_input_task_size", kBatchingInputTaskSize}, + // GPU related metrics. + {"theoretical_occupancy_pct", kTheoreticalOccupancyPct}, + {"occupancy_min_grid_size", kOccupancyMinGridSize}, + {"occupancy_suggested_block_size", kOccupancySuggestedBlockSize}, + // Aggregated Stat + {"self_duration_ps", kSelfDurationPs}, + {"min_duration_ps", kMinDurationPs}, + {"total_profile_duration_ps", kTotalProfileDurationPs}, + {"max_iteration_num", kMaxIterationNum}, + {"device_type", kDeviceType}, + {"uses_megacore", kUsesMegaCore}, + {"symbol_id", kSymbolId}, + {"hlo_category", kHloCategory}, + {"tf_op_name", kTfOpName}, + {"dma_stall_duration_ps", kDmaStallDurationPs}, + {"key", kKey}, + {"payload_size_bytes", kPayloadSizeBytes}, + {"duration_us", kDuration}, + {"buffer_size", kBufferSize}, + {"transfers", kTransfers}, + // Dcn message Stats + {"dcn_label", kDcnLabel}, + {"dcn_source_slice_id", kDcnSourceSliceId}, + {"dcn_source_per_slice_device_id", kDcnSourcePerSliceDeviceId}, + {"dcn_destination_slice_id", kDcnDestinationSliceId}, + {"dcn_destination_per_slice_device_id", kDcnDestinationPerSliceDeviceId}, + {"dcn_chunk", kDcnChunk}, + {"dcn_loop_index", kDcnLoopIndex}, + {"dropped_traces", kDroppedTraces}, + {"cuda_graph_id", kCudaGraphId}, + {"cuda_graph_exec_id", kCudaGraphExecId}, + {"cuda_graph_orig_id", kCudaGraphOrigId}, + {"step_idle_time_ps", kStepIdleTimePs}, + {"gpu_device_name", kGpuDeviceName}, + {"source_stack", kSourceStack}}); DCHECK_EQ(stat_type_map->size(), kNumStatTypes); return *stat_type_map; } diff --git a/xla/tsl/profiler/utils/xplane_schema.h b/xla/tsl/profiler/utils/xplane_schema.h index d61b59b746414..c0a6c969dfffd 100644 --- a/xla/tsl/profiler/utils/xplane_schema.h +++ b/xla/tsl/profiler/utils/xplane_schema.h @@ -332,7 +332,8 @@ enum StatType { kCudaGraphOrigId, kStepIdleTimePs, kGpuDeviceName, - kLastStatType = kGpuDeviceName, + kSourceStack, + kLastStatType = kSourceStack, }; enum MegaScaleStatType : uint8_t {