From 6e19c4be57ba1ce1b14d7d8ec78d89d85c93987b Mon Sep 17 00:00:00 2001 From: Yin Zhang Date: Mon, 25 Nov 2024 18:06:51 -0800 Subject: [PATCH] Create Roofline Model tool in Tensorboard Plugin Profiler PiperOrigin-RevId: 700161829 --- frontend/app/common/constants/BUILD | 3 + .../constants/roofline_model_constants.ts | 2 +- frontend/app/components/main_page/BUILD | 1 + .../components/main_page/main_page_module.ts | 5 + frontend/app/components/roofline_model/BUILD | 40 + .../operation_level_analysis.ts | 2 +- .../roofline_model/roofline_model.ng.html | 56 + .../roofline_model/roofline_model.scss | 39 + .../roofline_model/roofline_model.ts | 1102 +++++++++++++++++ .../roofline_model/roofline_model_module.ts | 25 + .../tensorboard_plugin_profile/convert/BUILD | 10 + .../convert/raw_to_tool_data.py | 5 + .../convert/roofline_model_proto_to_gviz.py | 392 ++++++ .../tpu/tensorflow/tpu_tf2_keras_test.py | 3 +- .../profile_plugin.py | 1 + .../tensorboard_plugin_profile/protobuf/BUILD | 2 + .../protobuf/roofline_model.proto | 196 +++ 17 files changed, 1881 insertions(+), 3 deletions(-) create mode 100644 frontend/app/components/roofline_model/BUILD create mode 100644 frontend/app/components/roofline_model/roofline_model.ng.html create mode 100644 frontend/app/components/roofline_model/roofline_model.scss create mode 100644 frontend/app/components/roofline_model/roofline_model.ts create mode 100644 frontend/app/components/roofline_model/roofline_model_module.ts create mode 100644 plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py create mode 100644 plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto diff --git a/frontend/app/common/constants/BUILD b/frontend/app/common/constants/BUILD index 95afc064..47e1e03c 100644 --- a/frontend/app/common/constants/BUILD +++ b/frontend/app/common/constants/BUILD @@ -15,6 +15,9 @@ ts_library( srcs = [ "roofline_model_constants.ts", ], + deps = [ + "@npm//@types/google.visualization", + ], ) ts_library( diff 
--git a/frontend/app/common/constants/roofline_model_constants.ts b/frontend/app/common/constants/roofline_model_constants.ts index 3eb75f4c..4ea3557c 100644 --- a/frontend/app/common/constants/roofline_model_constants.ts +++ b/frontend/app/common/constants/roofline_model_constants.ts @@ -53,7 +53,7 @@ export const SCATTER_CHART_OPTIONS = { // Ticks have to be explicitly defined for scaling axis evenly. ticks: [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000], }, - legend: {position: 'right'}, + legend: {position: 'right' as google.visualization.ChartLegendPosition}, tooltip: {isHtml: true}, // Be mindful that series is not specified here, otherwise the shallow copy in // components of this object could result in overwritten series styles diff --git a/frontend/app/components/main_page/BUILD b/frontend/app/components/main_page/BUILD index 2763afba..38de3ba0 100644 --- a/frontend/app/components/main_page/BUILD +++ b/frontend/app/components/main_page/BUILD @@ -38,6 +38,7 @@ xprof_ng_module( "@org_xprof//frontend/app/components/op_profile", "@org_xprof//frontend/app/components/overview_page", "@org_xprof//frontend/app/components/pod_viewer", + "@org_xprof//frontend/app/components/roofline_model", "@org_xprof//frontend/app/components/sidenav", "@org_xprof//frontend/app/components/tf_data_bottleneck_analysis", "@org_xprof//frontend/app/components/trace_viewer", diff --git a/frontend/app/components/main_page/main_page_module.ts b/frontend/app/components/main_page/main_page_module.ts index d1d20eb5..9ccc094a 100644 --- a/frontend/app/components/main_page/main_page_module.ts +++ b/frontend/app/components/main_page/main_page_module.ts @@ -29,6 +29,8 @@ import {OverviewPage} from 'org_xprof/frontend/app/components/overview_page/over import {OverviewPageModule} from 'org_xprof/frontend/app/components/overview_page/overview_page_module'; import {PodViewer} from 'org_xprof/frontend/app/components/pod_viewer/pod_viewer'; import {PodViewerModule} from 
'org_xprof/frontend/app/components/pod_viewer/pod_viewer_module'; +import {RooflineModel} from 'org_xprof/frontend/app/components/roofline_model/roofline_model'; +import {RooflineModelModule} from 'org_xprof/frontend/app/components/roofline_model/roofline_model_module'; import {SideNavModule} from 'org_xprof/frontend/app/components/sidenav/sidenav_module'; import {TfDataBottleneckAnalysis} from 'org_xprof/frontend/app/components/tf_data_bottleneck_analysis/tf_data_bottleneck_analysis'; import {TfDataBottleneckAnalysisModule} from 'org_xprof/frontend/app/components/tf_data_bottleneck_analysis/tf_data_bottleneck_analysis_module'; @@ -69,6 +71,8 @@ export const routes: Routes = [ {path: 'inference_profile', component: InferenceProfile}, {path: 'hlo_stats', component: HloStats}, {path: 'hlo_stats^', component: HloStats}, + {path: 'roofline_model', component: RooflineModel}, + {path: 'roofline_model^', component: RooflineModel}, {path: '**', component: EmptyPage}, ]; @@ -96,6 +100,7 @@ export const routes: Routes = [ FrameworkOpStatsAdapterModule, DcnCollectiveStatsModule, HloStatsModule, + RooflineModelModule, InferenceProfileModule, RouterModule.forRoot(routes), ], diff --git a/frontend/app/components/roofline_model/BUILD b/frontend/app/components/roofline_model/BUILD new file mode 100644 index 00000000..1e8f0f90 --- /dev/null +++ b/frontend/app/components/roofline_model/BUILD @@ -0,0 +1,40 @@ +load("@io_bazel_rules_sass//:defs.bzl", "sass_binary") +load("//defs:defs.bzl", "xprof_ng_module") + +package(default_visibility = ["//frontend:internal"]) + +xprof_ng_module( + name = "roofline_model", + srcs = [ + "roofline_model.ts", + "roofline_model_module.ts", + ], + assets = [ + ":roofline_model_css", + "roofline_model.ng.html", + ], + deps = [ + "@npm//@angular/core", + "@npm//@angular/router", + "@npm//@ngrx/store", + "@npm//@types/google.visualization", + "@npm//rxjs", + "@org_xprof//frontend/app/common/constants:roofline_model_constants", + 
"@org_xprof//frontend/app/common/interfaces", + "@org_xprof//frontend/app/common/utils", + "@org_xprof//frontend/app/components/chart/table", + "@org_xprof//frontend/app/components/controls/category_filter", + "@org_xprof//frontend/app/components/controls/string_filter", + "@org_xprof//frontend/app/components/roofline_model/operation_level_analysis", + "@org_xprof//frontend/app/components/roofline_model/program_level_analysis", + "@org_xprof//frontend/app/services/data_service", + "@org_xprof//frontend/app/store", + ], +) + +sass_binary( + name = "roofline_model_css", + src = "roofline_model.scss", + # stack = True, + sourcemap = False, +) diff --git a/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts b/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts index 62bdc505..7d1ecd48 100644 --- a/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts +++ b/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts @@ -64,7 +64,7 @@ export class OperationLevelAnalysis title: 'Percentage of self time per HLO op category', colors: PIE_CHART_PALETTE, sliceVisibilityThreshold: 0.01, - }, + } as google.visualization.PieChartOptions, }; dataInfoRooflineScatterChart: ChartDataInfo = { data: null, diff --git a/frontend/app/components/roofline_model/roofline_model.ng.html b/frontend/app/components/roofline_model/roofline_model.ng.html new file mode 100644 index 00000000..dabd72b8 --- /dev/null +++ b/frontend/app/components/roofline_model/roofline_model.ng.html @@ -0,0 +1,56 @@ +
+
+
+

Device Information

+
+
+
+
+
+
{{info.label}}: {{info.value}} {{info.unit}} {{info.context}}
+
+
+
+ +
+
+

Section1: Program-Level Analysis

+
+
+
+ Note:
+ (1) This section provides program-level analysis.
+ (2) A tooltip with extra information will show up if you mouse over a point in the roofline chart.
+ (3) You can choose whether to include infeed and outfeed ops for the analysis.
+ (4) "Total" aggregates all operations throughout the entire profiling session. It includes incomplete steps.
+ (5) "Total (HW)" is based on hardware performance counters, while the others are based on XLA's cost analysis. + It is always calculated including infeed and outfeed ops regardless of the option. + The gap between "Total" and "Total (HW)" is due to hardware limitations (e.g., padding).
+ (6) "Average" shows the average step information by aggregating the operations in the complete steps only. +
+
+
+ +
+
+ +
+
+

Section2: Operation-Level Analysis

+
+
+
+ Note:
+ (1) This section provides operation-level analysis.
+ (2) A tooltip with extra information will show up if you mouse over a point in the roofline chart.
+ (3) To avoid sluggishness, only the 1000 most time-consuming operations are shown.
+ (4) You can choose whether to include infeed and outfeed ops for the analysis.
+ (5) You can filter data by HLO category, bottleneck resource or HLO name.
+ (6) "IDLE" represents the portion of the total execution time during which the device is idle.
+ (7) Ops with zero FLOP (e.g., data formatting ops like reshape, IDLE, etc.) do not show up in the roofline chart.
+
+
+
+ +
+
diff --git a/frontend/app/components/roofline_model/roofline_model.scss b/frontend/app/components/roofline_model/roofline_model.scss new file mode 100644 index 00000000..da57955c --- /dev/null +++ b/frontend/app/components/roofline_model/roofline_model.scss @@ -0,0 +1,39 @@ +.section-container { + margin: 20px 20px 0px; +} + +.block-content { + padding: 5px; +} + +.row { + display: flex; +} + +.flex-space { + flex: 1; +} + +.description { + font-size: 14px; +} + +.tableHeaderCell { + word-wrap: break-word; + background-color: azure; //!to hide the scrolled-up text. +} + +.tableTableCell { + word-break: break-all; +} + +.opColumnClass { + max-height: 200px; + overflow-y: auto; +} + +.errorMessage { + border: 2px solid; + background-color: #ffcccb; + color: red; +} diff --git a/frontend/app/components/roofline_model/roofline_model.ts b/frontend/app/components/roofline_model/roofline_model.ts new file mode 100644 index 00000000..00450b67 --- /dev/null +++ b/frontend/app/components/roofline_model/roofline_model.ts @@ -0,0 +1,1102 @@ +import {Component, OnDestroy} from '@angular/core'; +import {ActivatedRoute} from '@angular/router'; +import {Store} from '@ngrx/store'; +import {DEVICE_INFO, NUMERIC_DATA_FORMAT, PIE_CHART_PALETTE, ROOFLINE_STYLES, SCATTER_CHART_AXIS, SCATTER_CHART_OPTIONS,} from 'org_xprof/frontend/app/common/constants/roofline_model_constants'; +import {NavigationEvent} from 'org_xprof/frontend/app/common/interfaces/navigation_event'; +import {RooflineModelData} from 'org_xprof/frontend/app/common/interfaces/roofline_model'; +import {setLoadingState} from 'org_xprof/frontend/app/common/utils/utils'; +import {DataService} from 'org_xprof/frontend/app/services/data_service/data_service'; +import {setCurrentToolStateAction} from 'org_xprof/frontend/app/store/actions'; +import {ReplaySubject} from 'rxjs'; +import {takeUntil} from 'rxjs/operators'; + +interface DeviceInfoData { + id: string; + label: string; + type?: string; + value?: string | number; + 
unit?: string; + context?: string; + display?: boolean; +} +declare interface DeviceIndicators { + hasMergedVmem: boolean; + hasCmem: boolean; + hasMegacore: boolean; + isGpu: boolean; +} +type ColumnIdxArr = Array; + +interface TooltipRow { + id: string; + label: string; + operation?: (val: string | number) => string; +} + +const NVIDIA_GPU_TYPE_PREFIX = 'Nvidia GPU'; + +/** A roofline model component. */ +@Component({ + standalone: false, + selector: 'roofline-model', + templateUrl: './roofline_model.ng.html', + styleUrls: ['./roofline_model.scss'], +}) +export class RooflineModel implements OnDestroy { + readonly tool = 'roofline_model'; + + /** Handles on-destroy Subject, used to unsubscribe. */ + private readonly destroyed = new ReplaySubject(1); + + currentRun = ''; + // Device Information section data + deviceInfoArray: DeviceInfoData[] = []; + // Some critical indicators + deviceIndicators: DeviceIndicators = { + hasMergedVmem: false, + hasCmem: false, + hasMegacore: false, + isGpu: false, + }; + + // dataTableRaw from the raw roofline model data + // DataTable data format makes a lot data manipulation easier + dataTableRaw: google.visualization.DataTable | null = null; + + /** Program level section variables */ + // DataTable data for underlying table chart filtered on category for program + dataTableProgram: google.visualization.DataTable | null = null; + // visible columns for the table chart view, if empty all columns are shown + columnsIdxProgram: ColumnIdxArr = []; + // preprocessed data for underlying roofline scatter chart + scatterDataProgram: google.visualization.DataTable | null = null; + readonly scatterChartOptionsProgram: + google.visualization.ScatterChartOptions = { + ...SCATTER_CHART_OPTIONS, + series: [], + }; + readonly programLevelAgg = ['Total', 'Total (HW)', 'Average', 'Step']; + + /** Operation level section variables */ + dataTableOp?: google.visualization.DataTable | null = null; + columnsIdxOp: ColumnIdxArr = []; + scatterDataOp?: 
google.visualization.DataTable | null = null; + readonly scatterChartOptionsOp: google.visualization.ScatterChartOptions = { + ...SCATTER_CHART_OPTIONS, + series: [], + }; + // Prepopulated op name from url + selectedOpName = ''; + + constructor( + route: ActivatedRoute, + private readonly dataService: DataService, + private readonly store: Store<{}>, + ) { + route.params.pipe(takeUntil(this.destroyed)).subscribe((params) => { + this.update(params as NavigationEvent); + }); + this.store.dispatch(setCurrentToolStateAction({currentTool: this.tool})); + } + + parseUrlParams() { + this.selectedOpName = + this.dataService.searchParams?.get('roofline_op_name') || ''; + } + + update(event: NavigationEvent) { + setLoadingState(true, this.store, 'Loading roofline model data'); + + // get tool data + this.currentRun = event.run || ''; + const tag = event.tag || 'roofline_model'; + const host = event.host || ''; + this.dataService.getData(this.currentRun, tag, host) + .pipe(takeUntil(this.destroyed)) + .subscribe((data) => { + setLoadingState(false, this.store); + this.parseData(data as RooflineModelData[]); + this.parseUrlParams(); + }); + } + + parseData(data?: RooflineModelData[]) { + if (data === null || !Array.isArray(data) || data.length < 1) { + return; + } + this.dataTableRaw = new google.visualization.DataTable(data[0]); + + this.parseDeviceInfoData(this.dataTableRaw); + this.parseBaseOpAndProgramTableData(); + + // process section 1 data + this.setColumnsIdxProgram(); + this.processScatterDataProgram(); + + // process section 2 data + this.setColumnsIdxOp(); + this.processScatterDataOp(); + } + + /** parse the device information from the original dataset */ + parseDeviceInfoData(dataTableRaw: google.visualization.DataTable) { + this.deviceIndicators = { + hasMergedVmem: !(dataTableRaw.getTableProperty('has_merged_vmem') === '0'), + hasCmem: !(dataTableRaw.getTableProperty('has_cmem') === '0'), + hasMegacore: !(dataTableRaw.getTableProperty('megacore') === '0'), + 
isGpu: dataTableRaw.getTableProperty('device_type') + .startsWith( + NVIDIA_GPU_TYPE_PREFIX, + ), + }; + + this.deviceInfoArray = DEVICE_INFO.reduce( + (acc: DeviceInfoData[], cur: DeviceInfoData) => { + // copy cur to avoid mutating the original object + // when switch between GPU and TPU runs + const curInfo = {...cur}; + // deal with category of specific context + if (this.deviceIndicators.isGpu) { + if (cur.id === 'peak_flop_rate') { + curInfo.label = 'Peak FLOP Rate per GPU'; + } else if (cur.id === 'peak_hbm_bw') { + curInfo.label = 'Peak HBM Bandwidth per GPU'; + } else if (cur.id.startsWith('peak_cmem')) { + curInfo.display = false; + } else if (cur.id === 'megacore') { + curInfo.display = false; + } else if (cur.id === 'peak_vmem_read_bw') { + // TODO(b/374835204): Better refactor proto for GPU roofline + // model and refine related code. including ids like this + // peak_vmem_read_bw, and peak_vmem_write_bw, megacore, etc. + curInfo.label = 'Peak L2 cache Bandwidth per GPU'; + curInfo.display = false; + } else if (cur.id === 'peak_vmem_write_bw') { + curInfo.label = 'Peak Shared Memory / L1 Cache Bandwidth per GPU'; + } + } else { + if (cur.id.startsWith('peak_vmem')) { + if (!this.deviceIndicators.hasMergedVmem) { + curInfo.display = false; + } + } else if (cur.id.startsWith('peak_cmem')) { + if (!this.deviceIndicators.hasCmem) { + curInfo.display = false; + } + } else if (cur.id === 'megacore') { + curInfo.context += + '(if yes, the analysis assumes Megacore where an HLO runs on both TensorCores utilizing the full chip\'s resources so that the rooflines are twice higher)'; + curInfo.value = this.deviceIndicators.hasMegacore ? 'Yes' : 'No'; + } + } + const value = this.dataTableRaw!.getTableProperty(cur.id); + acc.push({ + // convert numeric value to numbers, as some ridge numbers will be + // used as axis values in chart + value: cur.type === 'number' ? 
Number(value) : value, + // put cur at last to overwrite with preprocessed data + ...curInfo, + }); + return acc; + }, + [] as DeviceInfoData[], + ); + } + + /** Filter and get DataTable data for op and program secions */ + parseBaseOpAndProgramTableData() { + if (!this.dataTableRaw) { + return; + } + const gViewProgram = new google.visualization.DataView(this.dataTableRaw); + gViewProgram.setRows( + this.dataTableRaw.getFilteredRows([ + { + column: this.dataTableRaw.getColumnIndex('category'), + value: 'Program', + }, + ]), + ); + this.dataTableProgram = gViewProgram.toDataTable(); + this.formatTableData(this.dataTableProgram); + + const gViewOp = new google.visualization.DataView(this.dataTableRaw); + gViewOp.setRows( + this.dataTableRaw.getFilteredRows([ + {column: this.dataTableRaw.getColumnIndex('step'), value: 'Total'}, + ]), + ); + // TODO(b/359276801) Enable injecting Graph Viewer crosslink after + // dispatching host list to global store, so we can infer module name from + // program_id given the module list (aka host list in graph viewer) + this.dataTableOp = gViewOp.toDataTable(); + this.formatTableData(this.dataTableOp); + } + + /** Get the index array of columns that is visible on the table view */ + getColumnIdx(baseColumnsIds: string[]) { + const cmemColumnsIds = this.deviceIndicators.hasCmem + ? 
['measured_memory_bw', 'cmem_read_bw', 'cmem_write_bw'] + : []; + const coreColumnsIds = [ + 'roofline_efficiency', + 'compute_efficiency', + 'max_mem_bw_utilization', + ]; + const columnsIds = [ + ...baseColumnsIds, + ...cmemColumnsIds, + ...coreColumnsIds, + ]; + + const getColumnIdxes = (columnIds: string[]) => { + return columnIds.reduce((acc: ColumnIdxArr, cur: string) => { + acc.push(this.dataTableRaw!.getColumnIndex(cur)); + return acc; + }, [] as ColumnIdxArr); + }; + return getColumnIdxes(columnsIds); + } + + setColumnsIdxProgram() { + const baseColumnsIds = [ + 'step', + 'total_time_per_core', + 'measured_flop_rate', + 'bound_by', + 'hbm_bw', + ]; + this.columnsIdxProgram = this.getColumnIdx(baseColumnsIds); + } + + setColumnsIdxOp() { + const baseColumnIds = [ + 'step', + 'rank', + 'hlo_module_id', + 'category', + 'operation', + 'occurrences', + 'total_time', + 'measured_flop_rate', + 'model_flop_rate', + 'bound_by', + 'hbm_bw', + ]; + this.columnsIdxOp = this.getColumnIdx(baseColumnIds); + } + + formatTableData(data: google.visualization.DataTable | null) { + if (!data) return; + let dataFormatter = null; + for ( + let columnIdx = 0; + columnIdx < data.getNumberOfColumns(); + ++columnIdx + ) { + const id = data.getColumnId(columnIdx); + const formattedColumnIds = Object.keys(NUMERIC_DATA_FORMAT); + if (!formattedColumnIds.includes(id)) { + continue; + } + switch (NUMERIC_DATA_FORMAT[id].type) { + case 'decimal': + dataFormatter = new google.visualization.NumberFormat({ + fractionDigits: NUMERIC_DATA_FORMAT[id].digit, + }); + dataFormatter.format(data, columnIdx); + break; + case 'percent': + const pattern = `##.${'#'.repeat( + NUMERIC_DATA_FORMAT[id].digit || 2, + )}%`; + dataFormatter = new google.visualization.NumberFormat({pattern}); + dataFormatter.format(data, columnIdx); + break; + default: + console.log(`Cannot identify format config for column ${id}`); + } + } + } + + /** Helper function to get operation name from op graph viewer link + * eg: 
op_name + */ + getOpName(opGraphLinkStr: string) { + const regex = '(.*?)'; + const match = opGraphLinkStr.match(regex); + const opName = match?.[1] || ''; + return this.truncateOperationName(opName); + } + + /** Helper function to truncate operation name for up to 30 chars */ + truncateOperationName(operationName: string) { + if (operationName.length > 30) { + return operationName.substring(0, 30) + '...'; + } else { + return operationName; + } + } + + /** + * Helper function to add columns to the scatter plot data + * General for program and operation levels + * # columns = (1 y value + 1 tooltip) * #series + 1 X axis value + */ + addScatterDataColumns( + seriesNames: string[], + scatterData: google.visualization.DataTable, + ) { + // add columns: x axis, series data + corresponding tooltip + scatterData.addColumn('number', 'Bottleneck Operational Intensity'); + // create 1 value + 1 tooltip column for each series + seriesNames.forEach((s: string) => { + scatterData.addColumn('number', s); + scatterData.addColumn({ + type: 'string', + role: 'tooltip', + 'p': {'html': true}, + }); + }); + } + + /** + * Helper function to construct data rows for the scatter chart + * scatter chart includes the rooflines and other clustered points + */ + makeScatterRow( + numColumns: number, + xIndex: number, + yIndex: number, + xVal: number, + yVal: number, + tooltip: string, + ) { + const newRow = Array.from({ + length: numColumns, + }).fill(null); + newRow[xIndex] = xVal; + newRow[yIndex] = yVal; + newRow[yIndex + 1] = tooltip; + return newRow; + } + + /** Helper function to add a data row for the scatter chart */ + addSeriesRow( + sourceDataTable: google.visualization.DataTable, + scatterDataTable: google.visualization.DataTable, + rowIndex: number, + columnIndex: number, + ) { + if (rowIndex < 0 || columnIndex < 0) { + return; + } + const numScatterDataColumns = scatterDataTable.getNumberOfColumns(); + const xValue = sourceDataTable.getValue( + rowIndex, + 
sourceDataTable.getColumnIndex('bottleneck_operational_intensity'), + ); + const yValue = sourceDataTable.getValue( + rowIndex, + sourceDataTable.getColumnIndex('measured_flop_rate'), + ); + // xValue is always assigned to the first column + // yValue is assigned to the given Step agg level column (columnIdx) + scatterDataTable.addRow( + this.makeScatterRow( + numScatterDataColumns, + 0, + columnIndex, + xValue, + yValue, + this.makeTooltip(sourceDataTable, rowIndex), + ), + ); + } + + /** Helper function to add data rows for a single roofline */ + addRoofline( + rooflineName: string, + seriesIndex: number, + peakFlopRate: number, + peakMemoryBw: number, + ridgePoint: number, + scatterData: google.visualization.DataTable, + ) { + if (seriesIndex < 0) { + return; + } + const numColumns = scatterData.getNumberOfColumns(); + // Roofline before the ridge point. + scatterData.addRow( + this.makeScatterRow( + numColumns, + 0, + seriesIndex, + SCATTER_CHART_AXIS.minX, + SCATTER_CHART_AXIS.minX * peakMemoryBw, + this.makeRooflineTooltip( + 'Roofline', + SCATTER_CHART_AXIS.minX, + SCATTER_CHART_AXIS.minX * peakMemoryBw, + ), + ), + ); + // Ridge point. + scatterData.addRow( + this.makeScatterRow( + numColumns, + 0, + seriesIndex, + ridgePoint, + peakFlopRate, + this.makeRooflineTooltip( + rooflineName + ' Ridge Point', + ridgePoint, + peakFlopRate, + ), + ), + ); + // Roofline after the ridge point. 
+ scatterData.addRow( + this.makeScatterRow( + numColumns, + 0, + seriesIndex, + SCATTER_CHART_AXIS.maxX, + peakFlopRate, + this.makeRooflineTooltip( + 'Roofline', + SCATTER_CHART_AXIS.maxX, + peakFlopRate, + ), + ), + ); + } + + /** Callback function when filterUpdated in child is triggered */ + updateDataTableOp(newFilters: google.visualization.DataTableCellFilter[]) { + this.processScatterDataOp(newFilters); + } + + /** Callback function when filterUpdated in child is triggered */ + updateDataTableProgram( + newFilters: google.visualization.DataTableCellFilter[], + ) { + this.processScatterDataProgram(newFilters); + } + + /** + * Parse dataset for program level roofline scatter chart + * With series of data, operation scatter plot = + * rooflines (line) plot + program level step cluster(scatter) plot + */ + processScatterDataProgram( + filters?: google.visualization.DataTableCellFilter[], + ) { + if (!this.dataTableProgram) { + return; + } + const filteredDataTableProgram = this.getFilteredDataTable( + this.dataTableProgram, + filters, + ); + // TODO: update the programSeries based on data received + const programSeries = this.getProgramSeries(); + // clear and recreate the scatter data + this.scatterDataProgram = new google.visualization.DataTable(); + this.addScatterDataColumns(programSeries, this.scatterDataProgram); + this.addRooflinesSeriesRows(this.scatterDataProgram); + this.addProgramSeriesRows(programSeries, filteredDataTableProgram); + this.updateProgramScatterStyles(programSeries.length); + } + + /** + * Parse dataset for operation level roofline scatter chart + * With series of data, operation scatter plot = + * rooflines (line) plot + op categoreis cluster(scatter) plot + */ + processScatterDataOp(filters?: google.visualization.DataTableCellFilter[]) { + if (!this.dataTableOp) { + return; + } + const filteredDataTableOp = this.getFilteredDataTable( + this.dataTableOp, + filters, + ); + + const opCategories = 
this.getOpCategories(filteredDataTableOp); + const opSeries = this.getOpSeries(opCategories); + + // clear the original scatter data + this.scatterDataOp = new google.visualization.DataTable(); + this.addScatterDataColumns(opSeries, this.scatterDataOp); + this.addRooflinesSeriesRows(this.scatterDataOp); + this.addOpSeriesRows(opSeries, filteredDataTableOp); + this.updateOpScatterStyles(opSeries.length); + } + + /** + * Helper function to get filtered DataTable given base op/proram DataTable, + * and feed to child component as source data for roofline scatter chart. + * Because scatter chart DataTable is in a different structure than the table + * chart DataTable. + * Filteres are passed from child filters. + */ + getFilteredDataTable( + dataTable: google.visualization.DataTable, + filters?: google.visualization.DataTableCellFilter[], + ) { + // apply filters if any, filters are emitted from child component + // because the scatter dataTable is restructured and cannot be applied in + // child directly + let filteredDataTable: google.visualization.DataTable | null = null; + if (filters && filters.length > 0) { + const filteredDataView = new google.visualization.DataView(dataTable); + filteredDataView.setRows(dataTable.getFilteredRows(filters)); + filteredDataTable = filteredDataView.toDataTable(); + } else { + filteredDataTable = dataTable; + } + return filteredDataTable; + } + + /** + * Helper function to get operation categories, with data filtered on + * "step == Total", the list is sorted by total_self_time in order to make the + * scatter chart style in consistent with the pie chart + */ + getOpCategories(filteredDataTableOp: google.visualization.DataTable) { + const sortedOpCategories: string[] = []; + + // sort the categories given frequency + const chartView = google.visualization.data.group( + filteredDataTableOp, + [filteredDataTableOp.getColumnIndex('category')], + [ + { + 'column': filteredDataTableOp.getColumnIndex('total_self_time'), + 'aggregation': 
google.visualization.data.sum, + 'type': 'number', + }, + ], + ); + // sort categories on sum of total_self_time + chartView.sort({column: 1, desc: true}); + for (let i = 0; i < chartView.getNumberOfRows(); ++i) { + const category = chartView.getValue(i, 0); + // Program will be appended separately + if (category !== 'Program') { + sortedOpCategories.push(category); + } + } + return sortedOpCategories; + } + + /** + * The roofline chart consists of a seris of data (roofline series + + * appregation series) + * This helper function gets the roofline base series + */ + getRooflineBaseSeries() { + let series: string[] = []; + if (this.deviceIndicators.isGpu) { + series = series.concat(['Shared Mem / L1 Roofline']); + } else { + if (this.deviceIndicators.hasMergedVmem) { + series = series.concat(['VMEM Read Roofline', 'VMEM Write Roofline']); + } else if (this.deviceIndicators.hasCmem) { + series = series.concat(['CMEM Read Roofline', 'CMEM Write Roofline']); + } + } + return [...series, 'HBM Roofline']; + } + + /** + * #series = #roofline(line) + 4 program level aggregation series + */ + getProgramSeries() { + let series: string[] = this.getRooflineBaseSeries(); + series = series.concat(this.programLevelAgg); + return series; + } + + /** + * #series = #roofline(line) + #operation level aggregation series + * (categories) + 2 'Program' datapoints + * The sereis list will decide the style of the scatter chart + */ + getOpSeries(opCategories: string[]) { + let series: string[] = this.getRooflineBaseSeries(); + // the first program is to make it's legend shows on top + // the second program is to show marker on top layer on the chart + series = series.concat(['Program', ...opCategories, 'Program']); + return series; + } + + /** + * Helper function to add data rows for roofline plot - vmem, cmem, hbm + * generalized function for both op & program + */ + addRooflinesSeriesRows(scatterData: google.visualization.DataTable) { + const rooflineInfo = this.deviceInfoArray.reduce( 
+ (acc, item) => { + acc[item.id] = Number(item.value || 0); + return acc; + }, + {} as {[key: string]: number}, + ); + let columnIndex = 1; + + if (!this.deviceIndicators.isGpu) { + const addRooflinePairs = (memType: 'cmem' | 'vmem') => { + ['read', 'write'].forEach((opType) => { + this.addRoofline( + `${memType.toUpperCase()} ${opType.charAt(0).toUpperCase() + opType.slice(1)}`, + columnIndex, + rooflineInfo['peak_flop_rate'], + rooflineInfo[`peak_${memType}_${opType}_bw`], + rooflineInfo[`${memType}_${opType}_ridge_point`], + scatterData, + ); + columnIndex += 2; // value col + tooltip col + }); + }; + if (this.deviceIndicators.hasMergedVmem) { + addRooflinePairs('vmem'); + } + if (this.deviceIndicators.hasCmem) { + addRooflinePairs('cmem'); + } + } else { + // Just use vmem_read for gpu SHM/L1 + this.addRoofline( + 'Shared Mem / L1', + columnIndex, + rooflineInfo['peak_flop_rate'], + rooflineInfo['peak_vmem_write_bw'], + rooflineInfo['vmem_write_ridge_point'], + scatterData, + ); + columnIndex += 2; // value col + tooltip col + } + + this.addRoofline( + 'HBM', + columnIndex, + rooflineInfo['peak_flop_rate'], + rooflineInfo['peak_hbm_bw'], + rooflineInfo['hbm_ridge_point'], + scatterData, + ); + } + + /** + * Poluplate program level scatter chart data rows with series using filtered + * operation DataTable data + */ + addProgramSeriesRows( + programSeries: string[], + filteredDataTableProgram: google.visualization.DataTable, + ) { + for ( + let rowIndex = 0; + rowIndex < filteredDataTableProgram.getNumberOfRows(); + ++rowIndex + ) { + let step = filteredDataTableProgram.getValue( + rowIndex, + filteredDataTableProgram.getColumnIndex('step'), + ); + // Assgin 'Step' as value if the step field is numeric string + if (!this.programLevelAgg.includes(step)) { + step = 'Step'; + } + const columnIndex = 1 + 2 * programSeries.lastIndexOf(step); + this.addSeriesRow( + filteredDataTableProgram, + this.scatterDataProgram!, + rowIndex, + columnIndex, + ); + } + } + + /** + 
* Poluplate operation level scatter chart data rows with series using + * filtered operation DataTable data + */ + addOpSeriesRows( + opSeries: string[], + filteredDataTableOp: google.visualization.DataTable, + ) { + for ( + let rowIndex = 0; + rowIndex < filteredDataTableOp.getNumberOfRows(); + ++rowIndex + ) { + const category = filteredDataTableOp.getValue( + rowIndex, + filteredDataTableOp.getColumnIndex('category'), + ); + const columnIndex = 1 + 2 * opSeries.lastIndexOf(category); + if ( + columnIndex > 0 && + filteredDataTableOp.getValue( + rowIndex, + filteredDataTableOp.getColumnIndex('bound_by'), + ) !== 'Unknown' + ) { + this.addSeriesRow( + filteredDataTableOp, + this.scatterDataOp!, + rowIndex, + columnIndex, + ); + } + } + } + + /** Make tooltip for rooflines series in the scatter chart */ + makeRooflineTooltip( + rooflineName: string, + operationIntensity: number, + flopRate: number, + ) { + return ( + '
' + + '' + + rooflineName + + '
' + + 'Operational Intensity (FLOP/Byte): ' + + operationIntensity.toLocaleString(undefined, {maximumFractionDigits: 2}) + + '
' + + 'Flop Rate (GFLOP/s): ' + + flopRate.toLocaleString(undefined, {maximumFractionDigits: 2}) + + '
' + + '
' + ); + } + + /** Make tooltip for the clustered series (points) in the scatter chart */ + makeTooltip(dataTable: google.visualization.DataTable, rowIndex: number) { + // Prepare column index to make easier access + const columns: {[columnKey: string]: number} = {}; + for (let i = 0; i < dataTable.getNumberOfColumns(); i++) { + columns[dataTable.getColumnId(i)] = i; + } + // TODO(jihochoi): fix the utilization numbers for TPU V4. + // ' - Percent relative to optimal: ' + // + (100 * dataTable.getValue(rowIndex, + // columns.roofline_efficiency)).toLocaleString(undefined, {maximumFractionDigits:2}) + // + '%
' + + // ' - Percent relative to HW limit: ' + // + (100 * dataTable.getValue(rowIndex, + // columns.compute_efficiency)).toLocaleString(undefined, {maximumFractionDigits:2}) + // + '%
' + + // ' - Percent relative to HW limit: ' + // + (100 * dataTable.getValue(rowIndex, + // columns.hbm_bw_utilization)).toLocaleString(undefined, {maximumFractionDigits:2}) + // + '%
' + + const tooltipRows: TooltipRow[] = [ + { + id: 'step', + label: 'Step', + }, + { + id: 'rank', + label: 'Rank', + }, + { + id: 'hlo_module_id', + label: 'Program ID', + }, + { + id: 'category', + label: 'Category', + }, + { + id: 'operation', + label: 'Operation', + operation: (val) => this.getOpName(val as string), + }, + { + id: 'occurrences', + label: '# of Occurrences', + }, + { + id: 'total_time_per_core', + label: 'Total Time per core (us)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 2}), + }, + { + id: 'total_time_in_percentage', + label: 'Total Time / Program', + operation: (val) => `${Number((100 * Number(val)).toFixed(2))}%`, // round after scaling to avoid float noise + }, + { + id: 'measured_flop_rate', + label: 'Normalized FLOP Rate (GFLOP/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'model_flop_rate', + label: 'Model FLOP Rate (GFLOP/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'hbm_bw', + label: 'HBM BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'cmem_read_bw', + label: 'CMEM Read BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'cmem_write_bw', + label: 'CMEM Write BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'vmem_read_bw', + label: 'VMEM Read BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'vmem_write_bw', + label: 'VMEM Write BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'operational_intensity', + label: 'Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'hbm_operational_intensity', + label: 'HBM Operational Intensity (FLOP/Byte)', + 
operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'cmem_read_operational_intensity', + label: 'CMEM Read Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'cmem_write_operational_intensity', + label: 'CMEM Write Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'vmem_read_operational_intensity', + label: 'VMEM Read Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'vmem_write_operational_intensity', + label: 'VMEM Write Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'bottleneck_operational_intensity', + label: 'Bottleneck Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + {id: 'bound_by', label: 'Bound By'}, // id must match the DataTable column id + ]; + const gpuTooltipRows: TooltipRow[] = [ + { + id: 'step', + label: 'Step', + }, + { + id: 'rank', + label: 'Rank', + }, + { + id: 'hlo_module_id', + label: 'Program ID', + }, + { + id: 'category', + label: 'Category', + }, + { + id: 'operation', + label: 'Operation', + operation: (val) => this.getOpName(val as string), + }, + { + id: 'occurrences', + label: '# of Occurrences', + }, + { + id: 'total_time_per_core', + label: 'Total Time per gpu (us)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 2}), + }, + { + id: 'total_time_in_percentage', + label: 'Total Time / Program', + operation: (val) => `${Number((100 * Number(val)).toFixed(2))}%`, // round after scaling to avoid float noise + }, + { + id: 'measured_flop_rate', + label: 'Normalized FLOP Rate (GFLOP/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'model_flop_rate', + label: 'Model FLOP Rate (GFLOP/s)', + 
operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'hbm_bw', + label: 'HBM BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'vmem_write_bw', + label: 'Shm/L1 BW (GiB/s)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'operational_intensity', + label: 'Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'hbm_operational_intensity', + label: 'HBM Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + { + id: 'bottleneck_operational_intensity', + label: 'Bottleneck Operational Intensity (FLOP/Byte)', + operation: (val) => + val.toLocaleString(undefined, {maximumFractionDigits: 4}), + }, + {id: 'bound_by', label: 'Bound By'}, // id must match the DataTable column id + ]; + + const tooltipBodyHtml = ( + this.deviceIndicators.isGpu ? gpuTooltipRows : tooltipRows + ).reduce((acc: string, row: TooltipRow) => { + if (!columns.hasOwnProperty(row.id)) { + return acc; + } + const val: string | number = dataTable.getValue( + rowIndex, + columns[row.id], + ); + acc += `${row.label}: ${ + row.operation ? row.operation(val) : val + }
`; + return acc; + }, ''); + return `
${tooltipBodyHtml}
`; + } + + // TODO(yinzz) remove the style updating dependency on the series order + // make it a k-v based format + formatRooflineSeriesStyle( + seriesIndex: number, + chartOptions: google.visualization.ScatterChartOptions, + ) { + if (this.deviceIndicators.isGpu) { + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.write; + } else { + if (this.deviceIndicators.hasMergedVmem) { + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.read; + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.write; + } + if (this.deviceIndicators.hasCmem) { + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.read; + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.write; + } + } + chartOptions.series[seriesIndex++] = ROOFLINE_STYLES.hbm; + return seriesIndex; + } + + updateProgramScatterStyles(numSeries: number) { + let seriesIndex = 0; + seriesIndex = this.formatRooflineSeriesStyle( + seriesIndex, + this.scatterChartOptionsProgram, + ); + for (; seriesIndex < numSeries; ++seriesIndex) { + this.scatterChartOptionsProgram.series[seriesIndex] = {pointSize: 4}; + } + } + + // TODO(yinzz) remove the style updating dependency on the series order + updateOpScatterStyles(numSeries: number) { + let seriesIndex = 0; + seriesIndex = this.formatRooflineSeriesStyle( + seriesIndex, + this.scatterChartOptionsOp, + ); + // extra series style record for the Program legend + this.scatterChartOptionsOp.series[seriesIndex++] = { + pointSize: 20, + color: '#FF0000', + pointShape: 'star', + }; + // Other ops are colored in the same order as in the pie chart, cmem, vmem, + // hbm, program + const numSeriesBeforeOps = + 2 * (this.deviceIndicators.hasCmem ? 1 : 0) + + 2 * (this.deviceIndicators.hasMergedVmem ? 
1 : 0) + + 2; + for (; seriesIndex < numSeries - 1; ++seriesIndex) { + this.scatterChartOptionsOp.series[seriesIndex] = { + pointSize: 3, + // make sure the color of series matches the pie chart + color: + PIE_CHART_PALETTE[ + (seriesIndex - numSeriesBeforeOps) % PIE_CHART_PALETTE.length + ], + }; + } + // Real series for program which does not show in the legend. + // This is added at the end to make it plotted at the top and not buried by + // other op points. + this.scatterChartOptionsOp.series[numSeries - 1] = { + pointSize: 20, + color: '#FF0000', + pointShape: 'star', + visibleInLegend: false, + }; + } + + ngOnDestroy() { + setLoadingState(false, this.store); + this.destroyed.next(); + this.destroyed.complete(); + } +} diff --git a/frontend/app/components/roofline_model/roofline_model_module.ts b/frontend/app/components/roofline_model/roofline_model_module.ts new file mode 100644 index 00000000..6e475789 --- /dev/null +++ b/frontend/app/components/roofline_model/roofline_model_module.ts @@ -0,0 +1,25 @@ +import {CommonModule} from '@angular/common'; +import {NgModule} from '@angular/core'; +import {TableModule} from 'org_xprof/frontend/app/components/chart/table/table_module'; +import {CategoryFilterModule} from 'org_xprof/frontend/app/components/controls/category_filter/category_filter_module'; +import {StringFilterModule} from 'org_xprof/frontend/app/components/controls/string_filter/string_filter_module'; +import {OperationLevelAnalysisModule} from 'org_xprof/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis_module'; +import {ProgramLevelAnalysisModule} from 'org_xprof/frontend/app/components/roofline_model/program_level_analysis/program_level_analysis_module'; + +import {RooflineModel} from './roofline_model'; + +/** A roofline model module. 
*/ +@NgModule({ + declarations: [RooflineModel], + imports: [ + CommonModule, + TableModule, + CategoryFilterModule, + StringFilterModule, + ProgramLevelAnalysisModule, + OperationLevelAnalysisModule, + ], + exports: [RooflineModel], +}) +export class RooflineModelModule { +} diff --git a/plugin/tensorboard_plugin_profile/convert/BUILD b/plugin/tensorboard_plugin_profile/convert/BUILD index 73de40fc..537617e6 100644 --- a/plugin/tensorboard_plugin_profile/convert/BUILD +++ b/plugin/tensorboard_plugin_profile/convert/BUILD @@ -97,6 +97,15 @@ py_library( ], ) +py_library( + name = "roofline_model_proto_to_gviz", + srcs = ["roofline_model_proto_to_gviz.py"], + deps = [ + requirement("gviz_api"), + "@org_xprof//plugin/tensorboard_plugin_profile/protobuf:protos_all_py_pb2", + ], +) + py_test( name = "overview_page_proto_to_gviz_test", size = "small", @@ -251,6 +260,7 @@ py_library( ":input_pipeline_proto_to_gviz", ":kernel_stats_proto_to_gviz", ":overview_page_proto_to_gviz", + ":roofline_model_proto_to_gviz", ":tf_data_stats_proto_to_gviz", ":tf_stats_proto_to_gviz", ":trace_events_json", diff --git a/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py b/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py index 57c0f790..836c407e 100644 --- a/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py +++ b/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py @@ -32,6 +32,7 @@ from tensorboard_plugin_profile.convert import input_pipeline_proto_to_gviz from tensorboard_plugin_profile.convert import kernel_stats_proto_to_gviz from tensorboard_plugin_profile.convert import overview_page_proto_to_gviz +from tensorboard_plugin_profile.convert import roofline_model_proto_to_gviz from tensorboard_plugin_profile.convert import tf_data_stats_proto_to_gviz from tensorboard_plugin_profile.convert import tf_stats_proto_to_gviz from tensorboard_plugin_profile.convert import trace_events_json @@ -175,6 +176,10 @@ def xspace_to_tool_data( raw_data, 
success = xspace_wrapper_func(xspace_paths, tool) if success: data = hlo_stats_proto_to_gviz.to_json(raw_data) + elif tool == 'roofline_model': + raw_data, success = xspace_wrapper_func(xspace_paths, tool) + if success: + data = roofline_model_proto_to_gviz.to_json(raw_data) elif tool == 'graph_viewer': options = params.get('graph_viewer_options', {}) raw_data, success = xspace_wrapper_func(xspace_paths, tool, options) diff --git a/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py b/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py new file mode 100644 index 00000000..91b7d1c5 --- /dev/null +++ b/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py @@ -0,0 +1,392 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""For conversion of RooflineModel protos to GViz DataTables. 
+ +Usage: + gviz_data_tables = generate_roofline_model_table(roofline_model_db) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gviz_api + +from tensorboard_plugin_profile.protobuf import roofline_model_pb2 + + +def get_step_string(record_type, step_num): + match record_type: + case roofline_model_pb2.RecordType.INVALID_RECORD_TYPE: + return "Invalid" + case roofline_model_pb2.RecordType.ALL: + return "Total" + case roofline_model_pb2.RecordType.ALL_HW: + return "Total (HW)" + case roofline_model_pb2.RecordType.AVERAGE_STEP: + return "Average" + case roofline_model_pb2.RecordType.PER_STEP: + return step_num + + +def gibi_to_giga(gibibytes): + return gibibytes * ((1 << 30) / 1.0e9) + + +def ridge_point(peak_gigaflops_per_second, peak_gibibytes_per_second): + if peak_gibibytes_per_second == 0: + return 0.0 + return peak_gigaflops_per_second / gibi_to_giga(peak_gibibytes_per_second) + + +def get_roofline_model_table_args_for_gpu(roofline_model_db): + """Creates roofline model table args from a roofline model proto for gpu. + + Args: + roofline_model_db: A RooflineModelDatabase proto. + + Returns: + Returns table description(columns), data(rows) and custom properties. + """ + table_description = [ + ("step", "string", "Step"), + ("rank", "number", "Rank"), + ("category", "string", "Category"), + ("operation", "string", "Operation"), + ("occurrences", "number", "# Occurrences"), + ("total_time", "number", "Total Time (us)"), + ("avg_time", "number", "Avg. time (us)"), + ("total_self_time", "number", "Total self time (us)"), + ("avg_self_time", "number", "Avg. 
self time (us)"), + ("total_self_time_percent", "number", "Total self time (%)"), + ( + "cumulative_total_self_time_percent", + "number", + "Cumulative total self time (%)", + ), + ("measured_flop_rate", "number", "Normalized FLOP Rate (GFLOP/s)"), + ("model_flop_rate", "number", "Model FLOP Rate (GFLOP/s)"), + ("measured_memory_bw", "number", "Memory BW (GiB/s)"), + ("hbm_bw", "number", "HBM BW (GiB/s)"), + # For nvidia gpu, currently no vmem_read_bw field, and + # vmem_write_bw is used for SHM/L1. + ("vmem_write_bw", "number", "SHM/L1 BW (GiB/s)"), + ("operational_intensity", "number", "Operational Intensity (FLOP/Byte)"), + ( + "hbm_operational_intensity", + "number", + "HBM Operational Intensity (FLOP/Byte)", + ), + # For nvidia gpu, currently no vmem_read_operational_intensity field, and + # vmem_write_operational_intensity is used for SHM/L1. + ( + "vmem_write_operational_intensity", + "number", + "SHM/L1 Operational Intensity (FLOP/Byte)", + ), + ( + "bottleneck_operational_intensity", + "number", + "Bottleneck Operational Intensity (FLOP/Byte)", + ), + ("bound_by", "string", "Bound by"), + ("total_time_per_core", "number", "Total Time per core (us)"), + ("total_time_in_percentage", "number", "Total Time (%)"), + ("optimal_flop_rate", "number", "Optimal FLOP Rate (GFLOP/s)"), + ("roofline_efficiency", "number", "Roofline efficiency (%)"), + ("compute_efficiency", "number", "FLOP Rate / Peak (%)"), + ( + "max_mem_bw_utilization", + "number", + "Max memory (cmem or hbm) bandwidth utilization (%)", + ), + ("include_infeed_outfeed", "boolean", "Include Infeed/Outfeed"), + ("hlo_module_id", "string", "Program ID"), + ] + + data = [] + for record in roofline_model_db.roofline_model_record: + row = [ + get_step_string(record.record_type, record.step_num), + record.rank, + record.hlo_category, + record.hlo_name, + record.occurrences, + record.total_time_in_us, + record.avg_time_in_us, + record.total_self_time_in_us, + record.avg_self_time_in_us, + 
record.total_self_time_as_fraction, + record.cumulative_total_self_time_as_fraction, + record.measured_flop_rate, + record.model_flop_rate, + record.measured_memory_bw, + record.hbm_bw, + record.vmem_write_bw, + record.operational_intensity, + record.hbm_operational_intensity, + record.vmem_write_operational_intensity, + record.bottleneck_operational_intensity, + record.bound_by, + record.total_time_per_core_in_us, + record.total_time_in_percentage, + record.optimal_flop_rate, + record.roofline_efficiency, + record.flop_rate_relative_to_hw_limit, + record.memory_bw_relative_to_hw_limit, + record.include_infeed_outfeed, + record.hlo_module_id, + ] + data.append(row) + custom_properties = { + "device_type": roofline_model_db.device_type, + "has_cmem": roofline_model_db.has_cmem, + "has_merged_vmem": roofline_model_db.has_merged_vmem, + "peak_flop_rate": roofline_model_db.peak_flop_rate, + "peak_hbm_bw": roofline_model_db.peak_hbm_bw, + "peak_shml1_write_bw": roofline_model_db.peak_vmem_write_bw, + "hbm_ridge_point": ridge_point( + roofline_model_db.peak_flop_rate, roofline_model_db.peak_hbm_bw + ), + "shml1_write_ridge_point": ridge_point( + roofline_model_db.peak_flop_rate, roofline_model_db.peak_vmem_write_bw + ), + } + return (table_description, data, custom_properties) + + +def get_roofline_model_table_args(roofline_model_db): + """Creates roofline model table args from a roofline model proto. + + Args: + roofline_model_db: A RooflineModelDatabase proto. + + Returns: + Returns table description(columns), data(rows) and custom properties. + """ + + table_description = [ + ("step", "string", "Step"), + ("rank", "number", "Rank"), + ("category", "string", "Category"), + ("operation", "string", "Operation"), + ("occurrences", "number", "# Occurrences"), + ("total_time", "number", "Total Time (us)"), + ("avg_time", "number", "Avg. time (us)"), + ("total_self_time", "number", "Total self time (us)"), + ("avg_self_time", "number", "Avg. 
self time (us)"), + ("total_self_time_percent", "number", "Total self time (%)"), + ( + "cumulative_total_self_time_percent", + "number", + "Cumulative total self time (%)", + ), + ("dma_stall_percent", "number", "%time stalled by DMA"), + ("measured_flop_rate", "number", "Normalized FLOP Rate (GFLOP/s)"), + ("model_flop_rate", "number", "Model FLOP Rate (GFLOP/s)"), + ("measured_memory_bw", "number", "Memory BW (GiB/s)"), + ("hbm_bw", "number", "HBM BW (GiB/s)"), + ("cmem_read_bw", "number", "CMEM Read BW (GiB/s)"), + ("cmem_write_bw", "number", "CMEM Write BW (GiB/s)"), + ("vmem_read_bw", "number", "VMEM Read BW (GiB/s)"), + ("vmem_write_bw", "number", "VMEM Write BW (GiB/s)"), + ("operational_intensity", "number", "Operational Intensity (FLOP/Byte)"), + ( + "hbm_operational_intensity", + "number", + "HBM Operational Intensity (FLOP/Byte)", + ), + ( + "cmem_read_operational_intensity", + "number", + "CMEM Read Operational Intensity (FLOP/Byte)", + ), + ( + "cmem_write_operational_intensity", + "number", + "CMEM Write Operational Intensity (FLOP/Byte)", + ), + ( + "vmem_read_operational_intensity", + "number", + "VMEM Read Operational Intensity (FLOP/Byte)", + ), + ( + "vmem_write_operational_intensity", + "number", + "VMEM Write Operational Intensity (FLOP/Byte)", + ), + ( + "bottleneck_operational_intensity", + "number", + "Bottleneck Operational Intensity (FLOP/Byte)", + ), + ("bound_by", "string", "Bound by"), + ("total_time_per_core", "number", "Total Time per core (us)"), + ("total_time_in_percentage", "number", "Total Time (%)"), + ("optimal_flop_rate", "number", "Optimal FLOP Rate (GFLOP/s)"), + ("roofline_efficiency", "number", "Roofline efficiency (%)"), + ("compute_efficiency", "number", "FLOP Rate / Peak (%)"), + ( + "max_mem_bw_utilization", + "number", + "Max memory (cmem or hbm) bandwidth utilization (%)", + ), + ("include_infeed_outfeed", "boolean", "Include Infeed/Outfeed"), + ("hlo_module_id", "string", "Program ID"), + ] + + data = [] + for 
record in roofline_model_db.roofline_model_record: + row = [ + get_step_string(record.record_type, record.step_num), + record.rank, + record.hlo_category, + record.hlo_name, + record.occurrences, + record.total_time_in_us, + record.avg_time_in_us, + record.total_self_time_in_us, + record.avg_self_time_in_us, + record.total_self_time_as_fraction, + record.cumulative_total_self_time_as_fraction, + record.dma_stall_fraction, + record.measured_flop_rate, + record.model_flop_rate, + record.measured_memory_bw, + record.hbm_bw, + record.cmem_read_bw, + record.cmem_write_bw, + record.vmem_read_bw, + record.vmem_write_bw, + record.operational_intensity, + record.hbm_operational_intensity, + record.cmem_read_operational_intensity, + record.cmem_write_operational_intensity, + record.vmem_read_operational_intensity, + record.vmem_write_operational_intensity, + record.bottleneck_operational_intensity, + record.bound_by, + record.total_time_per_core_in_us, + record.total_time_in_percentage, + record.optimal_flop_rate, + record.roofline_efficiency, + record.flop_rate_relative_to_hw_limit, + record.memory_bw_relative_to_hw_limit, + record.include_infeed_outfeed, + record.hlo_module_id, + ] + data.append(row) + custom_properties = { + "device_type": roofline_model_db.device_type, + "megacore": str(int(roofline_model_db.megacore)), + "has_cmem": str(int(roofline_model_db.has_cmem)), + "has_merged_vmem": str(int(roofline_model_db.has_merged_vmem)), + "peak_flop_rate": str(roofline_model_db.peak_flop_rate), + "peak_hbm_bw": str(roofline_model_db.peak_hbm_bw), + "peak_cmem_read_bw": str(roofline_model_db.peak_cmem_read_bw), + "peak_cmem_write_bw": str(roofline_model_db.peak_cmem_write_bw), + "peak_vmem_read_bw": str(roofline_model_db.peak_vmem_read_bw), + "peak_vmem_write_bw": str(roofline_model_db.peak_vmem_write_bw), + "hbm_ridge_point": str( + ridge_point( + roofline_model_db.peak_flop_rate, roofline_model_db.peak_hbm_bw + ) + ), + "cmem_read_ridge_point": str( + ridge_point( + 
roofline_model_db.peak_flop_rate, + roofline_model_db.peak_cmem_read_bw, + ) + ), + "cmem_write_ridge_point": str( + ridge_point( + roofline_model_db.peak_flop_rate, + roofline_model_db.peak_cmem_write_bw, + ) + ), + "vmem_read_ridge_point": str( + ridge_point( + roofline_model_db.peak_flop_rate, + roofline_model_db.peak_vmem_read_bw, + ) + ), + "vmem_write_ridge_point": str( + ridge_point( + roofline_model_db.peak_flop_rate, + roofline_model_db.peak_vmem_write_bw, + ) + ), + } + + return (table_description, data, custom_properties) + + +def generate_roofline_model_table(roofline_model_db): + """Creates roofline model table from a roofline model proto. + + Uses the GPU-specific table args when device_type contains "GPU". + + Args: + roofline_model_db: a RooflineModelDatabase proto. + + Returns: + Returns a gviz_api.DataTable + """ + device_type_str = roofline_model_db.device_type + if "GPU" not in device_type_str: + table_description, data, custom_properties = get_roofline_model_table_args( + roofline_model_db + ) + else: + table_description, data, custom_properties = ( + get_roofline_model_table_args_for_gpu(roofline_model_db) + ) + + return gviz_api.DataTable(table_description, data, custom_properties) + + +def get_diagnostics_table_args(roofline_model_db): + """Creates diagnostics table args from a roofline model proto.""" + table_description = [ + ("severity", "string", "Severity"), + ("message", "string", "Message"), + ] + data = [] + for info in roofline_model_db.diagnostics.info: + data.append(["INFO", info]) + for warning in roofline_model_db.diagnostics.warnings: + data.append(["WARNING", warning]) + for error in roofline_model_db.diagnostics.errors: + data.append(["ERROR", error]) + return (table_description, data, {}) + + +def generate_diagnostics_table(roofline_model_db): + table_description, data, custom_properties = get_diagnostics_table_args( + roofline_model_db + ) + return gviz_api.DataTable(table_description, data, custom_properties) + + +def to_json(raw_data): + """Converts a 
serialized RooflineModelDatabase string to json.""" + roofline_model_db = roofline_model_pb2.RooflineModelDatabase() + roofline_model_db.ParseFromString(raw_data) + roofline_model_table = generate_roofline_model_table( + roofline_model_db + ).ToJSon() + diagnostics_table = generate_diagnostics_table(roofline_model_db).ToJSon() + return "[" + roofline_model_table + "," + diagnostics_table + "]" diff --git a/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py b/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py index 18a2529f..41f09993 100644 --- a/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py +++ b/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py @@ -109,7 +109,8 @@ def test_tools_are_in_list(self): 'memory_viewer^', 'graph_viewer^', 'hlo_stats^', - 'inference_profile^' + 'inference_profile^', + 'roofline_model^', ] expected.sort() self.assertListEqual(expected, result) diff --git a/plugin/tensorboard_plugin_profile/profile_plugin.py b/plugin/tensorboard_plugin_profile/profile_plugin.py index 0516c038..409010f0 100644 --- a/plugin/tensorboard_plugin_profile/profile_plugin.py +++ b/plugin/tensorboard_plugin_profile/profile_plugin.py @@ -111,6 +111,7 @@ 'tf_data_bottleneck_analysis^', 'op_profile^', 'hlo_stats^', + 'roofline_model^', ] # XPlane generated tools that support all host mode. 
diff --git a/plugin/tensorboard_plugin_profile/protobuf/BUILD b/plugin/tensorboard_plugin_profile/protobuf/BUILD index c13a65a0..1c79823a 100644 --- a/plugin/tensorboard_plugin_profile/protobuf/BUILD +++ b/plugin/tensorboard_plugin_profile/protobuf/BUILD @@ -18,6 +18,7 @@ proto_library( "kernel_stats.proto", "overview_page.proto", "power_metrics.proto", + "roofline_model.proto", "tf_data_stats.proto", "tf_stats.proto", "tpu_input_pipeline.proto", @@ -37,6 +38,7 @@ py_proto_library( "kernel_stats.proto", "overview_page.proto", "power_metrics.proto", + "roofline_model.proto", "tf_data_stats.proto", "tf_stats.proto", "tpu_input_pipeline.proto", diff --git a/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto b/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto new file mode 100644 index 00000000..6a7bfbe7 --- /dev/null +++ b/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto @@ -0,0 +1,196 @@ +// This proto describes the format of the output profile file from +// the Roofline Model tool. +syntax = "proto2"; + +package tensorflow.profiler.roofline_model; + +import "plugin/tensorboard_plugin_profile/protobuf/diagnostics.proto"; + +// The record type which describes the scope this record captures. +enum RecordType { + INVALID_RECORD_TYPE = 0; + + // Captures the entire profiling duration including incomplete steps. + ALL = 1; + + // Captures the average of all complete steps. + AVERAGE_STEP = 2; + + // Captures a single step. + PER_STEP = 3; + + // Same as ALL but the performance metrics (FLOPS and memory bandwidth) are + // derived from the hardware performance counters. + ALL_HW = 4; +} + +// A database of RooflineModel records. +message RooflineModelDatabase { + // The device type. + optional string device_type = 1; + + // Whether megacore is used. + optional bool megacore = 12; + + // Whether the device has shared CMEM. + optional bool has_cmem = 8; + + // Whether the device has merged VMEM. 
+ optional bool has_merged_vmem = 15; + + // Peak flop rate in GFLOP/s. + optional double peak_flop_rate = 2; + + // Peak HBM bandwidth in GiB/s + optional double peak_hbm_bw = 9; + + // Peak CMEM read bandwidth in GiB/s + optional double peak_cmem_read_bw = 10; + + // Peak CMEM write bandwidth in GiB/s + optional double peak_cmem_write_bw = 11; + + // Peak VMEM read bandwidth in GiB/s + optional double peak_vmem_read_bw = 13; + + // Peak VMEM write bandwidth in GiB/s + optional double peak_vmem_write_bw = 14; + + // All RooflineModel records, one for each HLO operation. + repeated RooflineModelRecord roofline_model_record = 5; + + // Error and warning messages for diagnosing profiling issues. + optional tensorflow.profiler.Diagnostics diagnostics = 7; + + reserved 3, 4, 6; +} + +// There is one RooflineModelRecord for each HLO operation profiled. +// Next ID: 43 +message RooflineModelRecord { + // The record type. + optional RecordType record_type = 18; + + // Step number when record type is PER_STEP. Otherwise, invalid. + optional uint32 step_num = 19; + + // The rank by self time + optional uint64 rank = 1; + + // The hlo module id of the op + optional uint64 hlo_module_id = 35; + + // The HLO category name. + optional string hlo_category = 17; + + // The HLO operation name. + optional string hlo_name = 2; + + // Number of occurrences of the operation. + optional int64 occurrences = 3; + + // Total "accumulated" time in micro-seconds that the operation + // took. If this operation has any children operations, + // the "accumulated" time includes the time spent inside children. + optional double total_time_in_us = 4; + + // Total time per core in micro-seconds. + optional double total_time_per_core_in_us = 20; + + // Total time as fraction of the total program time. + optional double total_time_in_percentage = 21; + + // Average "accumulated" time in micro-seconds that each + // occurrence of the operation took. 
+ optional double avg_time_in_us = 5; + + // Total "self" time in micro-seconds that the operation took. + // If this operation has any children operations, the "self" time + // doesn't include the time spent inside children. + optional double total_self_time_in_us = 6; + + // Average "self" time in micro-seconds that the operation took. + optional double avg_self_time_in_us = 7; + + // Total self time as a fraction, its cumulative value, and the fraction of + // time that was caused by DMA stall, respectively. + optional double total_self_time_as_fraction = 8; + optional double cumulative_total_self_time_as_fraction = 9; + optional double dma_stall_fraction = 10; + + // Number of total floating-point operations (FLOPs) performed per second + // normalized to the bf16 peak performance. + optional double measured_flop_rate = 13; + + // Number of total floating point operations (FLOPs) performed per second for + // the op. + optional double model_flop_rate = 38; + + // Number of total bytes (including both read and write) accessed per + // second. + optional double measured_memory_bw = 14; + + // HBM bandwidth in GiB/s (including both read and write). + optional double hbm_bw = 27; + + // CMEM read bandwidth in GiB/s. + optional double cmem_read_bw = 28; + + // CMEM write bandwidth in GiB/s. + optional double cmem_write_bw = 29; + + // VMEM read bandwidth in GiB/s. + optional double vmem_read_bw = 39; + + // VMEM write bandwidth in GiB/s. + optional double vmem_write_bw = 40; + + // Overall operational intensity in FLOP/Byte. + optional double operational_intensity = 15; + + // Operational intensity based on HBM in FLOP/Byte. + optional double hbm_operational_intensity = 30; + + // Operational intensity based on CMEM read in FLOP/Byte. + optional double cmem_read_operational_intensity = 31; + + // Operational intensity based on CMEM write in FLOP/Byte. + optional double cmem_write_operational_intensity = 32; + + // Operational intensity based on VMEM read in FLOP/Byte. 
+ optional double vmem_read_operational_intensity = 41; + + // Operational intensity based on VMEM write in FLOP/Byte. + optional double vmem_write_operational_intensity = 42; + + // Operational intensity based on the bottleneck resource in FLOP/Byte. + optional double bottleneck_operational_intensity = 33; + + // Whether this operation is "Compute", "HBM", "CMEM Read", "CMEM Write" + // bound, according to the Roofline Model. + optional string bound_by = 16; + + // The optimal flop rate calculated as + // (operational intensity) * (peak memory bw) + optional double optimal_flop_rate = 22; + + // Roofline efficiency. + optional double roofline_efficiency = 34; + + // Percentage of measured flop rate relative to the hardware limit. + optional double flop_rate_relative_to_hw_limit = 24; + + // Percentage of measured memory bandwidth relative to the hardware limit. + optional double memory_bw_relative_to_hw_limit = 25; + + // Whether the record is calculated including infeed and outfeed ops. + optional bool include_infeed_outfeed = 26; + + // Flops for the record + optional uint64 flops = 36; + + // Bytes accessed for the record + optional uint64 bytes_accessed = 37; + + reserved 11, 12, 23; +}