From 6e19c4be57ba1ce1b14d7d8ec78d89d85c93987b Mon Sep 17 00:00:00 2001
From: Yin Zhang <yinzz@google.com>
Date: Mon, 25 Nov 2024 18:06:51 -0800
Subject: [PATCH] Create Roofline Model tool in Tensorboard Plugin Profiler

PiperOrigin-RevId: 700161829
---
 frontend/app/common/constants/BUILD           |    3 +
 .../constants/roofline_model_constants.ts     |    2 +-
 frontend/app/components/main_page/BUILD       |    1 +
 .../components/main_page/main_page_module.ts  |    5 +
 frontend/app/components/roofline_model/BUILD  |   40 +
 .../operation_level_analysis.ts               |    2 +-
 .../roofline_model/roofline_model.ng.html     |   56 +
 .../roofline_model/roofline_model.scss        |   39 +
 .../roofline_model/roofline_model.ts          | 1102 +++++++++++++++++
 .../roofline_model/roofline_model_module.ts   |   25 +
 .../tensorboard_plugin_profile/convert/BUILD  |   10 +
 .../convert/raw_to_tool_data.py               |    5 +
 .../convert/roofline_model_proto_to_gviz.py   |  392 ++++++
 .../tpu/tensorflow/tpu_tf2_keras_test.py      |    3 +-
 .../profile_plugin.py                         |    1 +
 .../tensorboard_plugin_profile/protobuf/BUILD |    2 +
 .../protobuf/roofline_model.proto             |  196 +++
 17 files changed, 1881 insertions(+), 3 deletions(-)
 create mode 100644 frontend/app/components/roofline_model/BUILD
 create mode 100644 frontend/app/components/roofline_model/roofline_model.ng.html
 create mode 100644 frontend/app/components/roofline_model/roofline_model.scss
 create mode 100644 frontend/app/components/roofline_model/roofline_model.ts
 create mode 100644 frontend/app/components/roofline_model/roofline_model_module.ts
 create mode 100644 plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py
 create mode 100644 plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto

diff --git a/frontend/app/common/constants/BUILD b/frontend/app/common/constants/BUILD
index 95afc064..47e1e03c 100644
--- a/frontend/app/common/constants/BUILD
+++ b/frontend/app/common/constants/BUILD
@@ -15,6 +15,9 @@ ts_library(
     srcs = [
         "roofline_model_constants.ts",
     ],
+    deps = [
+        "@npm//@types/google.visualization",
+    ],
 )
 
 ts_library(
diff --git a/frontend/app/common/constants/roofline_model_constants.ts b/frontend/app/common/constants/roofline_model_constants.ts
index 3eb75f4c..4ea3557c 100644
--- a/frontend/app/common/constants/roofline_model_constants.ts
+++ b/frontend/app/common/constants/roofline_model_constants.ts
@@ -53,7 +53,7 @@ export const SCATTER_CHART_OPTIONS = {
     // Ticks have to be explicitly defined for scaling axis evenly.
     ticks: [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000],
   },
-  legend: {position: 'right'},
+  legend: {position: 'right' as google.visualization.ChartLegendPosition},
   tooltip: {isHtml: true},
   // Be mindful that series is not specified here, otherwise the shallow copy in
   // components of this object could result in overwritten series styles
diff --git a/frontend/app/components/main_page/BUILD b/frontend/app/components/main_page/BUILD
index 2763afba..38de3ba0 100644
--- a/frontend/app/components/main_page/BUILD
+++ b/frontend/app/components/main_page/BUILD
@@ -38,6 +38,7 @@ xprof_ng_module(
         "@org_xprof//frontend/app/components/op_profile",
         "@org_xprof//frontend/app/components/overview_page",
         "@org_xprof//frontend/app/components/pod_viewer",
+        "@org_xprof//frontend/app/components/roofline_model",
         "@org_xprof//frontend/app/components/sidenav",
         "@org_xprof//frontend/app/components/tf_data_bottleneck_analysis",
         "@org_xprof//frontend/app/components/trace_viewer",
diff --git a/frontend/app/components/main_page/main_page_module.ts b/frontend/app/components/main_page/main_page_module.ts
index d1d20eb5..9ccc094a 100644
--- a/frontend/app/components/main_page/main_page_module.ts
+++ b/frontend/app/components/main_page/main_page_module.ts
@@ -29,6 +29,8 @@ import {OverviewPage} from 'org_xprof/frontend/app/components/overview_page/over
 import {OverviewPageModule} from 'org_xprof/frontend/app/components/overview_page/overview_page_module';
 import {PodViewer} from 'org_xprof/frontend/app/components/pod_viewer/pod_viewer';
 import {PodViewerModule} from 'org_xprof/frontend/app/components/pod_viewer/pod_viewer_module';
+import {RooflineModel} from 'org_xprof/frontend/app/components/roofline_model/roofline_model';
+import {RooflineModelModule} from 'org_xprof/frontend/app/components/roofline_model/roofline_model_module';
 import {SideNavModule} from 'org_xprof/frontend/app/components/sidenav/sidenav_module';
 import {TfDataBottleneckAnalysis} from 'org_xprof/frontend/app/components/tf_data_bottleneck_analysis/tf_data_bottleneck_analysis';
 import {TfDataBottleneckAnalysisModule} from 'org_xprof/frontend/app/components/tf_data_bottleneck_analysis/tf_data_bottleneck_analysis_module';
@@ -69,6 +71,8 @@ export const routes: Routes = [
   {path: 'inference_profile', component: InferenceProfile},
   {path: 'hlo_stats', component: HloStats},
   {path: 'hlo_stats^', component: HloStats},
+  {path: 'roofline_model', component: RooflineModel},
+  {path: 'roofline_model^', component: RooflineModel},
   {path: '**', component: EmptyPage},
 ];
 
@@ -96,6 +100,7 @@ export const routes: Routes = [
     FrameworkOpStatsAdapterModule,
     DcnCollectiveStatsModule,
     HloStatsModule,
+    RooflineModelModule,
     InferenceProfileModule,
     RouterModule.forRoot(routes),
   ],
diff --git a/frontend/app/components/roofline_model/BUILD b/frontend/app/components/roofline_model/BUILD
new file mode 100644
index 00000000..1e8f0f90
--- /dev/null
+++ b/frontend/app/components/roofline_model/BUILD
@@ -0,0 +1,40 @@
+load("@io_bazel_rules_sass//:defs.bzl", "sass_binary")
+load("//defs:defs.bzl", "xprof_ng_module")
+
+package(default_visibility = ["//frontend:internal"])
+
+xprof_ng_module(
+    name = "roofline_model",
+    srcs = [
+        "roofline_model.ts",
+        "roofline_model_module.ts",
+    ],
+    assets = [
+        ":roofline_model_css",
+        "roofline_model.ng.html",
+    ],
+    deps = [
+        "@npm//@angular/core",
+        "@npm//@angular/router",
+        "@npm//@ngrx/store",
+        "@npm//@types/google.visualization",
+        "@npm//rxjs",
+        "@org_xprof//frontend/app/common/constants:roofline_model_constants",
+        "@org_xprof//frontend/app/common/interfaces",
+        "@org_xprof//frontend/app/common/utils",
+        "@org_xprof//frontend/app/components/chart/table",
+        "@org_xprof//frontend/app/components/controls/category_filter",
+        "@org_xprof//frontend/app/components/controls/string_filter",
+        "@org_xprof//frontend/app/components/roofline_model/operation_level_analysis",
+        "@org_xprof//frontend/app/components/roofline_model/program_level_analysis",
+        "@org_xprof//frontend/app/services/data_service",
+        "@org_xprof//frontend/app/store",
+    ],
+)
+
+sass_binary(
+    name = "roofline_model_css",
+    src = "roofline_model.scss",
+    # stack = True,
+    sourcemap = False,
+)
diff --git a/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts b/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts
index 62bdc505..7d1ecd48 100644
--- a/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts
+++ b/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis.ts
@@ -64,7 +64,7 @@ export class OperationLevelAnalysis
       title: 'Percentage of self time per HLO op category',
       colors: PIE_CHART_PALETTE,
       sliceVisibilityThreshold: 0.01,
-    },
+    } as google.visualization.PieChartOptions,
   };
   dataInfoRooflineScatterChart: ChartDataInfo = {
     data: null,
diff --git a/frontend/app/components/roofline_model/roofline_model.ng.html b/frontend/app/components/roofline_model/roofline_model.ng.html
new file mode 100644
index 00000000..dabd72b8
--- /dev/null
+++ b/frontend/app/components/roofline_model/roofline_model.ng.html
@@ -0,0 +1,56 @@
+<div class="section-container">
+  <div class="block-content">
+    <div class="row">
+      <h2>Device Information</h2>
+      <div class="flex-space"></div>
+    </div>
+  </div>
+  <div class="block-content">
+    <div *ngFor="let info of deviceInfoArray" style="font-size:16px">
+      <div *ngIf="info.display"><b>{{info.label}}:</b> <span> {{info.value}} {{info.unit}} {{info.context}}</span></div>
+    </div>
+  </div>
+</div>
+
+<div class="section-container">
+  <div class="block-content">
+    <h2>Section1: Program-Level Analysis</h2>
+  </div>
+  <div class="block-content">
+    <div class="description">
+      Note:<br>
+      (1) This section provides program-level analysis.<br>
+      (2) A tooltip with extra information will show up if you mouse over a point in the roofline chart.<br>
+      (3) You can choose whether to include infeed and outfeed ops for the analysis.<br>
+      (4) "Total" aggregates all operations throughout the entire profiling session. It includes incomplete steps.<br>
+      (5) "Total (HW)" is based on the hardware performance counters while the others are based on the XLA's cost analysis.
+      It is always calculated including infeed and outfeed ops regardless of the option.
+      The gap between "Total" and "Total (HW)" is due to hardware limitation (e.g., padding).<br>
+      (6) "Average" shows the average step information by aggregating the operations in the complete steps only.
+    </div>
+  </div>
+  <div class="block-content">
+    <program-level-analysis [rooflineModelData]="dataTableProgram" [viewColumns]="columnsIdxProgram" [rooflineSeriesData]="scatterDataProgram" [scatterChartOptions]="scatterChartOptionsProgram" (filterUpdated)="updateDataTableProgram($event)"></program-level-analysis>
+  </div>
+</div>
+
+<div class="section-container">
+  <div class="block-content">
+    <h2>Section2: Operation-Level Analysis</h2>
+  </div>
+  <div class="block-content">
+    <div class="description">
+      Note:<br>
+      (1) This section provides operation-level analysis.<br>
+      (2) A tooltip with extra information will show up if you mouse over a point in the roofline chart.<br>
+      (3) To avoid sluggishness, only the 1000 most time-consuming operations are shown.<br>
+      (4) You can choose whether to include infeed and outfeed ops for the analysis.<br>
+      (5) You can filter data by HLO category, bottleneck resource or HLO name.<br>
+      (6) "IDLE" represents the portion of the total execution time on device that is idle.<br>
+      (7) Ops with zero FLOP (e.g., data formatting ops like reshape, IDLE, etc.) do not show up in the roofline chart.<br>
+    </div>
+  </div>
+  <div class="block-content">
+    <operation-level-analysis [selectedOp]="selectedOpName" [rooflineModelData]="dataTableOp" [viewColumns]="columnsIdxOp" [rooflineSeriesData]="scatterDataOp" [scatterChartOptions]="scatterChartOptionsOp" (filterUpdated)="updateDataTableOp($event)"></operation-level-analysis>
+  </div>
+</div>
diff --git a/frontend/app/components/roofline_model/roofline_model.scss b/frontend/app/components/roofline_model/roofline_model.scss
new file mode 100644
index 00000000..da57955c
--- /dev/null
+++ b/frontend/app/components/roofline_model/roofline_model.scss
@@ -0,0 +1,39 @@
+.section-container {
+  margin: 20px 20px 0px;
+}
+
+.block-content {
+  padding: 5px;
+}
+
+.row {
+  display: flex;
+}
+
+.flex-space {
+  flex: 1;
+}
+
+.description {
+  font-size: 14px;
+}
+
+.tableHeaderCell {
+  word-wrap: break-word;
+  background-color: azure; //!to hide the scrolled-up text.
+}
+
+.tableTableCell {
+  word-break: break-all;
+}
+
+.opColumnClass {
+  max-height: 200px;
+  overflow-y: auto;
+}
+
+.errorMessage {
+  border: 2px solid;
+  background-color: #ffcccb;
+  color: red;
+}
diff --git a/frontend/app/components/roofline_model/roofline_model.ts b/frontend/app/components/roofline_model/roofline_model.ts
new file mode 100644
index 00000000..00450b67
--- /dev/null
+++ b/frontend/app/components/roofline_model/roofline_model.ts
@@ -0,0 +1,1102 @@
+import {Component, OnDestroy} from '@angular/core';
+import {ActivatedRoute} from '@angular/router';
+import {Store} from '@ngrx/store';
+import {DEVICE_INFO, NUMERIC_DATA_FORMAT, PIE_CHART_PALETTE, ROOFLINE_STYLES, SCATTER_CHART_AXIS, SCATTER_CHART_OPTIONS,} from 'org_xprof/frontend/app/common/constants/roofline_model_constants';
+import {NavigationEvent} from 'org_xprof/frontend/app/common/interfaces/navigation_event';
+import {RooflineModelData} from 'org_xprof/frontend/app/common/interfaces/roofline_model';
+import {setLoadingState} from 'org_xprof/frontend/app/common/utils/utils';
+import {DataService} from 'org_xprof/frontend/app/services/data_service/data_service';
+import {setCurrentToolStateAction} from 'org_xprof/frontend/app/store/actions';
+import {ReplaySubject} from 'rxjs';
+import {takeUntil} from 'rxjs/operators';
+
+interface DeviceInfoData {
+  id: string;
+  label: string;
+  type?: string;
+  value?: string | number;
+  unit?: string;
+  context?: string;
+  display?: boolean;
+}
+declare interface DeviceIndicators {
+  hasMergedVmem: boolean;
+  hasCmem: boolean;
+  hasMegacore: boolean;
+  isGpu: boolean;
+}
+type ColumnIdxArr = Array<number | google.visualization.ColumnSpec>;
+
+interface TooltipRow {
+  id: string;
+  label: string;
+  operation?: (val: string | number) => string;
+}
+
+const NVIDIA_GPU_TYPE_PREFIX = 'Nvidia GPU';
+
+/** A roofline model component. */
+@Component({
+  standalone: false,
+  selector: 'roofline-model',
+  templateUrl: './roofline_model.ng.html',
+  styleUrls: ['./roofline_model.scss'],
+})
+export class RooflineModel implements OnDestroy {
+  readonly tool = 'roofline_model';
+
+  /** Handles on-destroy Subject, used to unsubscribe. */
+  private readonly destroyed = new ReplaySubject<void>(1);
+
+  currentRun = '';
+  // Device Information section data
+  deviceInfoArray: DeviceInfoData[] = [];
+  // Some critical indicators
+  deviceIndicators: DeviceIndicators = {
+    hasMergedVmem: false,
+    hasCmem: false,
+    hasMegacore: false,
+    isGpu: false,
+  };
+
+  // dataTableRaw from the raw roofline model data
+  // DataTable data format makes a lot of data manipulation easier
+  dataTableRaw: google.visualization.DataTable | null = null;
+
+  /** Program level section variables */
+  // DataTable data for underlying table chart filtered on category for program
+  dataTableProgram: google.visualization.DataTable | null = null;
+  // visible columns for the table chart view, if empty all columns are shown
+  columnsIdxProgram: ColumnIdxArr = [];
+  // preprocessed data for underlying roofline scatter chart
+  scatterDataProgram: google.visualization.DataTable | null = null;
+  readonly scatterChartOptionsProgram:
+      google.visualization.ScatterChartOptions = {
+    ...SCATTER_CHART_OPTIONS,
+    series: [],
+  };
+  readonly programLevelAgg = ['Total', 'Total (HW)', 'Average', 'Step'];
+
+  /** Operation level section variables */
+  dataTableOp?: google.visualization.DataTable | null = null;
+  columnsIdxOp: ColumnIdxArr = [];
+  scatterDataOp?: google.visualization.DataTable | null = null;
+  readonly scatterChartOptionsOp: google.visualization.ScatterChartOptions = {
+    ...SCATTER_CHART_OPTIONS,
+    series: [],
+  };
+  // Prepopulated op name from url
+  selectedOpName = '';
+
+  constructor(
+    route: ActivatedRoute,
+    private readonly dataService: DataService,
+    private readonly store: Store<{}>,
+  ) {
+    route.params.pipe(takeUntil(this.destroyed)).subscribe((params) => {
+      this.update(params as NavigationEvent);
+    });
+    this.store.dispatch(setCurrentToolStateAction({currentTool: this.tool}));
+  }
+
+  parseUrlParams() {
+    this.selectedOpName =
+      this.dataService.searchParams?.get('roofline_op_name') || '';
+  }
+
+  update(event: NavigationEvent) {
+    setLoadingState(true, this.store, 'Loading roofline model data');
+
+    // get tool data
+    this.currentRun = event.run || '';
+    const tag = event.tag || 'roofline_model';
+    const host = event.host || '';
+    this.dataService.getData(this.currentRun, tag, host)
+        .pipe(takeUntil(this.destroyed))
+        .subscribe((data) => {
+          setLoadingState(false, this.store);
+          this.parseData(data as RooflineModelData[]);
+          this.parseUrlParams();
+        });
+  }
+
+  parseData(data?: RooflineModelData[]) {
+    if (data === null || !Array.isArray(data) || data.length < 1) {
+      return;
+    }
+    this.dataTableRaw = new google.visualization.DataTable(data[0]);
+
+    this.parseDeviceInfoData(this.dataTableRaw);
+    this.parseBaseOpAndProgramTableData();
+
+    // process section 1 data
+    this.setColumnsIdxProgram();
+    this.processScatterDataProgram();
+
+    // process section 2 data
+    this.setColumnsIdxOp();
+    this.processScatterDataOp();
+  }
+
+  /** parse the device information from the original dataset */
+  parseDeviceInfoData(dataTableRaw: google.visualization.DataTable) {
+    this.deviceIndicators = {
+      hasMergedVmem: !(dataTableRaw.getTableProperty('has_merged_vmem') === '0'),
+      hasCmem: !(dataTableRaw.getTableProperty('has_cmem') === '0'),
+      hasMegacore: !(dataTableRaw.getTableProperty('megacore') === '0'),
+      isGpu: dataTableRaw.getTableProperty('device_type')
+                 .startsWith(
+                     NVIDIA_GPU_TYPE_PREFIX,
+                     ),
+    };
+
+    this.deviceInfoArray = DEVICE_INFO.reduce(
+      (acc: DeviceInfoData[], cur: DeviceInfoData) => {
+        // copy cur to avoid mutating the original object
+        // when switching between GPU and TPU runs
+        const curInfo = {...cur};
+        // deal with category of specific context
+        if (this.deviceIndicators.isGpu) {
+          if (cur.id === 'peak_flop_rate') {
+            curInfo.label = 'Peak FLOP Rate per GPU';
+          } else if (cur.id === 'peak_hbm_bw') {
+            curInfo.label = 'Peak HBM Bandwidth per GPU';
+          } else if (cur.id.startsWith('peak_cmem')) {
+            curInfo.display = false;
+          } else if (cur.id === 'megacore') {
+            curInfo.display = false;
+          } else if (cur.id === 'peak_vmem_read_bw') {
+            // TODO(b/374835204): Better refactor proto for GPU roofline
+            // model and refine related code. including ids like this
+            // peak_vmem_read_bw, and peak_vmem_write_bw, megacore, etc.
+            curInfo.label = 'Peak L2 cache Bandwidth per GPU';
+            curInfo.display = false;
+          } else if (cur.id === 'peak_vmem_write_bw') {
+            curInfo.label = 'Peak Shared Memory / L1 Cache Bandwidth per GPU';
+          }
+        } else {
+          if (cur.id.startsWith('peak_vmem')) {
+            if (!this.deviceIndicators.hasMergedVmem) {
+              curInfo.display = false;
+            }
+          } else if (cur.id.startsWith('peak_cmem')) {
+            if (!this.deviceIndicators.hasCmem) {
+              curInfo.display = false;
+            }
+          } else if (cur.id === 'megacore') {
+            curInfo.context +=
+                '(if yes, the analysis assumes Megacore where an HLO runs on both TensorCores utilizing the full chip\'s resources so that the rooflines are twice higher)';
+            curInfo.value = this.deviceIndicators.hasMegacore ? 'Yes' : 'No';
+          }
+        }
+        const value = this.dataTableRaw!.getTableProperty(cur.id);
+        acc.push({
+          // convert numeric value to numbers, as some ridge numbers will be
+          // used as axis values in chart
+          value: cur.type === 'number' ? Number(value) : value,
+          // spread curInfo last so its preprocessed fields take precedence
+          ...curInfo,
+        });
+        return acc;
+      },
+      [] as DeviceInfoData[],
+    );
+  }
+
+  /** Filter and get DataTable data for op and program sections */
+  parseBaseOpAndProgramTableData() {
+    if (!this.dataTableRaw) {
+      return;
+    }
+    const gViewProgram = new google.visualization.DataView(this.dataTableRaw);
+    gViewProgram.setRows(
+      this.dataTableRaw.getFilteredRows([
+        {
+          column: this.dataTableRaw.getColumnIndex('category'),
+          value: 'Program',
+        },
+      ]),
+    );
+    this.dataTableProgram = gViewProgram.toDataTable();
+    this.formatTableData(this.dataTableProgram);
+
+    const gViewOp = new google.visualization.DataView(this.dataTableRaw);
+    gViewOp.setRows(
+      this.dataTableRaw.getFilteredRows([
+        {column: this.dataTableRaw.getColumnIndex('step'), value: 'Total'},
+      ]),
+    );
+    // TODO(b/359276801) Enable injecting Graph Viewer crosslink after
+    // dispatching host list to global store, so we can infer module name from
+    // program_id given the module list (aka host list in graph viewer)
+    this.dataTableOp = gViewOp.toDataTable();
+    this.formatTableData(this.dataTableOp);
+  }
+
+  /** Get the index array of columns that are visible on the table view */
+  getColumnIdx(baseColumnsIds: string[]) {
+    const cmemColumnsIds = this.deviceIndicators.hasCmem
+      ? ['measured_memory_bw', 'cmem_read_bw', 'cmem_write_bw']
+      : [];
+    const coreColumnsIds = [
+      'roofline_efficiency',
+      'compute_efficiency',
+      'max_mem_bw_utilization',
+    ];
+    const columnsIds = [
+      ...baseColumnsIds,
+      ...cmemColumnsIds,
+      ...coreColumnsIds,
+    ];
+
+    const getColumnIdxes = (columnIds: string[]) => {
+      return columnIds.reduce((acc: ColumnIdxArr, cur: string) => {
+        acc.push(this.dataTableRaw!.getColumnIndex(cur));
+        return acc;
+      }, [] as ColumnIdxArr);
+    };
+    return getColumnIdxes(columnsIds);
+  }
+
+  setColumnsIdxProgram() {
+    const baseColumnsIds = [
+      'step',
+      'total_time_per_core',
+      'measured_flop_rate',
+      'bound_by',
+      'hbm_bw',
+    ];
+    this.columnsIdxProgram = this.getColumnIdx(baseColumnsIds);
+  }
+
+  setColumnsIdxOp() {
+    const baseColumnIds = [
+      'step',
+      'rank',
+      'hlo_module_id',
+      'category',
+      'operation',
+      'occurrences',
+      'total_time',
+      'measured_flop_rate',
+      'model_flop_rate',
+      'bound_by',
+      'hbm_bw',
+    ];
+    this.columnsIdxOp = this.getColumnIdx(baseColumnIds);
+  }
+
+  formatTableData(data: google.visualization.DataTable | null) {
+    if (!data) return;
+    let dataFormatter = null;
+    for (
+      let columnIdx = 0;
+      columnIdx < data.getNumberOfColumns();
+      ++columnIdx
+    ) {
+      const id = data.getColumnId(columnIdx);
+      const formattedColumnIds = Object.keys(NUMERIC_DATA_FORMAT);
+      if (!formattedColumnIds.includes(id)) {
+        continue;
+      }
+      switch (NUMERIC_DATA_FORMAT[id].type) {
+        case 'decimal':
+          dataFormatter = new google.visualization.NumberFormat({
+            fractionDigits: NUMERIC_DATA_FORMAT[id].digit,
+          });
+          dataFormatter.format(data, columnIdx);
+          break;
+        case 'percent':
+          const pattern = `##.${'#'.repeat(
+            NUMERIC_DATA_FORMAT[id].digit || 2,
+          )}%`;
+          dataFormatter = new google.visualization.NumberFormat({pattern});
+          dataFormatter.format(data, columnIdx);
+          break;
+        default:
+          console.log(`Cannot identify format config for column ${id}`);
+      }
+    }
+  }
+
+  /** Helper function to get operation name from op graph viewer link
+   * eg: <a href="/graph_viewer/...">op_name</a>
+   */
+  getOpName(opGraphLinkStr: string) {
+    const regex = '<a .*>(.*?)</a>';
+    const match = opGraphLinkStr.match(regex);
+    const opName = match?.[1] || '';
+    return this.truncateOperationName(opName);
+  }
+
+  /** Helper function to truncate operation name for up to 30 chars */
+  truncateOperationName(operationName: string) {
+    if (operationName.length > 30) {
+      return operationName.substring(0, 30) + '...';
+    } else {
+      return operationName;
+    }
+  }
+
+  /**
+   * Helper function to add columns to the scatter plot data
+   * General for program and operation levels
+   * # columns = (1 y value + 1 tooltip) * #series + 1 X axis value
+   */
+  addScatterDataColumns(
+    seriesNames: string[],
+    scatterData: google.visualization.DataTable,
+  ) {
+    // add columns: x axis, series data + corresponding tooltip
+    scatterData.addColumn('number', 'Bottleneck Operational Intensity');
+    // create 1 value + 1 tooltip column for each series
+    seriesNames.forEach((s: string) => {
+      scatterData.addColumn('number', s);
+      scatterData.addColumn({
+        type: 'string',
+        role: 'tooltip',
+        'p': {'html': true},
+      });
+    });
+  }
+
+  /**
+   * Helper function to construct data rows for the scatter chart
+   * scatter chart includes the rooflines and other clustered points
+   */
+  makeScatterRow(
+    numColumns: number,
+    xIndex: number,
+    yIndex: number,
+    xVal: number,
+    yVal: number,
+    tooltip: string,
+  ) {
+    const newRow = Array.from<number | string | null>({
+      length: numColumns,
+    }).fill(null);
+    newRow[xIndex] = xVal;
+    newRow[yIndex] = yVal;
+    newRow[yIndex + 1] = tooltip;
+    return newRow;
+  }
+
+  /** Helper function to add a data row for the scatter chart */
+  addSeriesRow(
+    sourceDataTable: google.visualization.DataTable,
+    scatterDataTable: google.visualization.DataTable,
+    rowIndex: number,
+    columnIndex: number,
+  ) {
+    if (rowIndex < 0 || columnIndex < 0) {
+      return;
+    }
+    const numScatterDataColumns = scatterDataTable.getNumberOfColumns();
+    const xValue = sourceDataTable.getValue(
+      rowIndex,
+      sourceDataTable.getColumnIndex('bottleneck_operational_intensity'),
+    );
+    const yValue = sourceDataTable.getValue(
+      rowIndex,
+      sourceDataTable.getColumnIndex('measured_flop_rate'),
+    );
+    // xValue is always assigned to the first column
+    // yValue is assigned to the given Step agg level column (columnIndex)
+    scatterDataTable.addRow(
+      this.makeScatterRow(
+        numScatterDataColumns,
+        0,
+        columnIndex,
+        xValue,
+        yValue,
+        this.makeTooltip(sourceDataTable, rowIndex),
+      ),
+    );
+  }
+
+  /** Helper function to add data rows for a single roofline */
+  addRoofline(
+    rooflineName: string,
+    seriesIndex: number,
+    peakFlopRate: number,
+    peakMemoryBw: number,
+    ridgePoint: number,
+    scatterData: google.visualization.DataTable,
+  ) {
+    if (seriesIndex < 0) {
+      return;
+    }
+    const numColumns = scatterData.getNumberOfColumns();
+    // Roofline before the ridge point.
+    scatterData.addRow(
+      this.makeScatterRow(
+        numColumns,
+        0,
+        seriesIndex,
+        SCATTER_CHART_AXIS.minX,
+        SCATTER_CHART_AXIS.minX * peakMemoryBw,
+        this.makeRooflineTooltip(
+          'Roofline',
+          SCATTER_CHART_AXIS.minX,
+          SCATTER_CHART_AXIS.minX * peakMemoryBw,
+        ),
+      ),
+    );
+    // Ridge point.
+    scatterData.addRow(
+      this.makeScatterRow(
+        numColumns,
+        0,
+        seriesIndex,
+        ridgePoint,
+        peakFlopRate,
+        this.makeRooflineTooltip(
+          rooflineName + ' Ridge Point',
+          ridgePoint,
+          peakFlopRate,
+        ),
+      ),
+    );
+    // Roofline after the ridge point.
+    scatterData.addRow(
+      this.makeScatterRow(
+        numColumns,
+        0,
+        seriesIndex,
+        SCATTER_CHART_AXIS.maxX,
+        peakFlopRate,
+        this.makeRooflineTooltip(
+          'Roofline',
+          SCATTER_CHART_AXIS.maxX,
+          peakFlopRate,
+        ),
+      ),
+    );
+  }
+
+  /** Callback function when filterUpdated in child is triggered */
+  updateDataTableOp(newFilters: google.visualization.DataTableCellFilter[]) {
+    this.processScatterDataOp(newFilters);
+  }
+
+  /** Callback function when filterUpdated in child is triggered */
+  updateDataTableProgram(
+    newFilters: google.visualization.DataTableCellFilter[],
+  ) {
+    this.processScatterDataProgram(newFilters);
+  }
+
+  /**
+   * Parse dataset for program level roofline scatter chart
+   * With series of data, program scatter plot =
+   * rooflines (line) plot + program level step cluster(scatter) plot
+   */
+  processScatterDataProgram(
+    filters?: google.visualization.DataTableCellFilter[],
+  ) {
+    if (!this.dataTableProgram) {
+      return;
+    }
+    const filteredDataTableProgram = this.getFilteredDataTable(
+      this.dataTableProgram,
+      filters,
+    );
+    // TODO: update the programSeries based on data received
+    const programSeries = this.getProgramSeries();
+    // clear and recreate the scatter data
+    this.scatterDataProgram = new google.visualization.DataTable();
+    this.addScatterDataColumns(programSeries, this.scatterDataProgram);
+    this.addRooflinesSeriesRows(this.scatterDataProgram);
+    this.addProgramSeriesRows(programSeries, filteredDataTableProgram);
+    this.updateProgramScatterStyles(programSeries.length);
+  }
+
+  /**
+   * Parse dataset for operation level roofline scatter chart
+   * With series of data, operation scatter plot =
+   * rooflines (line) plot + op categories cluster(scatter) plot
+   */
+  processScatterDataOp(filters?: google.visualization.DataTableCellFilter[]) {
+    if (!this.dataTableOp) {
+      return;
+    }
+    const filteredDataTableOp = this.getFilteredDataTable(
+      this.dataTableOp,
+      filters,
+    );
+
+    const opCategories = this.getOpCategories(filteredDataTableOp);
+    const opSeries = this.getOpSeries(opCategories);
+
+    // clear the original scatter data
+    this.scatterDataOp = new google.visualization.DataTable();
+    this.addScatterDataColumns(opSeries, this.scatterDataOp);
+    this.addRooflinesSeriesRows(this.scatterDataOp);
+    this.addOpSeriesRows(opSeries, filteredDataTableOp);
+    this.updateOpScatterStyles(opSeries.length);
+  }
+
+  /**
+   * Helper function to get filtered DataTable given base op/program DataTable,
+   * and feed to child component as source data for roofline scatter chart.
+   * Because scatter chart DataTable is in a different structure than the table
+   * chart DataTable.
+   * Filters are emitted by the child component.
+   */
+  getFilteredDataTable(
+    dataTable: google.visualization.DataTable,
+    filters?: google.visualization.DataTableCellFilter[],
+  ) {
+    // apply filters if any, filters are emitted from child component
+    // because the scatter dataTable is restructured and cannot be applied in
+    // child directly
+    let filteredDataTable: google.visualization.DataTable | null = null;
+    if (filters && filters.length > 0) {
+      const filteredDataView = new google.visualization.DataView(dataTable);
+      filteredDataView.setRows(dataTable.getFilteredRows(filters));
+      filteredDataTable = filteredDataView.toDataTable();
+    } else {
+      filteredDataTable = dataTable;
+    }
+    return filteredDataTable;
+  }
+
+  /**
+   * Helper function to get operation categories, with data filtered on
+   * "step == Total", the list is sorted by total_self_time in order to make the
+   * scatter chart style consistent with the pie chart
+   */
+  getOpCategories(filteredDataTableOp: google.visualization.DataTable) {
+    const sortedOpCategories: string[] = [];
+
+    // group rows by category, summing total_self_time per category
+    const chartView = google.visualization.data.group(
+      filteredDataTableOp,
+      [filteredDataTableOp.getColumnIndex('category')],
+      [
+        {
+          'column': filteredDataTableOp.getColumnIndex('total_self_time'),
+          'aggregation': google.visualization.data.sum,
+          'type': 'number',
+        },
+      ],
+    );
+    // sort categories on sum of total_self_time
+    chartView.sort({column: 1, desc: true});
+    for (let i = 0; i < chartView.getNumberOfRows(); ++i) {
+      const category = chartView.getValue(i, 0);
+      // Program will be appended separately
+      if (category !== 'Program') {
+        sortedOpCategories.push(category);
+      }
+    }
+    return sortedOpCategories;
+  }
+
+  /**
+   * The roofline chart consists of a series of data (roofline series +
+   * aggregation series).
+   * This helper returns the names of the roofline (line) series, in the same
+   * order in which addRooflinesSeriesRows adds their rows and
+   * formatRooflineSeriesStyle styles them:
+   *  - GPU: Shared Mem / L1, HBM
+   *  - otherwise: [VMEM read, VMEM write], [CMEM read, CMEM write], HBM
+   */
+  getRooflineBaseSeries() {
+    const series: string[] = [];
+    if (this.deviceIndicators.isGpu) {
+      series.push('Shared Mem / L1 Roofline');
+    } else {
+      // VMEM and CMEM are checked independently (not as if / else if) so the
+      // series names stay aligned with the rows and styles, which are also
+      // added per flag, even when both flags are set.
+      if (this.deviceIndicators.hasMergedVmem) {
+        series.push('VMEM Read Roofline', 'VMEM Write Roofline');
+      }
+      if (this.deviceIndicators.hasCmem) {
+        series.push('CMEM Read Roofline', 'CMEM Write Roofline');
+      }
+    }
+    series.push('HBM Roofline');
+    return series;
+  }
+
+  /**
+   * Series of the program level chart:
+   * #series = #roofline(line) + program level aggregation series
+   */
+  getProgramSeries() {
+    return this.getRooflineBaseSeries().concat(this.programLevelAgg);
+  }
+
+  /**
+   * Series of the operation level chart:
+   * #series = #roofline(line) + #operation level aggregation series
+   * (categories) + 2 'Program' datapoints
+   * The order of this list decides the style of the scatter chart.
+   */
+  getOpSeries(opCategories: string[]) {
+    return [
+      ...this.getRooflineBaseSeries(),
+      // First 'Program': makes its legend entry show on top.
+      'Program',
+      ...opCategories,
+      // Second 'Program': draws the marker on the top layer of the chart.
+      'Program',
+    ];
+  }
+
+  /**
+   * Adds the data rows of the roofline lines (vmem, cmem, hbm, or Shared Mem /
+   * L1 + hbm on GPU) to the given scatter DataTable.
+   * Shared by the op level and the program level charts. Each roofline takes
+   * two columns (value + tooltip), starting at column 1.
+   */
+  addRooflinesSeriesRows(scatterData: google.visualization.DataTable) {
+    // Device info values keyed by id, coerced to numbers (missing -> 0).
+    const rooflineInfo: {[key: string]: number} = {};
+    for (const item of this.deviceInfoArray) {
+      rooflineInfo[item.id] = Number(item.value || 0);
+    }
+
+    // Rooflines in plotting order; `memory` selects the peak_<memory>_bw and
+    // <memory>_ridge_point entries of the device info.
+    const rooflines: Array<{name: string; memory: string}> = [];
+    if (this.deviceIndicators.isGpu) {
+      // GPU SHM/L1 is reported through the vmem_write fields.
+      rooflines.push({name: 'Shared Mem / L1', memory: 'vmem_write'});
+    } else {
+      if (this.deviceIndicators.hasMergedVmem) {
+        rooflines.push(
+          {name: 'VMEM Read', memory: 'vmem_read'},
+          {name: 'VMEM Write', memory: 'vmem_write'},
+        );
+      }
+      if (this.deviceIndicators.hasCmem) {
+        rooflines.push(
+          {name: 'CMEM Read', memory: 'cmem_read'},
+          {name: 'CMEM Write', memory: 'cmem_write'},
+        );
+      }
+    }
+    rooflines.push({name: 'HBM', memory: 'hbm'});
+
+    rooflines.forEach(({name, memory}, position) => {
+      this.addRoofline(
+        name,
+        1 + 2 * position, // value col + tooltip col per roofline
+        rooflineInfo['peak_flop_rate'],
+        rooflineInfo[`peak_${memory}_bw`],
+        rooflineInfo[`${memory}_ridge_point`],
+        scatterData,
+      );
+    });
+  }
+
+  /**
+   * Populates the program level scatter chart data rows, one per row of the
+   * filtered program DataTable.
+   * A row goes to the series named by its 'step' value when that value is one
+   * of the program level aggregations, and to the 'Step' series otherwise.
+   * Rows whose series is not part of programSeries are skipped, as
+   * addOpSeriesRows does for unknown categories.
+   */
+  addProgramSeriesRows(
+    programSeries: string[],
+    filteredDataTableProgram: google.visualization.DataTable,
+  ) {
+    const stepColumn = filteredDataTableProgram.getColumnIndex('step');
+    for (
+      let rowIndex = 0;
+      rowIndex < filteredDataTableProgram.getNumberOfRows();
+      ++rowIndex
+    ) {
+      let step = filteredDataTableProgram.getValue(rowIndex, stepColumn);
+      // Assign 'Step' as value if the step field is a numeric string
+      if (!this.programLevelAgg.includes(step)) {
+        step = 'Step';
+      }
+      // Each series takes a value column and a tooltip column.
+      const columnIndex = 1 + 2 * programSeries.lastIndexOf(step);
+      if (columnIndex <= 0) {
+        // Series not found (lastIndexOf returned -1): there is no column to
+        // write to, so do not pass a negative index on.
+        continue;
+      }
+      this.addSeriesRow(
+        filteredDataTableProgram,
+        this.scatterDataProgram!,
+        rowIndex,
+        columnIndex,
+      );
+    }
+  }
+
+  /**
+   * Populates the operation level scatter chart data rows, one per row of the
+   * filtered operation DataTable.
+   * Rows whose category is not one of the series, or whose 'bound_by' value is
+   * 'Unknown', are skipped.
+   */
+  addOpSeriesRows(
+    opSeries: string[],
+    filteredDataTableOp: google.visualization.DataTable,
+  ) {
+    const categoryColumn = filteredDataTableOp.getColumnIndex('category');
+    const boundByColumn = filteredDataTableOp.getColumnIndex('bound_by');
+    for (
+      let rowIndex = 0;
+      rowIndex < filteredDataTableOp.getNumberOfRows();
+      ++rowIndex
+    ) {
+      const category = filteredDataTableOp.getValue(rowIndex, categoryColumn);
+      // Each series takes a value column and a tooltip column; the last
+      // occurrence is used so 'Program' maps to its marker series.
+      const columnIndex = 1 + 2 * opSeries.lastIndexOf(category);
+      if (columnIndex <= 0) {
+        continue;
+      }
+      if (filteredDataTableOp.getValue(rowIndex, boundByColumn) === 'Unknown') {
+        continue;
+      }
+      this.addSeriesRow(
+        filteredDataTableOp,
+        this.scatterDataOp!,
+        rowIndex,
+        columnIndex,
+      );
+    }
+  }
+
+  /**
+   * Builds the HTML tooltip of a point on a roofline series in the scatter
+   * chart; both numbers are shown with at most two fraction digits.
+   */
+  makeRooflineTooltip(
+    rooflineName: string,
+    operationIntensity: number,
+    flopRate: number,
+  ) {
+    const intensityText = operationIntensity.toLocaleString(undefined, {
+      maximumFractionDigits: 2,
+    });
+    const flopRateText = flopRate.toLocaleString(undefined, {
+      maximumFractionDigits: 2,
+    });
+    return [
+      '<div style="padding:5px;">',
+      `<b>${rooflineName}</b><br/>`,
+      `<b>Operational Intensity (FLOP/Byte): </b>${intensityText}<br/>`,
+      `<b>Flop Rate (GFLOP/s): </b>${flopRateText}<br/>`,
+      '</div>',
+    ].join('');
+  }
+
+  /**
+   * Makes the HTML tooltip for a point of the clustered series in the scatter
+   * chart.
+   * Rows are emitted in a fixed order and a row is skipped when the DataTable
+   * has no column with the row's id. On GPU a single Shm/L1 bandwidth row
+   * (read from vmem_write_bw) replaces the CMEM/VMEM bandwidth rows, and the
+   * CMEM/VMEM operational intensity rows are omitted.
+   * @param dataTable DataTable holding the record behind the point.
+   * @param rowIndex Index of that record's row in dataTable.
+   * @return The tooltip as an HTML string.
+   */
+  makeTooltip(dataTable: google.visualization.DataTable, rowIndex: number) {
+    // Prepare column index to make easier access
+    const columns: {[columnKey: string]: number} = {};
+    for (let i = 0; i < dataTable.getNumberOfColumns(); i++) {
+      columns[dataTable.getColumnId(i)] = i;
+    }
+    const isGpu = this.deviceIndicators.isGpu;
+    // Builds a formatter showing at most `maximumFractionDigits` decimals.
+    const withDigits =
+      (maximumFractionDigits: number): NonNullable<TooltipRow['operation']> =>
+      (val) =>
+        val.toLocaleString(undefined, {maximumFractionDigits});
+    const twoDigits = withDigits(2);
+    const fourDigits = withDigits(4);
+    // TODO(jihochoi): fix the utilization numbers for TPU V4.
+    // '<b> - Percent relative to optimal: </b>'
+    // + (100 * dataTable.getValue(rowIndex,
+    // columns.roofline_efficiency)).toLocaleString(undefined, {maximumFractionDigits:2})
+    // + '%<br/>' +
+    // '<b> - Percent relative to HW limit: </b>'
+    // + (100 * dataTable.getValue(rowIndex,
+    // columns.compute_efficiency)).toLocaleString(undefined, {maximumFractionDigits:2})
+    // + '%<br/>' +
+    // '<b> - Percent relative to HW limit: </b>'
+    // + (100 * dataTable.getValue(rowIndex,
+    // columns.hbm_bw_utilization)).toLocaleString(undefined, {maximumFractionDigits:2})
+    // + '%<br/>' +
+
+    // Device specific memory bandwidth rows.
+    const memoryBwRows: TooltipRow[] = isGpu
+      ? [
+          {
+            id: 'vmem_write_bw',
+            label: 'Shm/L1 BW (GiB/s)',
+            operation: fourDigits,
+          },
+        ]
+      : [
+          {
+            id: 'cmem_read_bw',
+            label: 'CMEM Read BW (GiB/s)',
+            operation: fourDigits,
+          },
+          {
+            id: 'cmem_write_bw',
+            label: 'CMEM Write BW (GiB/s)',
+            operation: fourDigits,
+          },
+          {
+            id: 'vmem_read_bw',
+            label: 'VMEM Read BW (GiB/s)',
+            operation: fourDigits,
+          },
+          {
+            id: 'vmem_write_bw',
+            label: 'VMEM Write BW (GiB/s)',
+            operation: fourDigits,
+          },
+        ];
+    // Per-memory operational intensity rows; not shown on GPU.
+    const memoryIntensityRows: TooltipRow[] = isGpu
+      ? []
+      : [
+          {
+            id: 'cmem_read_operational_intensity',
+            label: 'CMEM Read Operational Intensity (FLOP/Byte)',
+            operation: fourDigits,
+          },
+          {
+            id: 'cmem_write_operational_intensity',
+            label: 'CMEM Write Operational Intensity (FLOP/Byte)',
+            operation: fourDigits,
+          },
+          {
+            id: 'vmem_read_operational_intensity',
+            label: 'VMEM Read Operational Intensity (FLOP/Byte)',
+            operation: fourDigits,
+          },
+          {
+            id: 'vmem_write_operational_intensity',
+            label: 'VMEM Write Operational Intensity (FLOP/Byte)',
+            operation: fourDigits,
+          },
+        ];
+    const tooltipRows: TooltipRow[] = [
+      {id: 'step', label: 'Step'},
+      {id: 'rank', label: 'Rank'},
+      {id: 'hlo_module_id', label: 'Program ID'},
+      {id: 'category', label: 'Category'},
+      {
+        id: 'operation',
+        label: 'Operation',
+        operation: (val) => this.getOpName(val as string),
+      },
+      {id: 'occurrences', label: '# of Occurrences'},
+      {
+        id: 'total_time_per_core',
+        label: isGpu ? 'Total Time per gpu (us)' : 'Total Time per core (us)',
+        operation: twoDigits,
+      },
+      {
+        id: 'total_time_in_percentage',
+        label: 'Total Time / Program',
+        // Scale to percent first and round afterwards, so the multiplication
+        // cannot leak float artifacts (e.g. 7.000000000000001%).
+        operation: (val) => `${Number((100 * Number(val)).toFixed(2))}%`,
+      },
+      {
+        id: 'measured_flop_rate',
+        label: 'Normalized FLOP Rate (GFLOP/s)',
+        operation: fourDigits,
+      },
+      {
+        id: 'model_flop_rate',
+        label: 'Model FLOP Rate (GFLOP/s)',
+        operation: fourDigits,
+      },
+      {id: 'hbm_bw', label: 'HBM BW (GiB/s)', operation: fourDigits},
+      ...memoryBwRows,
+      {
+        id: 'operational_intensity',
+        label: 'Operational Intensity (FLOP/Byte)',
+        operation: fourDigits,
+      },
+      {
+        id: 'hbm_operational_intensity',
+        label: 'HBM Operational Intensity (FLOP/Byte)',
+        operation: fourDigits,
+      },
+      ...memoryIntensityRows,
+      {
+        id: 'bottleneck_operational_intensity',
+        label: 'Bottleneck Operational Intensity (FLOP/Byte)',
+        operation: fourDigits,
+      },
+      // The column id is 'bound_by'; a misspelled id would silently drop the
+      // row because unknown ids are skipped below.
+      {id: 'bound_by', label: 'Bound By'},
+    ];
+
+    let tooltipBodyHtml = '';
+    for (const row of tooltipRows) {
+      // Skip rows the DataTable has no column for.
+      if (!columns.hasOwnProperty(row.id)) {
+        continue;
+      }
+      const val: string | number = dataTable.getValue(
+        rowIndex,
+        columns[row.id],
+      );
+      tooltipBodyHtml += `<b>${row.label}: </b> ${
+        row.operation ? row.operation(val) : val
+      }<br>`;
+    }
+    return `<div style="padding: 5px">${tooltipBodyHtml}</div>`;
+  }
+
+  // TODO(yinzz) remove the style updating dependency on the series order
+  // make it a k-v based format
+  /**
+   * Writes the styles of the roofline (line) series into chartOptions.series,
+   * starting at seriesIndex, and returns the index following the last roofline
+   * series. The order matches the roofline series order: Shm/L1 on GPU, or
+   * VMEM read/write then CMEM read/write otherwise, always followed by HBM.
+   */
+  formatRooflineSeriesStyle(
+    seriesIndex: number,
+    chartOptions: google.visualization.ScatterChartOptions,
+  ) {
+    const isGpu = this.deviceIndicators.isGpu;
+    const readWritePair = [ROOFLINE_STYLES.read, ROOFLINE_STYLES.write];
+    const rooflineStyles = [
+      ...(isGpu ? [ROOFLINE_STYLES.write] : []),
+      ...(!isGpu && this.deviceIndicators.hasMergedVmem ? readWritePair : []),
+      ...(!isGpu && this.deviceIndicators.hasCmem ? readWritePair : []),
+      ROOFLINE_STYLES.hbm,
+    ];
+    for (const style of rooflineStyles) {
+      chartOptions.series[seriesIndex++] = style;
+    }
+    return seriesIndex;
+  }
+
+  /**
+   * Styles the program level scatter chart: roofline series first, then every
+   * remaining series up to numSeries as points of size 4.
+   */
+  updateProgramScatterStyles(numSeries: number) {
+    const firstPointSeries = this.formatRooflineSeriesStyle(
+      0,
+      this.scatterChartOptionsProgram,
+    );
+    for (let index = firstPointSeries; index < numSeries; ++index) {
+      this.scatterChartOptionsProgram.series[index] = {pointSize: 4};
+    }
+  }
+
+  // TODO(yinzz) remove the style updating dependency on the series order
+  /**
+   * Styles the operation level scatter chart. Series order (see getOpSeries):
+   * rooflines, 'Program' (legend entry), one series per op category, and a
+   * final 'Program' series that carries the actual marker.
+   * @param numSeries Total number of series of the op scatter chart.
+   */
+  updateOpScatterStyles(numSeries: number) {
+    let seriesIndex = this.formatRooflineSeriesStyle(
+      0,
+      this.scatterChartOptionsOp,
+    );
+    // extra series style record for the Program legend
+    this.scatterChartOptionsOp.series[seriesIndex++] = {
+      pointSize: 20,
+      color: '#FF0000',
+      pointShape: 'star',
+    };
+    // Other ops are colored in the same order as in the pie chart. The offset
+    // is taken from the running index instead of being recomputed from the
+    // cmem/vmem flags, so the first category gets the first palette color on
+    // every device layout, including GPU (Shm/L1 + HBM + Program).
+    const firstOpSeriesIndex = seriesIndex;
+    for (; seriesIndex < numSeries - 1; ++seriesIndex) {
+      this.scatterChartOptionsOp.series[seriesIndex] = {
+        pointSize: 3,
+        // make sure the color of series matches the pie chart
+        color:
+          PIE_CHART_PALETTE[
+            (seriesIndex - firstOpSeriesIndex) % PIE_CHART_PALETTE.length
+          ],
+      };
+    }
+    // Real series for program which does not show in the legend.
+    // This is added at the end to make it plotted at the top and not buried by
+    // other op points.
+    this.scatterChartOptionsOp.series[numSeries - 1] = {
+      pointSize: 20,
+      color: '#FF0000',
+      pointShape: 'star',
+      visibleInLegend: false,
+    };
+  }
+
+  /**
+   * Angular teardown hook: sets the loading state in the store back to false,
+   * then emits on and completes `destroyed` (presumably the notifier that
+   * ends this component's subscriptions -- confirm against the subscribers).
+   */
+  ngOnDestroy() {
+    setLoadingState(false, this.store);
+    this.destroyed.next();
+    this.destroyed.complete();
+  }
+}
diff --git a/frontend/app/components/roofline_model/roofline_model_module.ts b/frontend/app/components/roofline_model/roofline_model_module.ts
new file mode 100644
index 00000000..6e475789
--- /dev/null
+++ b/frontend/app/components/roofline_model/roofline_model_module.ts
@@ -0,0 +1,25 @@
+import {CommonModule} from '@angular/common';
+import {NgModule} from '@angular/core';
+import {TableModule} from 'org_xprof/frontend/app/components/chart/table/table_module';
+import {CategoryFilterModule} from 'org_xprof/frontend/app/components/controls/category_filter/category_filter_module';
+import {StringFilterModule} from 'org_xprof/frontend/app/components/controls/string_filter/string_filter_module';
+import {OperationLevelAnalysisModule} from 'org_xprof/frontend/app/components/roofline_model/operation_level_analysis/operation_level_analysis_module';
+import {ProgramLevelAnalysisModule} from 'org_xprof/frontend/app/components/roofline_model/program_level_analysis/program_level_analysis_module';
+
+import {RooflineModel} from './roofline_model';
+
+/**
+ * Angular module of the roofline model tool.
+ * Declares and exports the RooflineModel component and imports the table
+ * chart, category/string filter and program/operation level analysis modules.
+ */
+@NgModule({
+  declarations: [RooflineModel],
+  imports: [
+    CommonModule,
+    TableModule,
+    CategoryFilterModule,
+    StringFilterModule,
+    ProgramLevelAnalysisModule,
+    OperationLevelAnalysisModule,
+  ],
+  exports: [RooflineModel],
+})
+export class RooflineModelModule {
+}
diff --git a/plugin/tensorboard_plugin_profile/convert/BUILD b/plugin/tensorboard_plugin_profile/convert/BUILD
index 73de40fc..537617e6 100644
--- a/plugin/tensorboard_plugin_profile/convert/BUILD
+++ b/plugin/tensorboard_plugin_profile/convert/BUILD
@@ -97,6 +97,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "roofline_model_proto_to_gviz",
+    srcs = ["roofline_model_proto_to_gviz.py"],
+    deps = [
+        requirement("gviz_api"),
+        "@org_xprof//plugin/tensorboard_plugin_profile/protobuf:protos_all_py_pb2",
+    ],
+)
+
 py_test(
     name = "overview_page_proto_to_gviz_test",
     size = "small",
@@ -251,6 +260,7 @@ py_library(
         ":input_pipeline_proto_to_gviz",
         ":kernel_stats_proto_to_gviz",
         ":overview_page_proto_to_gviz",
+        ":roofline_model_proto_to_gviz",
         ":tf_data_stats_proto_to_gviz",
         ":tf_stats_proto_to_gviz",
         ":trace_events_json",
diff --git a/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py b/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py
index 57c0f790..836c407e 100644
--- a/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py
+++ b/plugin/tensorboard_plugin_profile/convert/raw_to_tool_data.py
@@ -32,6 +32,7 @@
 from tensorboard_plugin_profile.convert import input_pipeline_proto_to_gviz
 from tensorboard_plugin_profile.convert import kernel_stats_proto_to_gviz
 from tensorboard_plugin_profile.convert import overview_page_proto_to_gviz
+from tensorboard_plugin_profile.convert import roofline_model_proto_to_gviz
 from tensorboard_plugin_profile.convert import tf_data_stats_proto_to_gviz
 from tensorboard_plugin_profile.convert import tf_stats_proto_to_gviz
 from tensorboard_plugin_profile.convert import trace_events_json
@@ -175,6 +176,10 @@ def xspace_to_tool_data(
     raw_data, success = xspace_wrapper_func(xspace_paths, tool)
     if success:
       data = hlo_stats_proto_to_gviz.to_json(raw_data)
+  elif tool == 'roofline_model':
+    raw_data, success = xspace_wrapper_func(xspace_paths, tool)
+    if success:
+      data = roofline_model_proto_to_gviz.to_json(raw_data)
   elif tool == 'graph_viewer':
     options = params.get('graph_viewer_options', {})
     raw_data, success = xspace_wrapper_func(xspace_paths, tool, options)
diff --git a/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py b/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py
new file mode 100644
index 00000000..91b7d1c5
--- /dev/null
+++ b/plugin/tensorboard_plugin_profile/convert/roofline_model_proto_to_gviz.py
@@ -0,0 +1,392 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""For conversion of RooflineModel protos to GViz DataTables.
+
+Usage:
+    gviz_data_tables = generate_roofline_model_table(roofline_model_db)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gviz_api
+
+from tensorboard_plugin_profile.protobuf import roofline_model_pb2
+
+
+def get_step_string(record_type, step_num):
+  """Returns the value of the "step" column for a roofline model record.
+
+  Args:
+    record_type: A roofline_model_pb2.RecordType enum value.
+    step_num: The step number of the record; only used for PER_STEP records.
+
+  Returns:
+    A fixed label for the aggregated record types, the step number as a string
+    for PER_STEP records, or None for an unrecognized record type.
+  """
+  # A lookup instead of a `match` statement keeps this module importable on
+  # Python versions older than 3.10, where `match` is a syntax error.
+  if record_type == roofline_model_pb2.RecordType.PER_STEP:
+    # The "step" column is declared as a string column.
+    return str(step_num)
+  labels = {
+      roofline_model_pb2.RecordType.INVALID_RECORD_TYPE: "Invalid",
+      roofline_model_pb2.RecordType.ALL: "Total",
+      roofline_model_pb2.RecordType.ALL_HW: "Total (HW)",
+      roofline_model_pb2.RecordType.AVERAGE_STEP: "Average",
+  }
+  return labels.get(record_type)
+
+
+def gibi_to_giga(gibibytes):
+  """Converts a value from gibi (2**30) units to giga (1e9) units."""
+  giga_per_gibi = (1 << 30) / 1.0e9
+  return gibibytes * giga_per_gibi
+
+
+def ridge_point(peak_gigaflops_per_second, peak_gibibytes_per_second):
+  """Returns peak FLOP rate divided by peak bandwidth, in FLOP/Byte.
+
+  The bandwidth is converted from GiB/s to GB/s before dividing. A bandwidth of
+  0 yields 0.0 instead of a division by zero.
+  """
+  if peak_gibibytes_per_second != 0:
+    peak_gigabytes_per_second = gibi_to_giga(peak_gibibytes_per_second)
+    return peak_gigaflops_per_second / peak_gigabytes_per_second
+  return 0.0
+
+
+def get_roofline_model_table_args_for_gpu(roofline_model_db):
+  """Builds the roofline model GViz table arguments for a GPU profile.
+
+  Args:
+    roofline_model_db: A RooflineModelDatabase proto.
+
+  Returns:
+    A (table_description, data, custom_properties) tuple: the column schema,
+    one row per roofline model record, and the device level properties.
+  """
+  table_description = [
+      ("step", "string", "Step"),
+      ("rank", "number", "Rank"),
+      ("category", "string", "Category"),
+      ("operation", "string", "Operation"),
+      ("occurrences", "number", "# Occurrences"),
+      ("total_time", "number", "Total Time (us)"),
+      ("avg_time", "number", "Avg. time (us)"),
+      ("total_self_time", "number", "Total self time (us)"),
+      ("avg_self_time", "number", "Avg. self time (us)"),
+      ("total_self_time_percent", "number", "Total self time (%)"),
+      (
+          "cumulative_total_self_time_percent",
+          "number",
+          "Cumulative total self time (%)",
+      ),
+      ("measured_flop_rate", "number", "Normalized FLOP Rate (GFLOP/s)"),
+      ("model_flop_rate", "number", "Model FLOP Rate (GFLOP/s)"),
+      ("measured_memory_bw", "number", "Memory BW (GiB/s)"),
+      ("hbm_bw", "number", "HBM BW (GiB/s)"),
+      # For nvidia gpu there is currently no vmem_read_bw field, and
+      # vmem_write_bw is used for SHM/L1.
+      ("vmem_write_bw", "number", "SHM/L1 BW (GiB/s)"),
+      ("operational_intensity", "number", "Operational Intensity (FLOP/Byte)"),
+      (
+          "hbm_operational_intensity",
+          "number",
+          "HBM Operational Intensity (FLOP/Byte)",
+      ),
+      # For nvidia gpu there is currently no vmem_read_operational_intensity
+      # field, and vmem_write_operational_intensity is used for SHM/L1.
+      (
+          "vmem_write_operational_intensity",
+          "number",
+          "SHM/L1 Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "bottleneck_operational_intensity",
+          "number",
+          "Bottleneck Operational Intensity (FLOP/Byte)",
+      ),
+      ("bound_by", "string", "Bound by"),
+      ("total_time_per_core", "number", "Total Time per core (us)"),
+      ("total_time_in_percentage", "number", "Total Time (%)"),
+      ("optimal_flop_rate", "number", "Optimal FLOP Rate (GFLOP/s)"),
+      ("roofline_efficiency", "number", "Roofline efficiency (%)"),
+      ("compute_efficiency", "number", "FLOP Rate / Peak (%)"),
+      (
+          "max_mem_bw_utilization",
+          "number",
+          "Max memory (cmem or hbm) bandwidth utilization (%)",
+      ),
+      ("include_infeed_outfeed", "boolean", "Include Infeed/Outfeed"),
+      ("hlo_module_id", "string", "Program ID"),
+  ]
+
+  # Record fields backing the columns that follow "step", in column order.
+  record_fields = (
+      "rank",
+      "hlo_category",
+      "hlo_name",
+      "occurrences",
+      "total_time_in_us",
+      "avg_time_in_us",
+      "total_self_time_in_us",
+      "avg_self_time_in_us",
+      "total_self_time_as_fraction",
+      "cumulative_total_self_time_as_fraction",
+      "measured_flop_rate",
+      "model_flop_rate",
+      "measured_memory_bw",
+      "hbm_bw",
+      "vmem_write_bw",
+      "operational_intensity",
+      "hbm_operational_intensity",
+      "vmem_write_operational_intensity",
+      "bottleneck_operational_intensity",
+      "bound_by",
+      "total_time_per_core_in_us",
+      "total_time_in_percentage",
+      "optimal_flop_rate",
+      "roofline_efficiency",
+      "flop_rate_relative_to_hw_limit",
+      "memory_bw_relative_to_hw_limit",
+      "include_infeed_outfeed",
+      "hlo_module_id",
+  )
+  data = [
+      [get_step_string(record.record_type, record.step_num)]
+      + [getattr(record, field) for field in record_fields]
+      for record in roofline_model_db.roofline_model_record
+  ]
+
+  # NOTE(review): unlike get_roofline_model_table_args, the values below are
+  # not converted to str, and the SHM/L1 entries use the shml1 key names while
+  # being read from the vmem_write proto field -- confirm the frontend looks
+  # these keys up under the same names.
+  peak_flop_rate = roofline_model_db.peak_flop_rate
+  peak_hbm_bw = roofline_model_db.peak_hbm_bw
+  peak_shml1_write_bw = roofline_model_db.peak_vmem_write_bw
+  custom_properties = {
+      "device_type": roofline_model_db.device_type,
+      "has_cmem": roofline_model_db.has_cmem,
+      "has_merged_vmem": roofline_model_db.has_merged_vmem,
+      "peak_flop_rate": peak_flop_rate,
+      "peak_hbm_bw": peak_hbm_bw,
+      "peak_shml1_write_bw": peak_shml1_write_bw,
+      "hbm_ridge_point": ridge_point(peak_flop_rate, peak_hbm_bw),
+      "shml1_write_ridge_point": ridge_point(
+          peak_flop_rate, peak_shml1_write_bw
+      ),
+  }
+  return (table_description, data, custom_properties)
+
+
+def get_roofline_model_table_args(roofline_model_db):
+  """Builds the roofline model GViz table arguments for a non-GPU profile.
+
+  Args:
+    roofline_model_db: A RooflineModelDatabase proto.
+
+  Returns:
+    A (table_description, data, custom_properties) tuple: the column schema,
+    one row per roofline model record, and the device level properties, whose
+    values (except device_type) are all converted to strings.
+  """
+  table_description = [
+      ("step", "string", "Step"),
+      ("rank", "number", "Rank"),
+      ("category", "string", "Category"),
+      ("operation", "string", "Operation"),
+      ("occurrences", "number", "# Occurrences"),
+      ("total_time", "number", "Total Time (us)"),
+      ("avg_time", "number", "Avg. time (us)"),
+      ("total_self_time", "number", "Total self time (us)"),
+      ("avg_self_time", "number", "Avg. self time (us)"),
+      ("total_self_time_percent", "number", "Total self time (%)"),
+      (
+          "cumulative_total_self_time_percent",
+          "number",
+          "Cumulative total self time (%)",
+      ),
+      ("dma_stall_percent", "number", "%time stalled by DMA"),
+      ("measured_flop_rate", "number", "Normalized FLOP Rate (GFLOP/s)"),
+      ("model_flop_rate", "number", "Model FLOP Rate (GFLOP/s)"),
+      ("measured_memory_bw", "number", "Memory BW (GiB/s)"),
+      ("hbm_bw", "number", "HBM BW (GiB/s)"),
+      ("cmem_read_bw", "number", "CMEM Read BW (GiB/s)"),
+      ("cmem_write_bw", "number", "CMEM Write BW (GiB/s)"),
+      ("vmem_read_bw", "number", "VMEM Read BW (GiB/s)"),
+      ("vmem_write_bw", "number", "VMEM Write BW (GiB/s)"),
+      ("operational_intensity", "number", "Operational Intensity (FLOP/Byte)"),
+      (
+          "hbm_operational_intensity",
+          "number",
+          "HBM Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "cmem_read_operational_intensity",
+          "number",
+          "CMEM Read Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "cmem_write_operational_intensity",
+          "number",
+          "CMEM Write Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "vmem_read_operational_intensity",
+          "number",
+          "VMEM Read Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "vmem_write_operational_intensity",
+          "number",
+          "VMEM Write Operational Intensity (FLOP/Byte)",
+      ),
+      (
+          "bottleneck_operational_intensity",
+          "number",
+          "Bottleneck Operational Intensity (FLOP/Byte)",
+      ),
+      ("bound_by", "string", "Bound by"),
+      ("total_time_per_core", "number", "Total Time per core (us)"),
+      ("total_time_in_percentage", "number", "Total Time (%)"),
+      ("optimal_flop_rate", "number", "Optimal FLOP Rate (GFLOP/s)"),
+      ("roofline_efficiency", "number", "Roofline efficiency (%)"),
+      ("compute_efficiency", "number", "FLOP Rate / Peak (%)"),
+      (
+          "max_mem_bw_utilization",
+          "number",
+          "Max memory (cmem or hbm) bandwidth utilization (%)",
+      ),
+      ("include_infeed_outfeed", "boolean", "Include Infeed/Outfeed"),
+      ("hlo_module_id", "string", "Program ID"),
+  ]
+
+  # Record fields backing the columns that follow "step", in column order.
+  record_fields = (
+      "rank",
+      "hlo_category",
+      "hlo_name",
+      "occurrences",
+      "total_time_in_us",
+      "avg_time_in_us",
+      "total_self_time_in_us",
+      "avg_self_time_in_us",
+      "total_self_time_as_fraction",
+      "cumulative_total_self_time_as_fraction",
+      "dma_stall_fraction",
+      "measured_flop_rate",
+      "model_flop_rate",
+      "measured_memory_bw",
+      "hbm_bw",
+      "cmem_read_bw",
+      "cmem_write_bw",
+      "vmem_read_bw",
+      "vmem_write_bw",
+      "operational_intensity",
+      "hbm_operational_intensity",
+      "cmem_read_operational_intensity",
+      "cmem_write_operational_intensity",
+      "vmem_read_operational_intensity",
+      "vmem_write_operational_intensity",
+      "bottleneck_operational_intensity",
+      "bound_by",
+      "total_time_per_core_in_us",
+      "total_time_in_percentage",
+      "optimal_flop_rate",
+      "roofline_efficiency",
+      "flop_rate_relative_to_hw_limit",
+      "memory_bw_relative_to_hw_limit",
+      "include_infeed_outfeed",
+      "hlo_module_id",
+  )
+  data = [
+      [get_step_string(record.record_type, record.step_num)]
+      + [getattr(record, field) for field in record_fields]
+      for record in roofline_model_db.roofline_model_record
+  ]
+
+  def ridge_point_str(peak_bw):
+    # Ridge point of the given peak bandwidth against the peak FLOP rate.
+    return str(ridge_point(roofline_model_db.peak_flop_rate, peak_bw))
+
+  custom_properties = {
+      "device_type": roofline_model_db.device_type,
+      # Booleans are passed as "0" / "1".
+      "megacore": str(int(roofline_model_db.megacore)),
+      "has_cmem": str(int(roofline_model_db.has_cmem)),
+      "has_merged_vmem": str(int(roofline_model_db.has_merged_vmem)),
+      "peak_flop_rate": str(roofline_model_db.peak_flop_rate),
+      "peak_hbm_bw": str(roofline_model_db.peak_hbm_bw),
+      "peak_cmem_read_bw": str(roofline_model_db.peak_cmem_read_bw),
+      "peak_cmem_write_bw": str(roofline_model_db.peak_cmem_write_bw),
+      "peak_vmem_read_bw": str(roofline_model_db.peak_vmem_read_bw),
+      "peak_vmem_write_bw": str(roofline_model_db.peak_vmem_write_bw),
+      "hbm_ridge_point": ridge_point_str(roofline_model_db.peak_hbm_bw),
+      "cmem_read_ridge_point": ridge_point_str(
+          roofline_model_db.peak_cmem_read_bw
+      ),
+      "cmem_write_ridge_point": ridge_point_str(
+          roofline_model_db.peak_cmem_write_bw
+      ),
+      "vmem_read_ridge_point": ridge_point_str(
+          roofline_model_db.peak_vmem_read_bw
+      ),
+      "vmem_write_ridge_point": ridge_point_str(
+          roofline_model_db.peak_vmem_write_bw
+      ),
+  }
+
+  return (table_description, data, custom_properties)
+
+
+def generate_roofline_model_table(roofline_model_db):
+  """Creates the roofline model table from a roofline model proto.
+
+  The GPU specific columns and properties are used when the device type of the
+  proto contains "GPU"; the default ones are used otherwise.
+
+  Args:
+    roofline_model_db: a RooflineModelDatabase proto.
+
+  Returns:
+    Returns a gviz_api.DataTable
+  """
+  if "GPU" in roofline_model_db.device_type:
+    build_table_args = get_roofline_model_table_args_for_gpu
+  else:
+    build_table_args = get_roofline_model_table_args
+  # The builders return (table_description, data, custom_properties).
+  return gviz_api.DataTable(*build_table_args(roofline_model_db))
+
+
+def get_diagnostics_table_args(roofline_model_db):
+  """Creates the diagnostics table arguments from a roofline model proto.
+
+  Args:
+    roofline_model_db: A RooflineModelDatabase proto.
+
+  Returns:
+    A (table_description, data, custom_properties) tuple with one
+    [severity, message] row per diagnostic, infos first, then warnings, then
+    errors, and no custom properties.
+  """
+  table_description = [
+      ("severity", "string", "Severity"),
+      ("message", "string", "Message"),
+  ]
+  diagnostics = roofline_model_db.diagnostics
+  messages_by_severity = (
+      ("INFO", diagnostics.info),
+      ("WARNING", diagnostics.warnings),
+      ("ERROR", diagnostics.errors),
+  )
+  data = [
+      [severity, message]
+      for severity, messages in messages_by_severity
+      for message in messages
+  ]
+  return (table_description, data, {})
+
+
+def generate_diagnostics_table(roofline_model_db):
+  """Creates the diagnostics gviz_api.DataTable from a roofline model proto."""
+  # get_diagnostics_table_args returns
+  # (table_description, data, custom_properties).
+  return gviz_api.DataTable(*get_diagnostics_table_args(roofline_model_db))
+
+
+def to_json(raw_data):
+  """Converts a serialized RooflineModelDatabase string to json.
+
+  Args:
+    raw_data: A serialized RooflineModelDatabase proto.
+
+  Returns:
+    A JSON array string holding the roofline model table followed by the
+    diagnostics table.
+  """
+  roofline_model_db = roofline_model_pb2.RooflineModelDatabase()
+  roofline_model_db.ParseFromString(raw_data)
+  table_builders = (generate_roofline_model_table, generate_diagnostics_table)
+  tables_json = [
+      build_table(roofline_model_db).ToJSon() for build_table in table_builders
+  ]
+  return "[" + ",".join(tables_json) + "]"
diff --git a/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py b/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py
index 18a2529f..41f09993 100644
--- a/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py
+++ b/plugin/tensorboard_plugin_profile/integration_tests/tpu/tensorflow/tpu_tf2_keras_test.py
@@ -109,7 +109,8 @@ def test_tools_are_in_list(self):
         'memory_viewer^',
         'graph_viewer^',
         'hlo_stats^',
-        'inference_profile^'
+        'inference_profile^',
+        'roofline_model^',
     ]
     expected.sort()
     self.assertListEqual(expected, result)
diff --git a/plugin/tensorboard_plugin_profile/profile_plugin.py b/plugin/tensorboard_plugin_profile/profile_plugin.py
index 0516c038..409010f0 100644
--- a/plugin/tensorboard_plugin_profile/profile_plugin.py
+++ b/plugin/tensorboard_plugin_profile/profile_plugin.py
@@ -111,6 +111,7 @@
     'tf_data_bottleneck_analysis^',
     'op_profile^',
     'hlo_stats^',
+    'roofline_model^',
 ]
 
 # XPlane generated tools that support all host mode.
diff --git a/plugin/tensorboard_plugin_profile/protobuf/BUILD b/plugin/tensorboard_plugin_profile/protobuf/BUILD
index c13a65a0..1c79823a 100644
--- a/plugin/tensorboard_plugin_profile/protobuf/BUILD
+++ b/plugin/tensorboard_plugin_profile/protobuf/BUILD
@@ -18,6 +18,7 @@ proto_library(
         "kernel_stats.proto",
         "overview_page.proto",
         "power_metrics.proto",
+        "roofline_model.proto",
         "tf_data_stats.proto",
         "tf_stats.proto",
         "tpu_input_pipeline.proto",
@@ -37,6 +38,7 @@ py_proto_library(
         "kernel_stats.proto",
         "overview_page.proto",
         "power_metrics.proto",
+        "roofline_model.proto",
         "tf_data_stats.proto",
         "tf_stats.proto",
         "tpu_input_pipeline.proto",
diff --git a/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto b/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto
new file mode 100644
index 00000000..6a7bfbe7
--- /dev/null
+++ b/plugin/tensorboard_plugin_profile/protobuf/roofline_model.proto
@@ -0,0 +1,196 @@
+// This proto describes the format of the output profile file from
+// the Roofline Model tool.
+syntax = "proto2";
+
+package tensorflow.profiler.roofline_model;
+
+import "plugin/tensorboard_plugin_profile/protobuf/diagnostics.proto";
+
+// The record type which describes the scope this record captures.
+enum RecordType {
+  INVALID_RECORD_TYPE = 0;
+
+  // Captures the entire profiling duration including incomplete steps.
+  ALL = 1;
+
+  // Captures the average of all complete steps.
+  AVERAGE_STEP = 2;
+
+  // Captures a single step.
+  PER_STEP = 3;
+
+  // Same as ALL but the performance metrics (FLOPS and memory bandwidth) are
+  // derived from the hardware performance counters.
+  ALL_HW = 4;
+}
+
+// A database of RooflineModel records.
+message RooflineModelDatabase {
+  // The device type.
+  optional string device_type = 1;
+
+  // Whether megacore is used.
+  optional bool megacore = 12;
+
+  // Whether the device has shared CMEM.
+  optional bool has_cmem = 8;
+
+  // Whether the device has merged VMEM.
+  optional bool has_merged_vmem = 15;
+
+  // Peak flop rate in GFLOP/s.
+  optional double peak_flop_rate = 2;
+
+  // Peak HBM bandwidth in GiB/s
+  optional double peak_hbm_bw = 9;
+
+  // Peak CMEM read bandwidth in GiB/s
+  optional double peak_cmem_read_bw = 10;
+
+  // Peak CMEM write bandwidth in GiB/s
+  optional double peak_cmem_write_bw = 11;
+
+  // Peak VMEM read bandwidth in GiB/s
+  optional double peak_vmem_read_bw = 13;
+
+  // Peak VMEM write bandwidth in GiB/s
+  optional double peak_vmem_write_bw = 14;
+
+  // All RooflineModel records, one for each HLO operation.
+  repeated RooflineModelRecord roofline_model_record = 5;
+
+  // Error and warning messages for diagnosing profiling issues.
+  optional tensorflow.profiler.Diagnostics diagnostics = 7;
+
+  reserved 3, 4, 6;
+}
+
+// There is one RooflineModelRecord for each HLO operation profiled.
+// Next ID: 43
+message RooflineModelRecord {
+  // The record type.
+  optional RecordType record_type = 18;
+
+  // Step number when record type is PER_STEP. Otherwise, invalid.
+  optional uint32 step_num = 19;
+
+  // The rank by self time
+  optional uint64 rank = 1;
+
+  // The hlo module id of the op
+  optional uint64 hlo_module_id = 35;
+
+  // The HLO category name.
+  optional string hlo_category = 17;
+
+  // The HLO operation name.
+  optional string hlo_name = 2;
+
+  // Number of occurrences of the operation.
+  optional int64 occurrences = 3;
+
+  // Total "accumulated" time in micro-seconds that the operation
+  // took. If this operation has any children operations,
+  // the "accumulated" time includes the time spent inside children.
+  optional double total_time_in_us = 4;
+
+  // Total time per core in micro-seconds.
+  optional double total_time_per_core_in_us = 20;
+
+  // Total time as fraction of the total program time.
+  optional double total_time_in_percentage = 21;
+
+  // Average "accumulated" time in micro-seconds that each
+  // occurrence of the operation took.
+  optional double avg_time_in_us = 5;
+
+  // Total "self" time in micro-seconds that the operation took.
+  // If this operation has any children operations, the "self" time
+  // doesn't include the time spent inside children.
+  optional double total_self_time_in_us = 6;
+
+  // Average "self" time in micro-seconds that the operation took.
+  optional double avg_self_time_in_us = 7;
+
+  // Self-time fraction and its cumulative value, followed by the fraction of
+  // the total "accumulated" time that was caused by DMA stall.
+  optional double total_self_time_as_fraction = 8;
+  optional double cumulative_total_self_time_as_fraction = 9;
+  optional double dma_stall_fraction = 10;
+
+  // Number of total floating-point operations (FLOPs) performed per second
+  // normalized to the bf16 peak performance.
+  optional double measured_flop_rate = 13;
+
+  // Number of total floating point operations (FLOPs) performed per second for
+  // the op.
+  optional double model_flop_rate = 38;
+
+  // Number of total bytes (including both read and write) accessed per
+  // second.
+  optional double measured_memory_bw = 14;
+
+  // HBM bandwidth in GiB/s (including both read and write).
+  optional double hbm_bw = 27;
+
+  // CMEM read bandwidth in GiB/s.
+  optional double cmem_read_bw = 28;
+
+  // CMEM write bandwidth in GiB/s.
+  optional double cmem_write_bw = 29;
+
+  // VMEM read bandwidth in GiB/s.
+  optional double vmem_read_bw = 39;
+
+  // VMEM write bandwidth in GiB/s.
+  optional double vmem_write_bw = 40;
+
+  // Overall operational intensity in FLOP/Byte.
+  optional double operational_intensity = 15;
+
+  // Operational intensity based on HBM in FLOP/Byte.
+  optional double hbm_operational_intensity = 30;
+
+  // Operational intensity based on CMEM read in FLOP/Byte.
+  optional double cmem_read_operational_intensity = 31;
+
+  // Operational intensity based on CMEM write in FLOP/Byte.
+  optional double cmem_write_operational_intensity = 32;
+
+  // Operational intensity based on VMEM read in FLOP/Byte.
+  optional double vmem_read_operational_intensity = 41;
+
+  // Operational intensity based on VMEM write in FLOP/Byte.
+  optional double vmem_write_operational_intensity = 42;
+
+  // Operational intensity based on the bottleneck resource in FLOP/Byte.
+  optional double bottleneck_operational_intensity = 33;
+
+  // Whether this operation is "Compute", "HBM", "CMEM Read", "CMEM Write"
+  // bound, according to the Roofline Model.
+  optional string bound_by = 16;
+
+  // The optimal flop rate calculated as
+  // (operational intensity) * (peak memory bw)
+  optional double optimal_flop_rate = 22;
+
+  // Roofline efficiency.
+  optional double roofline_efficiency = 34;
+
+  // Percentage of measured flop rate relative to the hardware limit.
+  optional double flop_rate_relative_to_hw_limit = 24;
+
+  // Percentage of measured memory bandwidth relative to the hardware limit.
+  optional double memory_bw_relative_to_hw_limit = 25;
+
+  // Whether the record is calculated including infeed and outfeed ops.
+  optional bool include_infeed_outfeed = 26;
+
+  // Flops for the record
+  optional uint64 flops = 36;
+
+  // Bytes accessed for the record
+  optional uint64 bytes_accessed = 37;
+
+  reserved 11, 12, 23;
+}