From 86fd79da7a0060e19258fc8f76ee0ecd376735d7 Mon Sep 17 00:00:00 2001 From: mariecwhite Date: Tue, 5 Sep 2023 03:54:47 +0000 Subject: [PATCH] Add GPT2 GGML Android Benchmarks --- .github/workflows/run_ggml_benchmark.yml | 128 ++++++++- .../openxla/benchmark/devices/__init__.py | 4 +- .../benchmark/devices/mobile_devices.py | 22 ++ .../docker/dockerfiles/android.Dockerfile | 20 ++ devtools/docker/image_deps.json | 3 +- devtools/docker/prod_digests.txt | 1 + experimental/ggml/benchmark_android.py | 261 ++++++++++++++++++ experimental/ggml/benchmark_ggml.sh | 51 ++-- experimental/ggml/build_ggml.sh | 68 +++++ experimental/ggml/requirements.txt | 3 +- .../ggml/set_android_scaling_governor.sh | 51 ++++ 11 files changed, 572 insertions(+), 40 deletions(-) create mode 100644 common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py create mode 100644 devtools/docker/dockerfiles/android.Dockerfile create mode 100755 experimental/ggml/benchmark_android.py create mode 100755 experimental/ggml/build_ggml.sh create mode 100755 experimental/ggml/set_android_scaling_governor.sh diff --git a/.github/workflows/run_ggml_benchmark.yml b/.github/workflows/run_ggml_benchmark.yml index ff837cb0..f981e275 100644 --- a/.github/workflows/run_ggml_benchmark.yml +++ b/.github/workflows/run_ggml_benchmark.yml @@ -66,6 +66,7 @@ jobs: BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} RESULTS_DIR: results-dir TARGET_DEVICE: c2-standard-16 + GGML_BUILD_DIR: build-dir steps: - name: "Checking out PR repository" uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 @@ -74,7 +75,14 @@ jobs: run: | echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}" mkdir "${RESULTS_DIR}" - - name: "Benchmarking GGML CPU" + - name: "Building GGML CPU" + run: | + docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ + "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \ + ./experimental/ggml/build_ggml.sh \ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" + - name: "Benchmarking GGML" env: GGML_RESULTS_JSON: ggml.json RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }} @@ -83,6 +91,122 @@ jobs: docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \ ./experimental/ggml/benchmark_ggml.sh \ - "${TARGET_DEVICE}"\ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" \ "${RESULTS_PATH}" gcloud storage cp "${RESULTS_PATH}" "${RESULTS_GCS_DIR}/" + + build_ggml_for_android: + needs: [setup] + runs-on: + - self-hosted # must come first + - runner-group=${{ needs.setup.outputs.runner-group }} + - environment=prod + - cpu + - os-family=Linux + env: + GGML_BUILD_DIR: ggml-build + TARGET_DEVICE: pixel-6-pro + BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} + outputs: + ggml-build-dir: ${{ env.GGML_BUILD_DIR }} + ggml-build-dir-archive: ${{ steps.archive.outputs.ggml-build-dir-archive }} + ggml-build-dir-gcs-artifact: ${{ steps.upload.outputs.ggml-build-dir-gcs-artifact }} + steps: + - name: "Checking out PR repository" + uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - name: "Building GGML" + run: | + mkdir -p "${GGML_BUILD_DIR}" + docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ + 
"gcr.io/iree-oss/openxla-benchmark/android@sha256:3211ade3856dfd46469e573f17baaf367f9c0830dfcc70c6d85891447cadc39e" \ + ./experimental/ggml/build_ggml.sh \ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" + - name: "Creating build dir archive" + id: archive + env: + GGML_BUILD_DIR_ARCHIVE: ${{ env.GGML_BUILD_DIR }}.tgz + run: | + tar -zcvf ${GGML_BUILD_DIR_ARCHIVE} ${GGML_BUILD_DIR} + echo "ggml-build-dir-archive=${GGML_BUILD_DIR_ARCHIVE}" >> "${GITHUB_OUTPUT}" + - name: "Uploading build dir archive" + id: upload + env: + GGML_BUILD_DIR_ARCHIVE: ${{ steps.archive.outputs.ggml-build-dir-archive }} + GGML_BUILD_DIR_GCS_ARTIFACT: ${{ env.BENCHMARK_GCS_DIR }}/${{ steps.archive.outputs.ggml-build-dir-archive }} + run: | + gcloud storage cp "${GGML_BUILD_DIR_ARCHIVE}" "${GGML_BUILD_DIR_GCS_ARTIFACT}" + echo "ggml-build-dir-gcs-artifact=${GGML_BUILD_DIR_GCS_ARTIFACT}" >> "${GITHUB_OUTPUT}" + + benchmark_on_pixel-6-pro: + needs: [setup, build_ggml_for_android] + runs-on: + - self-hosted # must come first + - runner-group=${{ needs.setup.outputs.runner-group }} + - environment=prod + - machine-type=pixel-6-pro + env: + BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} + RESULTS_DIR: results-dir + TARGET_DEVICE: pixel-6-pro + GGML_BUILD_DIR: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir }} + GGML_BUILD_DIR_ARCHIVE: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir-archive }} + GGML_BUILD_DIR_GCS_ARTIFACT: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir-gcs-artifact }} + steps: + - name: "Checking out PR repository" + uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - name: "Setup" + id: setup + run: | + echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}" + mkdir "${RESULTS_DIR}" + - name: "Downloading and unpacking GGML build" + run: | + gcloud storage cp "${GGML_BUILD_DIR_GCS_ARTIFACT}" "${GGML_BUILD_DIR_ARCHIVE}" + tar -xvf "${GGML_BUILD_DIR_ARCHIVE}" + - name: "Benchmarking GGML on Android" + env: + GGML_RESULTS_JSON: ggml-android.json + RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }} + run: | + RESULTS_PATH="${RESULTS_DIR}/${GGML_RESULTS_JSON}" + ./experimental/ggml/benchmark_ggml.sh "${TARGET_DEVICE}" "${GGML_BUILD_DIR}" "${RESULTS_PATH}" + cat "${RESULTS_PATH}" + +# adb push "./experimental/ggml/set_android_scaling_governor.sh" "/data/local/tmp" +# adb shell "chmod +x /data/local/tmp/set_android_scaling_governor.sh" +# adb shell "su root sh /data/local/tmp/set_android_scaling_governor.sh performance" +# +# adb push "${GGML_BUILD_DIR}/bin/gpt-2" "/data/local/tmp" +# adb shell "chmod +x /data/local/tmp/gpt-2" +# adb push "${GGML_BUILD_DIR}/models/gpt-2-117M/ggml-model-f32.bin" "/data/local/tmp" +# +# echo "Benchmarking ggml-model-f32.bin with 1 thread" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 1' +# +# echo "Benchmarking ggml-model-f32.bin with 4 threads" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 4' +# +# echo "Benchmarking ggml-model-f32.bin with 8 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 8' +# +# echo "Benchmarking ggml-model-f32.bin with 16 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 16' +# +# echo "Removing 
ggml-model-f32.bin" +# adb shell "rm /data/local/tmp/ggml-model-f32.bin" +# +# adb push "${GGML_BUILD_DIR}/models/gpt-2-117M/ggml-model-f16.bin" "/data/local/tmp" +# +# echo "Benchmarking ggml-model-f16.bin with 1 thread" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 1' +# +# echo "Benchmarking ggml-model-f16.bin with 4 threads" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 4' +# +# echo "Benchmarking ggml-model-f16.bin with 8 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 8' +# +# echo "Benchmarking ggml-model-f16.bin with 16 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 16' diff --git a/common_benchmark_suite/openxla/benchmark/devices/__init__.py b/common_benchmark_suite/openxla/benchmark/devices/__init__.py index 049b6fd1..c9fd5892 100644 --- a/common_benchmark_suite/openxla/benchmark/devices/__init__.py +++ b/common_benchmark_suite/openxla/benchmark/devices/__init__.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from . import gcp_devices, host_devices +from . import gcp_devices, host_devices, mobile_devices # All defined device specs. -ALL_DEVICES = gcp_devices.ALL_DEVICES + host_devices.ALL_DEVICES +ALL_DEVICES = gcp_devices.ALL_DEVICES + host_devices.ALL_DEVICES + mobile_devices.ALL_DEVICES diff --git a/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py b/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py new file mode 100644 index 00000000..ca716280 --- /dev/null +++ b/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py @@ -0,0 +1,22 @@ +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from openxla.benchmark import def_types + +MOBILE_PIXEL_6_PRO = def_types.DeviceSpec( + name="pixel-6-pro", + host_type="mobile", + host_model="pixel-6-pro", + host_environment="android", + accelerator_type="cpu", + accelerator_model="armv8.2-a", + accelerator_architecture="armv8.2-a", + accelerator_attributes={ + "num_of_cores": 8, + }, +) + +ALL_DEVICES = [MOBILE_PIXEL_6_PRO] \ No newline at end of file diff --git a/devtools/docker/dockerfiles/android.Dockerfile b/devtools/docker/dockerfiles/android.Dockerfile new file mode 100644 index 00000000..b6272b40 --- /dev/null +++ b/devtools/docker/dockerfiles/android.Dockerfile @@ -0,0 +1,20 @@ +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# An image for cross-compiling towards Android. 
+ +FROM gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251 + +ARG NDK_VERSION=r25c +WORKDIR /install-ndk + +ENV ANDROID_NDK "/usr/src/android-ndk-${NDK_VERSION}" + +RUN wget -q "https://dl.google.com/android/repository/android-ndk-${NDK_VERSION}-linux.zip" \ + && unzip -q "android-ndk-${NDK_VERSION}-linux.zip" -d /usr/src/ \ + && rm -rf /install-ndk + +WORKDIR / diff --git a/devtools/docker/image_deps.json b/devtools/docker/image_deps.json index 7215e309..243f4320 100644 --- a/devtools/docker/image_deps.json +++ b/devtools/docker/image_deps.json @@ -3,5 +3,6 @@ "cuda11.8-cudnn8.9": ["base"], "db_import": [], "mmperf": ["base"], - "convperf": ["base"] + "convperf": ["base"], + "android": ["base"] } diff --git a/devtools/docker/prod_digests.txt b/devtools/docker/prod_digests.txt index 013a9d5d..4a2f94a0 100644 --- a/devtools/docker/prod_digests.txt +++ b/devtools/docker/prod_digests.txt @@ -3,3 +3,4 @@ gcr.io/iree-oss/openxla-benchmark/cuda11.8-cudnn8.9@sha256:f43984cd6c16ad1faad4d gcr.io/iree-oss/openxla-benchmark/db_import@sha256:3de8a702b51ca1906fc2ef5bab2415a79e46bc132f2ceba994215539dd0ecdd4 gcr.io/iree-oss/openxla-benchmark/mmperf@sha256:c972ce5b2144de0786f103611fecbd88d93dd45ecd068f8c97d98c08677cee57 gcr.io/iree-oss/openxla-benchmark/convperf@sha256:0807d5e8144900752cfae72f3aa4d12530b408f73fc6f010a6cbad11cc09832c +gcr.io/iree-oss/openxla-benchmark/android@sha256:3211ade3856dfd46469e573f17baaf367f9c0830dfcc70c6d85891447cadc39e diff --git a/experimental/ggml/benchmark_android.py b/experimental/ggml/benchmark_android.py new file mode 100755 index 00000000..0295e0b6 --- /dev/null +++ b/experimental/ggml/benchmark_android.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import dataclasses +import json +import pathlib +import re +import statistics +import subprocess +import sys + +# Add comparative_benchmark dir to the search path. +sys.path.insert( + 0, str(pathlib.Path(__file__).parents[2] / "comparative_benchmark")) +import utils + +# Add common_benchmark_suite dir to the search path. +sys.path.insert( + 0, str(pathlib.Path(__file__).parents[2] / "common_benchmark_suite")) +from openxla.benchmark import def_types, devices + +ALL_DEVICE_NAMES = [device.name for device in devices.ALL_DEVICES] + + +def _parse_output(output_text): + # Example output. 
# main: mem per token = 2011380 bytes
+  # main: load time = 120.92 ms
+  # main: sample time = 73.86 ms
+  # main: first predict time = 14.71 ms
+  # main: loop predict time = 2261.72 ms / 11.20 ms per token
+  # main: predict time = 2276.43 ms / 11.21 ms per token
+  # main: total time = 2494.66 ms
+
+  LOAD_TIME_REGEXP = re.compile(r"main: load time = (.+) ms")
+  match = LOAD_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse load time")
+    return None
+  load_time_ms = float(match.group(1))
+
+  SAMPLE_TIME_REGEXP = re.compile(r"main: sample time = (.+) ms")
+  match = SAMPLE_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse sample time")
+    return None
+  sample_time_ms = float(match.group(1))
+
+  FIRST_PREDICTION_TIME_REGEXP = re.compile(
+      r"main: first predict time = (.+) ms")
+  match = FIRST_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse first prediction time")
+    return None
+  first_prediction_ms = float(match.group(1))
+
+  LOOP_PREDICTION_TIME_REGEXP = re.compile(
+      r"main: loop predict time = .+ ms / (.+) ms per token")
+  match = LOOP_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse loop prediction time")
+    return None
+  loop_prediction_ms = float(match.group(1))
+
+  TOTAL_PREDICTION_TIME_REGEXP = re.compile(
+      r"main: predict time = (.+) ms / .+ ms per token")
+  match = TOTAL_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse total prediction time")
+    return None
+  total_prediction_ms = float(match.group(1))
+
+  E2E_TIME_REGEXP = re.compile(r"main: total time = (.+) ms")
+  match = E2E_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse total time")
+    return None
+  e2e_prediction_ms = float(match.group(1))
+
+  return {
+      "load_time_ms": load_time_ms,
+      "first_prediction_ms": first_prediction_ms,
+      "loop_prediction_ms": loop_prediction_ms,
+      "total_prediction_ms": total_prediction_ms,
+      "sample_time_ms": sample_time_ms,
+      "e2e_prediction_ms": e2e_prediction_ms,
+  }
+
+
+def _parse_arguments() -> argparse.Namespace:
+  parser = argparse.ArgumentParser(description="Run GGML benchmarks.")
+  parser.add_argument("-name",
+                      "--benchmark_name",
+                      type=str,
+                      required=True,
+                      help="The regex pattern to match benchmark names.")
+  parser.add_argument(
+      "-b",
+      "--benchmark_binary",
+      type=pathlib.Path,
+      required=True,
+      help="Path to benchmark binary e.g. /tmp/ggml/build/bin/gpt-2")
+  parser.add_argument(
+      "-m",
+      "--model",
+      type=pathlib.Path,
+      required=True,
+      help=
+      "The GGML model to benchmark e.g. 
/tmp/ggml/build/models/gpt-2-117M/ggml-model.bin" + ) + parser.add_argument("--data_type", type=str, help="The model data type.") + parser.add_argument("-p", + "--prompt", + type=str, + default="Once upon a time", + help="The input prompt to the model.") + parser.add_argument("-s", + "--seed", + type=int, + default=0, + help="The seed to use for the RNG.") + parser.add_argument("-t", + "--threads", + type=int, + default=8, + help="The number of threads to use.") + parser.add_argument("-o", + "--output", + type=pathlib.Path, + required=True, + help="JSON file path to merge the results.") + parser.add_argument("-device", + "--target_device", + dest="target_device_name", + type=str, + required=True, + choices=ALL_DEVICE_NAMES, + help="The target device to benchmark.") + parser.add_argument("-w", + "--warmup_iterations", + type=int, + default=5, + help="The number of warmup steps.") + parser.add_argument("-iter", + "--iterations", + type=int, + default=100, + help="The number of iterations to benchmark.") + parser.add_argument("--verbose", + action="store_true", + help="Show verbose messages.") + return parser.parse_args() + + +def main(benchmark_name: str, benchmark_binary: pathlib.Path, + warmup_iterations: int, iterations: int, model: pathlib.Path, + data_type: str, prompt: str, seed: int, threads: int, + output: pathlib.Path, target_device_name: str, verbose: bool): + + try: + target_device = next(device for device in devices.ALL_DEVICES + if device.name == target_device_name) + except StopIteration: + raise ValueError(f'Target device "{target_device_name}" is not defined.' + f' Available device options:\n{ALL_DEVICE_NAMES}') + + benchmark_definition = { + "benchmark_name": benchmark_name, + "framework": str(def_types.ModelFrameworkType.GGML), + "data_type": data_type, + "batch_size": 1, + "compiler": str(def_types.ModelFrameworkType.GGML), + "device": target_device.name, + "num_threads": threads, + "warmup_iterations": warmup_iterations, + "num_iterations": iterations, + "tags": ["gpt2", "ggml"], + } + + # Push artifacts to the Android device. + subprocess.run(["adb", "push", benchmark_binary, "/data/local/tmp"]) + subprocess.run([ + "adb", "shell", "chmod", "+x", f"/data/local/tmp/{benchmark_binary.name}" + ]) + subprocess.run(["adb", "push", model, "/data/local/tmp"]) + + # Run benchmark. + cmd = [ + "adb", "shell", f"/data/local/tmp/{benchmark_binary.name}", "--model", + f"/data/local/tmp/{model.name}", "--prompt", f"\"{prompt}\"", "--seed", + f"{seed}", "--threads", f"{threads}" + ] + print(f"cmd: {cmd}") + + # Run warmup iterations. + for i in range(warmup_iterations): + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + load_times = [] + first_prediction_times = [] + loop_prediction_times = [] + total_prediction_times = [] + sample_times = [] + e2e_prediction_times = [] + + # Run iterations. 
for i in range(iterations):
+    raw_result = subprocess.run(cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.STDOUT)
+    raw_output = raw_result.stdout.decode("utf-8")
+    print(f"raw_output: {raw_output}")
+    metrics = _parse_output(raw_output)
+    print(f"metrics: {metrics}")
+    # Skip any iteration whose output could not be parsed.
+    if metrics is None:
+      continue
+
+    load_times.append(metrics["load_time_ms"])
+    first_prediction_times.append(metrics["first_prediction_ms"])
+    loop_prediction_times.append(metrics["loop_prediction_ms"])
+    total_prediction_times.append(metrics["total_prediction_ms"])
+    sample_times.append(metrics["sample_time_ms"])
+    e2e_prediction_times.append(metrics["e2e_prediction_ms"])
+
+  benchmark_metrics = {
+      "median_load_time_ms":
+          statistics.median(load_times) if load_times else None,
+      "median_first_prediction_ms":
+          statistics.median(first_prediction_times)
+          if first_prediction_times else None,
+      "median_loop_prediction_ms":
+          statistics.median(loop_prediction_times)
+          if loop_prediction_times else None,
+      "median_total_prediction_ms":
+          statistics.median(total_prediction_times)
+          if total_prediction_times else None,
+      "median_sample_time_ms":
+          statistics.median(sample_times) if sample_times else None,
+      "median_e2e_prediction_ms":
+          statistics.median(e2e_prediction_times)
+          if e2e_prediction_times else None,
+  }
+
+  benchmark_result = utils.BenchmarkResult(
+      definition=benchmark_definition,
+      metrics={
+          "compiler_level": benchmark_metrics,
+      },
+  )
+
+  if verbose:
+    print(json.dumps(dataclasses.asdict(benchmark_result), indent=2))
+  utils.append_benchmark_result(output, benchmark_result)
+
+
+if __name__ == "__main__":
+  main(**vars(_parse_arguments()))
diff --git a/experimental/ggml/benchmark_ggml.sh b/experimental/ggml/benchmark_ggml.sh
index 6cb28dc1..01757e47 100755
--- a/experimental/ggml/benchmark_ggml.sh
+++ b/experimental/ggml/benchmark_ggml.sh
@@ -11,20 +11,20 @@
 # OOBI_VENV_DIR: path to create Python virtualenv, default: ggml-benchmarks.venv
 # OOBI_TARGET_DEVICE: target benchmark device, can also be specified the first
 # argument.
+# OOBI_BUILD_DIR: path to the GGML build directory, or the second argument.
 # OOBI_OUTPUT: path to output benchmark results, can also be specified the
-# second argument.
+# third argument.
-# OOBI_SCRATCH_DIR: the directory to place temporary benchmarking artifacts.
 #
 # Example usage:
-# ./benchmark_ggml.sh c2-standard-16 /tmp/results.json
+# ./benchmark_ggml.sh c2-standard-16 /tmp/ggml-build /tmp/results.json
 
 set -xeuo pipefail
 
 VENV_DIR="${OOBI_VENV_DIR:-ggml-benchmarks.venv}"
-ROOT_DIR="${OOBI_SCRATCH_DIR:-/tmp}"
 PYTHON="${PYTHON:-/usr/bin/python3}"
-TARGET_DEVICE="${1:-${OOBI_TARGET_DEVICE}}"
-OUTPUT_PATH="${2:-${OOBI_OUTPUT}}"
+TARGET_DEVICE_NAME="${1:-${OOBI_TARGET_DEVICE}}"
+BUILD_DIR="${2:-${OOBI_BUILD_DIR}}"
+OUTPUT_PATH="${3:-${OOBI_OUTPUT}}"
 
 TD="$(cd $(dirname $0) && pwd)"
 
@@ -35,27 +35,7 @@ VENV_DIR="${VENV_DIR}" PYTHON="${PYTHON}" source "${TD}/setup_venv.sh"
 OUTPUT_PATH="$(realpath ${OUTPUT_PATH})"
 
 "${TD}/../../comparative_benchmark/scripts/create_results_json.sh" "${OUTPUT_PATH}"
-pushd "${ROOT_DIR}"
-
-# We clone a fork of ggml which includes additional benchmark logging.
-git clone --branch benchmark https://github.com/mariecwhite/ggml.git
-pushd ggml
-
-# Build
-mkdir build
-pushd build
-cmake ..
-make -j8
-
-# Generate FP32, FP16 and INT4 versions of GPT2 117M (Small).
-GPT_VARIANT="117M"
-../examples/gpt-2/download-model.sh "${GPT_VARIANT}"
-# Generate FP32.
-python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 0
-# Generate FP16.
-python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 1
-# Generate INT4.
-./bin/gpt-2-quantize models/gpt-2-${GPT_VARIANT}/ggml-model-f16.bin models/gpt-2-${GPT_VARIANT}/ggml-model-q4_0.bin 2
+pushd "${BUILD_DIR}"
 
 PROMPT="Once upon a time"
 BENCHMARK_BINARY="$(realpath bin/gpt-2)"
@@ -68,24 +48,29 @@ MODEL="$(realpath models/gpt-2-117M/ggml-model-f32.bin)"
 declare -a BENCHMARK_NAMES=(
   "models/GPT2LMHEAD_FP32_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
   "models/GPT2LMHEAD_FP16_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
-  "models/GPT2LMHEAD_INT4_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
 )
 
 declare -a MODELS=(
   ggml-model-f32.bin
   ggml-model-f16.bin
-  ggml-model-q4_0.bin
+  #ggml-model-q4_0.bin
 )
 
 declare -a DATA_TYPES=(
   "fp32"
   "fp16"
-  "int4"
+  #"int4"
 )
 
+if [[ "${TARGET_DEVICE_NAME}" =~ ^(pixel-4|pixel-6-pro|moto-edge-x30)$ ]]; then
+BENCHMARK_SCRIPT="benchmark_android.py"
+else
+BENCHMARK_SCRIPT="benchmark.py"
+fi
+
 for i in ${!BENCHMARK_NAMES[@]}; do
   MODEL="$(realpath models/gpt-2-117M/${MODELS[$i]})"
   for threads in "${NUM_THREADS[@]}"; do
-    "${TD}/benchmark.py" \
+    "${TD}/${BENCHMARK_SCRIPT}" \
       --benchmark_name "${BENCHMARK_NAMES[$i]}" \
       --warmup_iterations "${WARMUP_ITERAIONS}" \
       --iterations "${NUM_ITERATIONS}" \
@@ -96,11 +81,9 @@ for i in ${!BENCHMARK_NAMES[@]}; do
       --seed 0 \
       --threads "${threads}" \
       --output "${OUTPUT_PATH}" \
-      --target_device "${TARGET_DEVICE}" \
+      --target_device "${TARGET_DEVICE_NAME}" \
       --verbose
   done
 done
 
-popd # build
-popd # ggml
-popd # ROOT_DIR
+popd # BUILD_DIR
diff --git a/experimental/ggml/build_ggml.sh b/experimental/ggml/build_ggml.sh
new file mode 100755
index 00000000..1e4202b0
--- /dev/null
+++ b/experimental/ggml/build_ggml.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Environment variables:
+# PYTHON: Python interpreter, default: /usr/bin/python3
+# ANDROID_NDK: the path to the Android NDK if building for Android.
+# OOBI_VENV_DIR: path to create Python virtualenv, default: ggml-build.venv
+# OOBI_TARGET_DEVICE: target benchmark device, can also be specified the first
+# argument.
+# OOBI_SCRATCH_DIR: the directory to place temporary build artifacts.
+#
+# The build output directory can be specified as the second argument,
+# default: /tmp/ggml-build.
+#
+# Example usage:
+# ./build_ggml.sh pixel-6-pro /tmp/ggml-build
+
+set -xeuo pipefail
+
+VENV_DIR="${OOBI_VENV_DIR:-ggml-build.venv}"
+ROOT_DIR="${OOBI_SCRATCH_DIR:-/tmp}"
+PYTHON="${PYTHON:-/usr/bin/python3}"
+TARGET_DEVICE_NAME="${1:-${OOBI_TARGET_DEVICE}}"
+BUILD_DIR="${2:-/tmp/ggml-build}"
+
+TD="$(cd $(dirname $0) && pwd)"
+BUILD_DIR="$(realpath "${BUILD_DIR}")"
+
+# Setup virtual environment.
+VENV_DIR="${VENV_DIR}" PYTHON="${PYTHON}" source "${TD}/setup_venv.sh"
+
+pushd "${ROOT_DIR}"
+
+# We clone a fork of ggml which includes additional benchmark logging.
+git clone --branch benchmark https://github.com/mariecwhite/ggml.git
+pushd ggml
+
+REPO_DIR="$(pwd)"
+
+# Build.
+if [[ "${TARGET_DEVICE_NAME}" =~ ^(pixel-4|pixel-6-pro|moto-edge-x30)$ ]]; then
+cmake -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-23 \
+  -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod \
+  -B "${BUILD_DIR}" .
+cmake --build "${BUILD_DIR}" -t gpt-2 gpt-2-quantize
+else
+cmake -G Ninja -B "${BUILD_DIR}" .
+cmake --build "${BUILD_DIR}" -t gpt-2 gpt-2-quantize
+fi
+
+popd # ggml
+popd # ROOT_DIR
+
+# Generate FP32 and FP16 versions of GPT2 117M (Small).
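+# Note: the paths below are relative to ${BUILD_DIR}; download-model.sh fetches
+# the GPT-2 checkpoint into models/gpt-2-${GPT_VARIANT}/ and the conversion
+# script (run inside the venv set up above) writes ggml-model-f32.bin and
+# ggml-model-f16.bin alongside it, which is where benchmark_ggml.sh expects
+# to find them.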
+pushd "${BUILD_DIR}" + +GPT_VARIANT="117M" +${REPO_DIR}/examples/gpt-2/download-model.sh "${GPT_VARIANT}" +# Generate FP32. +python ${REPO_DIR}/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 0 +# Generate FP16. +python ${REPO_DIR}/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 1 +# Generate INT4. Keep this disabled until we want to use it. +#./bin/gpt-2-quantize models/gpt-2-${GPT_VARIANT}/ggml-model-f16.bin models/gpt-2-${GPT_VARIANT}/ggml-model-q4_0.bin 2 + +popd # BUILD_DIR diff --git a/experimental/ggml/requirements.txt b/experimental/ggml/requirements.txt index c2bbfa1f..7947059c 100644 --- a/experimental/ggml/requirements.txt +++ b/experimental/ggml/requirements.txt @@ -1,2 +1,3 @@ numpy -tensorflow \ No newline at end of file +tensorflow +requests diff --git a/experimental/ggml/set_android_scaling_governor.sh b/experimental/ggml/set_android_scaling_governor.sh new file mode 100755 index 00000000..0a297304 --- /dev/null +++ b/experimental/ggml/set_android_scaling_governor.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Runs on an android device itself to set the frequency scaling governor for all +# CPUs (default performance). + +################################### WARNING #################################### +# This will overheat the phone if it's not on a cooling plate, resulting in # +# thermal throttling. To prevent anything catching on fire, the actual CPU # +# frequencies will be throttled to below the maximum, skewing your results. # +################################################################################ + +set -euo pipefail + +GOVERNOR="${1:-performance}" + +echo "CPU info (before changing governor):" +echo 'cpu\tgovernor\tcur\tmin\tmax' +echo "------------------------------------------------" +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "cpu${i}" | paste \ + - \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \ +done + +echo "Setting CPU frequency governor to ${GOVERNOR}" + +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "${GOVERNOR}" > \ + "/sys/devices/system/cpu/cpu${i?}/cpufreq/scaling_governor"; \ +done + +echo "CPU info (after changing governor):" +echo 'cpu\tgovernor\tcur\tmin\tmax' +echo "------------------------------------------------" +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "cpu${i}" | paste \ + - \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \ +done