From 974bdbe84a011acf358a09cde273ace985340890 Mon Sep 17 00:00:00 2001 From: mariecwhite Date: Tue, 5 Sep 2023 03:54:47 +0000 Subject: [PATCH] Add GPT2 GGML Android Benchmarks --- .github/workflows/run_ggml_benchmark.yml | 128 +++++++++++++++- .../openxla/benchmark/devices/__init__.py | 5 +- .../benchmark/devices/mobile_devices.py | 22 +++ .../docker/dockerfiles/android.Dockerfile | 20 +++ devtools/docker/image_deps.json | 3 +- devtools/docker/prod_digests.txt | 1 + experimental/ggml/benchmark_ggml.sh | 96 ++++++------ .../ggml/{benchmark.py => benchmark_lib.py} | 142 ++++++------------ experimental/ggml/build_ggml.sh | 72 +++++++++ experimental/ggml/requirements.txt | 3 +- experimental/ggml/run_benchmarks.py | 66 ++++++++ experimental/ggml/run_benchmarks_android.py | 92 ++++++++++++ .../ggml/set_android_scaling_governor.sh | 51 +++++++ 13 files changed, 554 insertions(+), 147 deletions(-) create mode 100644 common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py create mode 100644 devtools/docker/dockerfiles/android.Dockerfile rename experimental/ggml/{benchmark.py => benchmark_lib.py} (61%) mode change 100755 => 100644 create mode 100755 experimental/ggml/build_ggml.sh create mode 100755 experimental/ggml/run_benchmarks.py create mode 100755 experimental/ggml/run_benchmarks_android.py create mode 100755 experimental/ggml/set_android_scaling_governor.sh diff --git a/.github/workflows/run_ggml_benchmark.yml b/.github/workflows/run_ggml_benchmark.yml index ff837cb0..f981e275 100644 --- a/.github/workflows/run_ggml_benchmark.yml +++ b/.github/workflows/run_ggml_benchmark.yml @@ -66,6 +66,7 @@ jobs: BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} RESULTS_DIR: results-dir TARGET_DEVICE: c2-standard-16 + GGML_BUILD_DIR: build-dir steps: - name: "Checking out PR repository" uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 @@ -74,7 +75,14 @@ jobs: run: | echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" 
>> "${GITHUB_OUTPUT}" mkdir "${RESULTS_DIR}" - - name: "Benchmarking GGML CPU" + - name: "Building GGML CPU" + run: | + docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ + "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \ + ./experimental/ggml/build_ggml.sh \ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" + - name: "Benchmarking GGML" env: GGML_RESULTS_JSON: ggml.json RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }} @@ -83,6 +91,122 @@ jobs: docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \ ./experimental/ggml/benchmark_ggml.sh \ - "${TARGET_DEVICE}"\ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" \ "${RESULTS_PATH}" gcloud storage cp "${RESULTS_PATH}" "${RESULTS_GCS_DIR}/" + + build_ggml_for_android: + needs: [setup] + runs-on: + - self-hosted # must come first + - runner-group=${{ needs.setup.outputs.runner-group }} + - environment=prod + - cpu + - os-family=Linux + env: + GGML_BUILD_DIR: ggml-build + TARGET_DEVICE: pixel-6-pro + BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} + outputs: + ggml-build-dir: ${{ env.GGML_BUILD_DIR }} + ggml-build-dir-archive: ${{ steps.archive.outputs.ggml-build-dir-archive }} + ggml-build-dir-gcs-artifact: ${{ steps.upload.outputs.ggml-build-dir-gcs-artifact }} + steps: + - name: "Checking out PR repository" + uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - name: "Building GGML" + run: | + mkdir -p "${GGML_BUILD_DIR}" + docker run --mount="type=bind,src="${PWD}",target=/work" --workdir="/work" \ + "gcr.io/iree-oss/openxla-benchmark/android@sha256:3211ade3856dfd46469e573f17baaf367f9c0830dfcc70c6d85891447cadc39e" \ + ./experimental/ggml/build_ggml.sh \ + "${TARGET_DEVICE}" \ + "${GGML_BUILD_DIR}" + - name: "Creating build dir archive" + id: archive + 
env: + GGML_BUILD_DIR_ARCHIVE: ${{ env.GGML_BUILD_DIR }}.tgz + run: | + tar -zcvf ${GGML_BUILD_DIR_ARCHIVE} ${GGML_BUILD_DIR} + echo "ggml-build-dir-archive=${GGML_BUILD_DIR_ARCHIVE}" >> "${GITHUB_OUTPUT}" + - name: "Uploading build dir archive" + id: upload + env: + GGML_BUILD_DIR_ARCHIVE: ${{ steps.archive.outputs.ggml-build-dir-archive }} + GGML_BUILD_DIR_GCS_ARTIFACT: ${{ env.BENCHMARK_GCS_DIR }}/${{ steps.archive.outputs.ggml-build-dir-archive }} + run: | + gcloud storage cp "${GGML_BUILD_DIR_ARCHIVE}" "${GGML_BUILD_DIR_GCS_ARTIFACT}" + echo "ggml-build-dir-gcs-artifact=${GGML_BUILD_DIR_GCS_ARTIFACT}" >> "${GITHUB_OUTPUT}" + + benchmark_on_pixel-6-pro: + needs: [setup, build_ggml_for_android] + runs-on: + - self-hosted # must come first + - runner-group=${{ needs.setup.outputs.runner-group }} + - environment=prod + - machine-type=pixel-6-pro + env: + BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }} + RESULTS_DIR: results-dir + TARGET_DEVICE: pixel-6-pro + GGML_BUILD_DIR: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir }} + GGML_BUILD_DIR_ARCHIVE: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir-archive }} + GGML_BUILD_DIR_GCS_ARTIFACT: ${{ needs.build_ggml_for_android.outputs.ggml-build-dir-gcs-artifact }} + steps: + - name: "Checking out PR repository" + uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - name: "Setup" + id: setup + run: | + echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}" + mkdir "${RESULTS_DIR}" + - name: "Downloading and unpacking GGML build" + run: | + gcloud storage cp "${GGML_BUILD_DIR_GCS_ARTIFACT}" "${GGML_BUILD_DIR_ARCHIVE}" + tar -xvf "${GGML_BUILD_DIR_ARCHIVE}" + - name: "Benchmarking GGML on Android" + env: + GGML_RESULTS_JSON: ggml-android.json + RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }} + run: | + RESULTS_PATH="${RESULTS_DIR}/${GGML_RESULTS_JSON}" + ./experimental/ggml/benchmark_ggml.sh "${TARGET_DEVICE}" 
"${GGML_BUILD_DIR}" "${RESULTS_PATH}" + cat "${RESULTS_PATH}" + +# adb push "./experimental/ggml/set_android_scaling_governor.sh" "/data/local/tmp" +# adb shell "chmod +x /data/local/tmp/set_android_scaling_governor.sh" +# adb shell "su root sh /data/local/tmp/set_android_scaling_governor.sh performance" +# +# adb push "${GGML_BUILD_DIR}/bin/gpt-2" "/data/local/tmp" +# adb shell "chmod +x /data/local/tmp/gpt-2" +# adb push "${GGML_BUILD_DIR}/models/gpt-2-117M/ggml-model-f32.bin" "/data/local/tmp" +# +# echo "Benchmarking ggml-model-f32.bin with 1 thread" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 1' +# +# echo "Benchmarking ggml-model-f32.bin with 4 threads" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 4' +# +# echo "Benchmarking ggml-model-f32.bin with 8 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 8' +# +# echo "Benchmarking ggml-model-f32.bin with 16 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f32.bin --prompt "Once upon a time" --seed 0 --threads 16' +# +# echo "Removing ggml-model-f32.bin" +# adb shell "rm /data/local/tmp/ggml-model-f32.bin" +# +# adb push "${GGML_BUILD_DIR}/models/gpt-2-117M/ggml-model-f16.bin" "/data/local/tmp" +# +# echo "Benchmarking ggml-model-f16.bin with 1 thread" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 1' +# +# echo "Benchmarking ggml-model-f16.bin with 4 threads" +# adb shell 'taskset f0 /data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 4' +# +# echo "Benchmarking ggml-model-f16.bin with 8 threads" +# adb shell '/data/local/tmp/gpt-2 --model 
/data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 8' +# +# echo "Benchmarking ggml-model-f16.bin with 16 threads" +# adb shell '/data/local/tmp/gpt-2 --model /data/local/tmp/ggml-model-f16.bin --prompt "Once upon a time" --seed 0 --threads 16' diff --git a/common_benchmark_suite/openxla/benchmark/devices/__init__.py b/common_benchmark_suite/openxla/benchmark/devices/__init__.py index 049b6fd1..9a9b2106 100644 --- a/common_benchmark_suite/openxla/benchmark/devices/__init__.py +++ b/common_benchmark_suite/openxla/benchmark/devices/__init__.py @@ -4,7 +4,8 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from . import gcp_devices, host_devices +from . import gcp_devices, host_devices, mobile_devices # All defined device specs. -ALL_DEVICES = gcp_devices.ALL_DEVICES + host_devices.ALL_DEVICES +ALL_DEVICES = gcp_devices.ALL_DEVICES + host_devices.ALL_DEVICES + mobile_devices.ALL_DEVICES +ALL_DEVICE_NAMES = [device.name for device in ALL_DEVICES] diff --git a/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py b/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py new file mode 100644 index 00000000..ca716280 --- /dev/null +++ b/common_benchmark_suite/openxla/benchmark/devices/mobile_devices.py @@ -0,0 +1,22 @@ +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from openxla.benchmark import def_types + +MOBILE_PIXEL_6_PRO = def_types.DeviceSpec( + name="pixel-6-pro", + host_type="mobile", + host_model="pixel-6-pro", + host_environment="android", + accelerator_type="cpu", + accelerator_model="armv8.2-a", + accelerator_architecture="armv8.2-a", + accelerator_attributes={ + "num_of_cores": 8, + }, +) + +ALL_DEVICES = [MOBILE_PIXEL_6_PRO] \ No newline at end of file diff --git a/devtools/docker/dockerfiles/android.Dockerfile b/devtools/docker/dockerfiles/android.Dockerfile new file mode 100644 index 00000000..b6272b40 --- /dev/null +++ b/devtools/docker/dockerfiles/android.Dockerfile @@ -0,0 +1,20 @@ +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# An image for cross-compiling towards Android. + +FROM gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251 + +ARG NDK_VERSION=r25c +WORKDIR /install-ndk + +ENV ANDROID_NDK "/usr/src/android-ndk-${NDK_VERSION}" + +RUN wget -q "https://dl.google.com/android/repository/android-ndk-${NDK_VERSION}-linux.zip" \ + && unzip -q "android-ndk-${NDK_VERSION}-linux.zip" -d /usr/src/ \ + && rm -rf /install-ndk + +WORKDIR / diff --git a/devtools/docker/image_deps.json b/devtools/docker/image_deps.json index 7215e309..243f4320 100644 --- a/devtools/docker/image_deps.json +++ b/devtools/docker/image_deps.json @@ -3,5 +3,6 @@ "cuda11.8-cudnn8.9": ["base"], "db_import": [], "mmperf": ["base"], - "convperf": ["base"] + "convperf": ["base"], + "android": ["base"] } diff --git a/devtools/docker/prod_digests.txt b/devtools/docker/prod_digests.txt index 013a9d5d..4a2f94a0 100644 --- a/devtools/docker/prod_digests.txt +++ b/devtools/docker/prod_digests.txt @@ -3,3 +3,4 @@ 
gcr.io/iree-oss/openxla-benchmark/cuda11.8-cudnn8.9@sha256:f43984cd6c16ad1faad4d gcr.io/iree-oss/openxla-benchmark/db_import@sha256:3de8a702b51ca1906fc2ef5bab2415a79e46bc132f2ceba994215539dd0ecdd4 gcr.io/iree-oss/openxla-benchmark/mmperf@sha256:c972ce5b2144de0786f103611fecbd88d93dd45ecd068f8c97d98c08677cee57 gcr.io/iree-oss/openxla-benchmark/convperf@sha256:0807d5e8144900752cfae72f3aa4d12530b408f73fc6f010a6cbad11cc09832c +gcr.io/iree-oss/openxla-benchmark/android@sha256:3211ade3856dfd46469e573f17baaf367f9c0830dfcc70c6d85891447cadc39e diff --git a/experimental/ggml/benchmark_ggml.sh b/experimental/ggml/benchmark_ggml.sh index 6cb28dc1..137592d0 100755 --- a/experimental/ggml/benchmark_ggml.sh +++ b/experimental/ggml/benchmark_ggml.sh @@ -11,20 +11,20 @@ # OOBI_VENV_DIR: path to create Python virtualenv, default: ggml-benchmarks.venv # OOBI_TARGET_DEVICE: target benchmark device, can also be specified the first # argument. +# OOBI_BUILD_DIR: path to the GGMl build directory. # OOBI_OUTPUT: path to output benchmark results, can also be specified the # second argument. -# OOBI_SCRATCH_DIR: the directory to place temporary benchmarking artifacts. # # Example usage: -# ./benchmark_ggml.sh c2-standard-16 /tmp/results.json +# ./benchmark_ggml.sh set -xeuo pipefail VENV_DIR="${OOBI_VENV_DIR:-ggml-benchmarks.venv}" -ROOT_DIR="${OOBI_SCRATCH_DIR:-/tmp}" PYTHON="${PYTHON:-/usr/bin/python3}" -TARGET_DEVICE="${1:-${OOBI_TARGET_DEVICE}}" -OUTPUT_PATH="${2:-${OOBI_OUTPUT}}" +TARGET_DEVICE_NAME="${1:-${OOBI_TARGET_DEVICE}}" +BUILD_DIR="${2:-${OOBI_BUILD_DIR}}" +OUTPUT_PATH="${3:-${OOBI_OUTPUT}}" TD="$(cd $(dirname $0) && pwd)" @@ -35,33 +35,12 @@ VENV_DIR="${VENV_DIR}" PYTHON="${PYTHON}" source "${TD}/setup_venv.sh" OUTPUT_PATH="$(realpath ${OUTPUT_PATH})" "${TD}/../../comparative_benchmark/scripts/create_results_json.sh" "${OUTPUT_PATH}" -pushd "${ROOT_DIR}" - -# We clone a fork of ggml which includes additional benchmark logging. 
-git clone --branch benchmark https://github.com/mariecwhite/ggml.git -pushd ggml - -# Build -mkdir build -pushd build -cmake .. -make -j8 - -# Generate FP32, FP16 and INT4 versions of GPT2 117M (Small). -GPT_VARIANT="117M" -../examples/gpt-2/download-model.sh "${GPT_VARIANT}" -# Generate FP32. -python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 0 -# Generate FP16. -python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 1 -# Generate INT4. -./bin/gpt-2-quantize models/gpt-2-${GPT_VARIANT}/ggml-model-f16.bin models/gpt-2-${GPT_VARIANT}/ggml-model-q4_0.bin 2 +pushd "${BUILD_DIR}" PROMPT="Once upon a time" BENCHMARK_BINARY="$(realpath bin/gpt-2)" WARMUP_ITERAIONS=2 NUM_ITERATIONS=10 -declare -a NUM_THREADS=(1 8 16) MODEL="$(realpath models/gpt-2-117M/ggml-model-f32.bin)" @@ -81,26 +60,51 @@ declare -a DATA_TYPES=( "int4" ) +declare -a args=( + --warmup_iterations "${WARMUP_ITERAIONS}" + --iterations "${NUM_ITERATIONS}" + --benchmark_binary "${BENCHMARK_BINARY}" + --prompt "${PROMPT}" + --seed 0 + --output "${OUTPUT_PATH}" + --target_device "${TARGET_DEVICE_NAME}" + --verbose +) + +if [[ "${TARGET_DEVICE_NAME}" =~ ^(pixel-4|pixel-6-pro|moto-edge-x30)$ ]]; then +BENCHMARK_SCRIPT="run_benchmarks_android.py" +# Pixel 6 has a maximum of 8 cores. +THREADS="1,4,8" +TASKSETS="80,f0,ff" + +args+=( + --threads "${THREADS}" + --tasksets "${TASKSETS}" +) + +# Setup mobile device for benchmarking. +adb push "${TD}/set_android_scaling_governor.sh" "/data/local/tmp" +adb shell "chmod +x /data/local/tmp/set_android_scaling_governor.sh" +adb shell "su root sh /data/local/tmp/set_android_scaling_governor.sh performance" + +else +# c2-standard-16 has 16 cores. 
+BENCHMARK_SCRIPT="run_benchmarks.py" +THREADS="1,8,16" + +args+=( + --threads "${THREADS}" +) +fi + for i in ${!BENCHMARK_NAMES[@]}; do MODEL="$(realpath models/gpt-2-117M/${MODELS[$i]})" - - for threads in "${NUM_THREADS[@]}"; do - "${TD}/benchmark.py" \ - --benchmark_name "${BENCHMARK_NAMES[$i]}" \ - --warmup_iterations "${WARMUP_ITERAIONS}" \ - --iterations "${NUM_ITERATIONS}" \ - --benchmark_binary "${BENCHMARK_BINARY}" \ - --model "${MODEL}" \ - --data_type "${DATA_TYPES[$i]}" \ - --prompt "${PROMPT}" \ - --seed 0 \ - --threads "${threads}" \ - --output "${OUTPUT_PATH}" \ - --target_device "${TARGET_DEVICE}" \ - --verbose - done + args+=( + --benchmark_name "${BENCHMARK_NAMES[$i]}" + --model "${MODEL}" + --data_type "${DATA_TYPES[$i]}" + ) + "${TD}/${BENCHMARK_SCRIPT}" "${args[@]}" done -popd # build -popd # ggml -popd # ROOT_DIR +popd # BUILD_DIR diff --git a/experimental/ggml/benchmark.py b/experimental/ggml/benchmark_lib.py old mode 100755 new mode 100644 similarity index 61% rename from experimental/ggml/benchmark.py rename to experimental/ggml/benchmark_lib.py index 59c5d047..55e7416c --- a/experimental/ggml/benchmark.py +++ b/experimental/ggml/benchmark_lib.py @@ -23,65 +23,51 @@ # Add common_benchmark_suite dir to the search path. sys.path.insert( 0, str(pathlib.Path(__file__).parents[2] / "common_benchmark_suite")) -from openxla.benchmark import def_types, devices - -ALL_DEVICE_NAMES = [device.name for device in devices.ALL_DEVICES] +from openxla.benchmark import devices + +# Regular expressions to parse GGML benchmark output. 
+# Example output: +# main: mem per token = 2011380 bytes +# main: load time = 120.92 ms +# main: sample time = 73.86 ms +# main: first predict time = 14.71 ms +# main: loop predict time = 2261.72 ms / 11.20 ms per token +# main: predict time = 2276.43 ms / 11.21 ms per token +# main: total time = 2494.66 ms +LOAD_TIME_REGEXP = re.compile(f".+ load time = (.+) ms") +SAMPLE_TIME_REGEXP = re.compile(f".+ sample time = (.+) ms") +FIRST_PREDICTION_TIME_REGEXP = re.compile(f".+ first predict time = (.+) ms") +LOOP_PREDICTION_TIME_REGEXP = re.compile( + f".+ loop predict time = .+ ms / (.+) ms per token") +TOTAL_PREDICTION_TIME_REGEXP = re.compile( + f".+ predict time = (.+) ms / .+ ms per token") +E2E_TIME_REGEXP = re.compile(f".+ total time = (.+) ms") def _parse_output(output_text): - # Example output. - # main: mem per token = 2011380 bytes - # main: load time = 120.92 ms - # main: sample time = 73.86 ms - # main: first predict time = 14.71 ms - # main: loop predict time = 2261.72 ms / 11.20 ms per token - # main: predict time = 2276.43 ms / 11.21 ms per token - # main: total time = 2494.66 ms - - LOAD_TIME_REGEXP = re.compile(f"main: load time = (.+) ms") match = LOAD_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse first prediction time" - return - load_time_ms = float(match.group(1)) + load_time_ms = float(match.group(1)) if match else print( + "Unable to parse first prediction time") - SAMPLE_TIME_REGEXP = re.compile(f"main: sample time = (.+) ms") match = SAMPLE_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse first prediction time" - return - sample_time_ms = float(match.group(1)) + sample_time_ms = float(match.group(1)) if match else print( + "Unable to parse first prediction time") - FIRST_PREDICTION_TIME_REGEXP = re.compile( - f"main: first predict time = (.+) ms") match = FIRST_PREDICTION_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse first prediction time" - return - first_prediction_ms = 
float(match.group(1)) + first_prediction_ms = float(match.group(1)) if match else print( + "Unable to parse first prediction time") - LOOP_PREDICTION_TIME_REGEXP = re.compile( - f"main: loop predict time = .+ ms / (.+) ms per token") match = LOOP_PREDICTION_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse loop prediction time" - return - loop_prediction_ms = float(match.group(1)) + loop_prediction_ms = float(match.group(1)) if match else print( + "Unable to parse loop prediction time") - TOTAL_PREDICTION_TIME_REGEXP = re.compile( - f"main: predict time = (.+) ms / .+ ms per token") match = TOTAL_PREDICTION_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse total prediction time" - return - total_prediction_ms = float(match.group(1)) + total_prediction_ms = float(match.group(1)) if match else print( + "Unable to parse total prediction time") - E2E_TIME_REGEXP = re.compile(f"main: total time = (.+) ms") match = E2E_TIME_REGEXP.search(output_text) - if not match: - "Unable to parse total prediction time" - return - e2e_prediction_ms = float(match.group(1)) + e2e_prediction_ms = float(match.group(1)) if match else print( + "Unable to parse total prediction time") return { "load_time_ms": load_time_ms, @@ -93,8 +79,7 @@ def _parse_output(output_text): } -def _parse_arguments() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Run GGML benchmarks.") +def configure_parser(parser: argparse.ArgumentParser): parser.add_argument("-name", "--benchmark_name", type=str, @@ -109,7 +94,7 @@ def _parse_arguments() -> argparse.Namespace: parser.add_argument( "-m", "--model", - type=str, + type=pathlib.Path, required=True, help= "The GGML model to benchmark e.g. 
/tmp/ggml/build/models/gpt-2-117M/ggml-model.bin" @@ -127,9 +112,9 @@ def _parse_arguments() -> argparse.Namespace: help="The seed to use for the RNG.") parser.add_argument("-t", "--threads", - type=int, - default=8, - help="The number of threads to use.") + type=str, + default="1,4", + help="A comma-delimited list of threads.") parser.add_argument("-o", "--output", type=pathlib.Path, @@ -140,7 +125,7 @@ def _parse_arguments() -> argparse.Namespace: dest="target_device_name", type=str, required=True, - choices=ALL_DEVICE_NAMES, + choices=devices.ALL_DEVICE_NAMES, help="The target device to benchmark.") parser.add_argument("-w", "--warmup_iterations", @@ -155,49 +140,17 @@ def _parse_arguments() -> argparse.Namespace: parser.add_argument("--verbose", action="store_true", help="Show verbose messages.") - return parser.parse_args() - -def main(benchmark_name: str, benchmark_binary: pathlib.Path, - warmup_iterations: int, iterations: int, model: str, data_type: str, - prompt: str, seed: int, threads: int, output: pathlib.Path, - target_device_name: str, verbose: bool): - try: - target_device = next(device for device in devices.ALL_DEVICES - if device.name == target_device_name) - except StopIteration: - raise ValueError(f'Target device "{target_device_name}" is not defined.' 
- f' Available device options:\n{ALL_DEVICE_NAMES}') - - benchmark_definition = { - "benchmark_name": benchmark_name, - "framework": str(def_types.ModelFrameworkType.GGML), - "data_type": data_type, - "batch_size": 1, - "compiler": str(def_types.ModelFrameworkType.GGML), - "device": target_device.name, - "num_threads": threads, - "warmup_iterations": warmup_iterations, - "num_iterations": iterations, - "tags": ["gpt2", "ggml"], - } - - cmd = [ - benchmark_binary, - "--model", - f"{model}", - "--prompt", - f"{prompt}", - "--seed", - f"{seed}", - "--threads", - f"{threads}", - ] +def benchmark(benchmark_command: str, benchmark_definition: dict, + warmup_iterations: int, iterations: int, output: pathlib.Path, + verbose: bool): # Run warmup iterations. for i in range(warmup_iterations): - subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + subprocess.run(benchmark_command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) load_times = [] first_prediction_times = [] @@ -208,10 +161,13 @@ def main(benchmark_name: str, benchmark_binary: pathlib.Path, # Run iterations. 
for i in range(iterations): - raw_result = subprocess.run(cmd, + raw_result = subprocess.run(benchmark_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) raw_result = raw_result.stdout.decode("utf-8") + if verbose: + print(raw_result) + metrics = _parse_output(raw_result) load_times.append(metrics["load_time_ms"]) @@ -250,7 +206,3 @@ def main(benchmark_name: str, benchmark_binary: pathlib.Path, if verbose: print(json.dumps(dataclasses.asdict(benchmark_result), indent=2)) utils.append_benchmark_result(output, benchmark_result) - - -if __name__ == "__main__": - main(**vars(_parse_arguments())) diff --git a/experimental/ggml/build_ggml.sh b/experimental/ggml/build_ggml.sh new file mode 100755 index 00000000..c87cfea8 --- /dev/null +++ b/experimental/ggml/build_ggml.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Environment variables: +# PYTHON: Python interpreter, default: /usr/bin/python3 +# ANDROID_NDK: the path to the Android NDK if building for Android. +# OOBI_VENV_DIR: path to create Python virtualenv, default: ggml-build.venv +# OOBI_TARGET_DEVICE: target benchmark device, can also be specified the first +# argument. +# OOBI_OUTPUT: path to output benchmark results, can also be specified the +# second argument. +# OOBI_SCRATCH_DIR: the directory to place temporary benchmarking artifacts. +# +# Example usage: +# ./build_ggml.sh > + +set -xeuo pipefail + +VENV_DIR="${OOBI_VENV_DIR:-ggml-build.venv}" +ROOT_DIR="${OOBI_SCRATCH_DIR:-/tmp}" +PYTHON="${PYTHON:-/usr/bin/python3}" +TARGET_DEVICE_NAME="${1:-${OOBI_TARGET_DEVICE}}" +BUILD_DIR="${2:-/tmp/ggml-build}" + +TD="$(cd $(dirname $0) && pwd)" +BUILD_DIR="$(realpath ${BUILD_DIR})" + +# Setup virtual environment. 
+VENV_DIR="${VENV_DIR}" PYTHON="${PYTHON}" source "${TD}/setup_venv.sh" + +pushd "${ROOT_DIR}" + +# We clone a fork of ggml which includes additional benchmark logging. +git clone --branch benchmark https://github.com/mariecwhite/ggml.git +pushd ggml + +REPO_DIR="$(pwd)" + +# Build gpt-2-quantize. +cmake -G Ninja -B local-build . +cmake --build local-build -t gpt-2-quantize + +# Build gpt-2. +if [[ "${TARGET_DEVICE_NAME}" =~ ^(pixel-4|pixel-6-pro|moto-edge-x30)$ ]]; then +cmake -GNinja -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod -B "${BUILD_DIR}" . +cmake --build "${BUILD_DIR}" -t gpt-2 +else +cmake -G Ninja -B "${BUILD_DIR}" . +cmake --build "${BUILD_DIR}" -t gpt-2 +fi + +popd # ggml +popd # ROOT_DIR + +# Generate FP32 and FP16 versions of GPT2 117M (Small). +pushd "${BUILD_DIR}" + +GPT_VARIANT="117M" +${REPO_DIR}/examples/gpt-2/download-model.sh "${GPT_VARIANT}" +# Generate FP32. +python ${REPO_DIR}/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 0 +# Generate FP16. +python ${REPO_DIR}/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 1 +# Generate INT4. Keep this disabled until we want to use it. 
+${REPO_DIR}/local-build/bin/gpt-2-quantize models/gpt-2-${GPT_VARIANT}/ggml-model-f16.bin models/gpt-2-${GPT_VARIANT}/ggml-model-q4_0.bin 2 + +popd # BUILD_DIR diff --git a/experimental/ggml/requirements.txt b/experimental/ggml/requirements.txt index c2bbfa1f..7947059c 100644 --- a/experimental/ggml/requirements.txt +++ b/experimental/ggml/requirements.txt @@ -1,2 +1,3 @@ numpy -tensorflow \ No newline at end of file +tensorflow +requests diff --git a/experimental/ggml/run_benchmarks.py b/experimental/ggml/run_benchmarks.py new file mode 100755 index 00000000..88cb4f1e --- /dev/null +++ b/experimental/ggml/run_benchmarks.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import pathlib +import sys + +import benchmark_lib + +# Add common_benchmark_suite dir to the search path. +sys.path.insert( + 0, str(pathlib.Path(__file__).parents[2] / "common_benchmark_suite")) +from openxla.benchmark import def_types, devices + + +def _parse_arguments() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run GGML benchmarks.") + benchmark_lib.configure_parser(parser) + return parser.parse_args() + + +def main(benchmark_name: str, benchmark_binary: pathlib.Path, + warmup_iterations: int, iterations: int, model: pathlib.Path, + data_type: str, prompt: str, seed: int, threads: str, + output: pathlib.Path, target_device_name: str, verbose: bool): + + try: + target_device = next(device for device in devices.ALL_DEVICES + if device.name == target_device_name) + except StopIteration: + raise ValueError(f'Target device "{target_device_name}" is not defined.' 
+ f' Available device options:\n{devices.ALL_DEVICE_NAMES}') + + threads = threads.split(",") + for thread in threads: + benchmark_definition = { + "benchmark_name": benchmark_name, + "framework": str(def_types.ModelFrameworkType.GGML), + "data_type": data_type, + "batch_size": 1, + "compiler": str(def_types.ModelFrameworkType.GGML), + "device": target_device.name, + "num_threads": thread, + "warmup_iterations": warmup_iterations, + "num_iterations": iterations, + "tags": ["gpt2", "ggml"], + } + + cmd = [ + benchmark_binary, "--model", model, "--prompt", f"\"{prompt}\"", + "--seed", + str(seed), "--threads", + str(thread) + ] + + benchmark_lib.benchmark(cmd, benchmark_definition, warmup_iterations, + iterations, output, verbose) + + +if __name__ == "__main__": + main(**vars(_parse_arguments())) diff --git a/experimental/ggml/run_benchmarks_android.py b/experimental/ggml/run_benchmarks_android.py new file mode 100755 index 00000000..8cb35641 --- /dev/null +++ b/experimental/ggml/run_benchmarks_android.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# +# Copyright 2023 The OpenXLA Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import argparse +import pathlib +import subprocess +import sys + +import benchmark_lib + +# Add common_benchmark_suite dir to the search path. +sys.path.insert( + 0, str(pathlib.Path(__file__).parents[2] / "common_benchmark_suite")) +from openxla.benchmark import def_types, devices + + +def _parse_arguments() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run GGML benchmarks.") + parser.add_argument( + "--tasksets", + type=str, + default="f0", + help= + "A comma-separated list of tasksets to run under each thread configuration." 
+ ) + benchmark_lib.configure_parser(parser) + return parser.parse_args() + + +def main(benchmark_name: str, benchmark_binary: pathlib.Path, + warmup_iterations: int, iterations: int, model: pathlib.Path, + data_type: str, prompt: str, seed: int, threads: str, tasksets: str, + output: pathlib.Path, target_device_name: str, verbose: bool): + try: + target_device = next(device for device in devices.ALL_DEVICES + if device.name == target_device_name) + except StopIteration: + raise ValueError(f'Target device "{target_device_name}" is not defined.' + f' Available device options:\n{devices.ALL_DEVICE_NAMES}') + + threads = threads.split(",") + tasksets = tasksets.split(",") + if len(threads) != len(tasksets): + raise ValueError( + "The number of tasksets specified must be equal to the number of threads." + ) + + # Push artifacts to the Android device. + subprocess.run(["adb", "push", benchmark_binary, "/data/local/tmp"]) + subprocess.run([ + "adb", "shell", "chmod", "+x", f"/data/local/tmp/{benchmark_binary.name}" + ]) + subprocess.run(["adb", "push", model, "/data/local/tmp"]) + + for taskset, thread in zip(tasksets, threads): + benchmark_definition = { + "benchmark_name": benchmark_name, + "framework": str(def_types.ModelFrameworkType.GGML), + "data_type": data_type, + "batch_size": 1, + "compiler": str(def_types.ModelFrameworkType.GGML), + "device": target_device.name, + "taskset": taskset, + "num_threads": thread, + "warmup_iterations": warmup_iterations, + "num_iterations": iterations, + "tags": ["gpt2", "ggml"], + } + + cmd = [ + "adb", "shell", "taskset", taskset, + f"/data/local/tmp/{benchmark_binary.name}", "--model", + f"/data/local/tmp/{model.name}", "--prompt", f"\"{prompt}\"", "--seed", + str(seed), "--threads", + str(thread) + ] + + benchmark_lib.benchmark(cmd, benchmark_definition, warmup_iterations, + iterations, output, verbose) + + # Cleanup. 
+ subprocess.run(["adb", "shell", "rm", f"/data/local/tmp/{benchmark_binary.name}"]) + subprocess.run(["adb", "shell", "rm", f"/data/local/tmp/{model.name}"]) + + +if __name__ == "__main__": + main(**vars(_parse_arguments())) diff --git a/experimental/ggml/set_android_scaling_governor.sh b/experimental/ggml/set_android_scaling_governor.sh new file mode 100755 index 00000000..0a297304 --- /dev/null +++ b/experimental/ggml/set_android_scaling_governor.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Runs on an android device itself to set the frequency scaling governor for all +# CPUs (default performance). + +################################### WARNING #################################### +# This will overheat the phone if it's not on a cooling plate, resulting in # +# thermal throttling. To prevent anything catching on fire, the actual CPU # +# frequencies will be throttled to below the maximum, skewing your results. 
# +################################################################################ + +set -euo pipefail + +GOVERNOR="${1:-performance}" + +echo "CPU info (before changing governor):" +echo 'cpu\tgovernor\tcur\tmin\tmax' +echo "------------------------------------------------" +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "cpu${i}" | paste \ + - \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \ +done + +echo "Setting CPU frequency governor to ${GOVERNOR}" + +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "${GOVERNOR}" > \ + "/sys/devices/system/cpu/cpu${i?}/cpufreq/scaling_governor"; \ +done + +echo "CPU info (after changing governor):" +echo 'cpu\tgovernor\tcur\tmin\tmax' +echo "------------------------------------------------" +for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \ + echo "cpu${i}" | paste \ + - \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \ + "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \ +done