From 7e060162a5c86087e2ad3207e41dc4789d3c18a2 Mon Sep 17 00:00:00 2001
From: mariecwhite
Date: Mon, 4 Sep 2023 03:17:35 +0000
Subject: [PATCH] Add GGML

---
 .github/workflows/run_ggml_benchmark.yml      |  88 +++++++
 .../comparative_suite/tf/model_definitions.py |  12 +
 .../openxla/benchmark/def_types.py            |   1 +
 .../openxla/benchmark/devices/gcp_devices.py  |  16 +-
 experimental/ggml/__init__.py                 |   0
 experimental/ggml/benchmark.py                | 242 ++++++++++++++++++
 experimental/ggml/benchmark_ggml.sh           | 105 ++++++++
 experimental/ggml/requirements.txt            |   2 +
 experimental/ggml/setup_venv.sh               |  32 +++
 9 files changed, 497 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/run_ggml_benchmark.yml
 create mode 100644 experimental/ggml/__init__.py
 create mode 100755 experimental/ggml/benchmark.py
 create mode 100755 experimental/ggml/benchmark_ggml.sh
 create mode 100644 experimental/ggml/requirements.txt
 create mode 100644 experimental/ggml/setup_venv.sh

diff --git a/.github/workflows/run_ggml_benchmark.yml b/.github/workflows/run_ggml_benchmark.yml
new file mode 100644
index 00000000..a9282423
--- /dev/null
+++ b/.github/workflows/run_ggml_benchmark.yml
@@ -0,0 +1,88 @@
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# GGML Benchmarks Workflow.
+
+name: GGML Benchmarks
+
+on:
+  workflow_dispatch:
+  pull_request:
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit).
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+env:
+  GCS_DIR: gs://openxla-github-actions-${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}-artifacts/${{ github.run_id }}/${{ github.run_attempt }}
+
+jobs:
+  setup:
+    runs-on: ubuntu-22.04
+    outputs:
+      runner-group: ${{ steps.configure.outputs.runner-group }}
+      benchmark-gcs-dir: ${{ steps.configure.outputs.benchmark-gcs-dir }}
+    steps:
+      - name: "Checking out PR repository"
+        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - name: "Configuring CI options"
+        id: configure
+        env:
+          RUNNER_GROUP: ${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
+        run: |
+          # Just informative logging. There should only be two commits in the
+          # history here, but limiting the depth helps when copying from a local
+          # repo instead of using checkout, e.g. with
+          # https://github.com/nektos/act where there will be more.
+          git log --oneline --graph --max-count=3
+          # Workflow jobs can't access `env` in `runs-on`, so we need to make
+          # `runner-group` a job output variable.
+          echo "runner-group=${RUNNER_GROUP}" >> "${GITHUB_OUTPUT}"
+
+          # For presubmit testing, the result artifacts are uploaded to the
+          # temporary workflow GCS dir. In postsubmit, the result artifacts are
+          # uploaded to the comparative benchmark GCS dir.
+          if [[ "${RUNNER_GROUP}" == "presubmit" ]]; then
+            BENCHMARK_GCS_DIR="${GCS_DIR}/comparative-benchmark-artifacts"
+          else
+            BENCHMARK_GCS_DIR="gs://comparative-benchmark-artifacts/$(date +'%Y-%m-%d').$(date +'%s')"
+          fi
+          echo "benchmark-gcs-dir=${BENCHMARK_GCS_DIR}" >> "${GITHUB_OUTPUT}"
+
+  benchmark_on_c2-standard-60:
+    needs: [setup]
+    runs-on:
+      - self-hosted  # must come first
+      - runner-group=${{ needs.setup.outputs.runner-group }}
+      - environment=prod
+      - machine-type=c2-standard-60
+    env:
+      BENCHMARK_GCS_DIR: ${{ needs.setup.outputs.benchmark-gcs-dir }}
+      RESULTS_DIR: results-dir
+      TARGET_DEVICE: c2-standard-60
+    steps:
+      - name: "Checking out PR repository"
+        uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+      - name: "Setup"
+        id: setup
+        run: |
+          echo "results-gcs-dir=${BENCHMARK_GCS_DIR}/${TARGET_DEVICE}-results" >> "${GITHUB_OUTPUT}"
+          mkdir "${RESULTS_DIR}"
+      - name: "Benchmarking GGML CPU"
+        env:
+          GGML_RESULTS_JSON: ggml.json
+          RESULTS_GCS_DIR: ${{ steps.setup.outputs.results-gcs-dir }}
+        run: |
+          RESULTS_PATH="${RESULTS_DIR}/${GGML_RESULTS_JSON}"
+          docker run --mount="type=bind,src=${PWD},target=/work" --workdir="/work" \
+            "gcr.io/iree-oss/openxla-benchmark/base@sha256:1bf3e319465ec8fb465baae3f6ba9a5b09cb84a5349a675c671a552fc77f2251" \
+            ./experimental/ggml/benchmark_ggml.sh \
+            "${TARGET_DEVICE}" \
+            "${RESULTS_PATH}"
+          gcloud storage cp "${RESULTS_PATH}" "${RESULTS_GCS_DIR}/"
diff --git a/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py b/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py
index 981355fd..4ccb2b53 100644
--- a/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py
+++ b/common_benchmark_suite/openxla/benchmark/comparative_suite/tf/model_definitions.py
@@ -137,6 +137,18 @@
     template=EFFICIENTNETB7_FP32_TF_600X600X3XF32_BATCH_TEMPLATE,
     batch_sizes=[1, 64, 128])
 
+# GPT2LMHead models.
+# Model implementation from https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.TFGPT2Model.
+GPT2LMHEAD_TF_IMPL = def_types.ModelImplementation(
+    name="GPT2_TF",
+    tags=["transformer-decoder", "gpt2", "ggml"],
+    framework_type=def_types.ModelFrameworkType.TF_V2,
+    module_path=f"{utils.MODELS_MODULE_PATH}.tf.gpt2.gpt2lmhead_model",
+    source_info=
+    "https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.TFGPT2Model",
+)
+
+
 ALL_MODELS = list(
     itertools.chain(
         T5_LARGE_FP32_TF_512XI32_BATCHES.values(),
diff --git a/common_benchmark_suite/openxla/benchmark/def_types.py b/common_benchmark_suite/openxla/benchmark/def_types.py
index 76e66e2e..666d2564 100644
--- a/common_benchmark_suite/openxla/benchmark/def_types.py
+++ b/common_benchmark_suite/openxla/benchmark/def_types.py
@@ -17,6 +17,7 @@ class ModelFrameworkType(Enum):
   TF_V2 = "tensorflow_v2"
   PYTORCH = "pytorch"
   JAX = "jax"
+  GGML = "ggml"
 
 
 @dataclass(frozen=True)
diff --git a/common_benchmark_suite/openxla/benchmark/devices/gcp_devices.py b/common_benchmark_suite/openxla/benchmark/devices/gcp_devices.py
index dea2a85d..7131fa19 100644
--- a/common_benchmark_suite/openxla/benchmark/devices/gcp_devices.py
+++ b/common_benchmark_suite/openxla/benchmark/devices/gcp_devices.py
@@ -32,4 +32,18 @@
     },
 )
 
-ALL_DEVICES = [GCP_A2_HIGHGPU_1G, GCP_C2_STANDARD_16]
+GCP_C2_STANDARD_60 = def_types.DeviceSpec(
+    name="c2-standard-60",
+    host_type="gcp",
+    host_model="c2-standard-60",
+    host_environment="linux-x86_64",
+    accelerator_type="cpu",
+    accelerator_model="intel-cascadelake",
+    accelerator_architecture="x86_64-cascadelake",
+    accelerator_attributes={
+        "num_of_cores": 30,
+        "hyper-threading": False,
+    },
+)
+
+ALL_DEVICES = [GCP_A2_HIGHGPU_1G, GCP_C2_STANDARD_16, GCP_C2_STANDARD_60]
diff --git a/experimental/ggml/__init__.py b/experimental/ggml/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/experimental/ggml/benchmark.py b/experimental/ggml/benchmark.py
new file mode 100755
index 00000000..56270bb1
--- /dev/null
+++ b/experimental/ggml/benchmark.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+#
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+import argparse
+import dataclasses
+import json
+import pathlib
+import re
+import statistics
+import subprocess
+import sys
+
+# Add comparative_benchmark dir to the search path.
+sys.path.insert(
+    0, str(pathlib.Path(__file__).parents[2] / "comparative_benchmark"))
+import utils
+
+# Add common_benchmark_suite dir to the search path.
+sys.path.insert(
+    0, str(pathlib.Path(__file__).parents[2] / "common_benchmark_suite"))
+from openxla.benchmark import def_types, devices
+
+ALL_DEVICE_NAMES = [device.name for device in devices.ALL_DEVICES]
+
+
+def _parse_output(output_text):
+  # Example output.
+  # main: mem per token = 2011380 bytes
+  # main: load time = 120.92 ms
+  # main: sample time = 73.86 ms
+  # main: first predict time = 14.71 ms
+  # main: loop predict time = 2261.72 ms / 11.20 ms per token
+  # main: predict time = 2276.43 ms / 11.21 ms per token
+  # main: total time = 2494.66 ms
+
+  LOAD_TIME_REGEXP = re.compile("main: load time = (.+) ms")
+  match = LOAD_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse load time")
+    return
+  load_time_ms = float(match.group(1))
+
+  SAMPLE_TIME_REGEXP = re.compile("main: sample time = (.+) ms")
+  match = SAMPLE_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse sample time")
+    return
+  sample_time_ms = float(match.group(1))
+
+  FIRST_PREDICTION_TIME_REGEXP = re.compile("main: first predict time = (.+) ms")
+  match = FIRST_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse first prediction time")
+    return
+  first_prediction_ms = float(match.group(1))
+
+  LOOP_PREDICTION_TIME_REGEXP = re.compile("main: loop predict time = .+ ms / (.+) ms per token")
+  match = LOOP_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse loop prediction time")
+    return
+  loop_prediction_ms = float(match.group(1))
+
+  TOTAL_PREDICTION_TIME_REGEXP = re.compile("main: predict time = (.+) ms / .+ ms per token")
+  match = TOTAL_PREDICTION_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse total prediction time")
+    return
+  total_prediction_ms = float(match.group(1))
+
+  E2E_TIME_REGEXP = re.compile("main: total time = (.+) ms")
+  match = E2E_TIME_REGEXP.search(output_text)
+  if not match:
+    print("Unable to parse total time")
+    return
+  e2e_prediction_ms = float(match.group(1))
+
+  return {
+      "load_time_ms": load_time_ms,
+      "first_prediction_ms": first_prediction_ms,
+      "loop_prediction_ms": loop_prediction_ms,
+      "total_prediction_ms": total_prediction_ms,
+      "sample_time_ms": sample_time_ms,
+      "e2e_prediction_ms": e2e_prediction_ms,
+  }
+
+
+def _parse_arguments() -> argparse.Namespace:
+  parser = argparse.ArgumentParser(
+      description="Run GGML benchmarks.")
+  parser.add_argument("-name",
+                      "--benchmark_name",
+                      type=str,
+                      required=True,
+                      help="The regex pattern to match benchmark names.")
+  parser.add_argument("-b",
+                      "--benchmark_binary",
+                      type=pathlib.Path,
+                      help="Path to benchmark binary e.g. /tmp/ggml/build/bin/gpt2")
+  parser.add_argument("-m",
+                      "--model",
+                      type=str,
+                      help="The GGML model to benchmark e.g. /tmp/ggml/build/models/gpt-2-117M/ggml-model.bin")
+  parser.add_argument("--data_type",
+                      type=str,
+                      help="The model data type.")
+  parser.add_argument("-p",
+                      "--prompt",
+                      type=str,
+                      default="Once upon a time",
+                      help="The input prompt to the model.")
+  parser.add_argument("-s",
+                      "--seed",
+                      type=int,
+                      default=0,
+                      help="The seed to use for the RNG.")
+  parser.add_argument("-t",
+                      "--threads",
+                      type=int,
+                      default=8,
+                      help="The number of threads to use.")
+  parser.add_argument("-o",
+                      "--output",
+                      type=pathlib.Path,
+                      required=True,
+                      help="JSON file path to merge the results.")
+  parser.add_argument("-device",
+                      "--target_device",
+                      dest="target_device_name",
+                      type=str,
+                      required=True,
+                      choices=ALL_DEVICE_NAMES,
+                      help="The target device to benchmark.")
+  parser.add_argument("-w",
+                      "--warmup_iterations",
+                      type=int,
+                      default=5,
+                      help="The number of warmup steps.")
+  parser.add_argument("-iter",
+                      "--iterations",
+                      type=int,
+                      default=100,
+                      help="The number of iterations to benchmark.")
+  parser.add_argument("--verbose",
+                      action="store_true",
+                      help="Show verbose messages.")
+  return parser.parse_args()
+
+
+def main(
+    benchmark_name: str,
+    benchmark_binary: pathlib.Path,
+    warmup_iterations: int,
+    iterations: int,
+    model: str,
+    data_type: str,
+    prompt: str,
+    seed: int,
+    threads: int,
+    output: pathlib.Path,
+    target_device_name: str,
+    verbose: bool):
+
+  try:
+    target_device = next(device for device in devices.ALL_DEVICES
+                         if device.name == target_device_name)
+  except StopIteration:
+    raise ValueError(f'Target device "{target_device_name}" is not defined.'
+                     f' Available device options:\n{ALL_DEVICE_NAMES}')
+
+  benchmark_definition = {
+      "benchmark_name": benchmark_name,
+      "framework": str(def_types.ModelFrameworkType.GGML),
+      "data_type": data_type,
+      "batch_size": 1,
+      "compiler": str(def_types.ModelFrameworkType.GGML),
+      "device": target_device.name,
+      "num_threads": threads,
+      "warmup_iterations": warmup_iterations,
+      "num_iterations": iterations,
+      "tags": ["gpt2", "ggml"],
+  }
+
+  cmd = [
+      benchmark_binary,
+      "--model", f"{model}",
+      "--prompt", f"{prompt}",
+      "--seed", f"{seed}",
+      "--threads", f"{threads}",
+  ]
+
+  # Run warmup iterations.
+  for i in range(warmup_iterations):
+    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+  load_times = []
+  first_prediction_times = []
+  loop_prediction_times = []
+  total_prediction_times = []
+  sample_times = []
+  e2e_prediction_times = []
+
+  # Run iterations.
+  for i in range(iterations):
+    raw_result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    raw_result = raw_result.stdout.decode("utf-8")
+    metrics = _parse_output(raw_result)
+
+    load_times.append(metrics["load_time_ms"])
+    first_prediction_times.append(metrics["first_prediction_ms"])
+    loop_prediction_times.append(metrics["loop_prediction_ms"])
+    total_prediction_times.append(metrics["total_prediction_ms"])
+    sample_times.append(metrics["sample_time_ms"])
+    e2e_prediction_times.append(metrics["e2e_prediction_ms"])
+
+  benchmark_metrics = {
+      "median_load_time_ms": statistics.median(load_times) if load_times else None,
+      "median_first_prediction_ms": statistics.median(first_prediction_times) if first_prediction_times else None,
+      "median_loop_prediction_ms": statistics.median(loop_prediction_times) if loop_prediction_times else None,
+      "median_total_prediction_ms": statistics.median(total_prediction_times) if total_prediction_times else None,
+      "median_sample_time_ms": statistics.median(sample_times) if sample_times else None,
+      "median_e2e_prediction_ms": statistics.median(e2e_prediction_times) if e2e_prediction_times else None,
+  }
+
+  benchmark_result = utils.BenchmarkResult(
+      definition=benchmark_definition,
+      metrics={
+          "compiler_level": benchmark_metrics,
+      },
+  )
+
+  if verbose:
+    print(json.dumps(dataclasses.asdict(benchmark_result), indent=2))
+  utils.append_benchmark_result(output, benchmark_result)
+
+
+if __name__ == "__main__":
+  main(**vars(_parse_arguments()))
diff --git a/experimental/ggml/benchmark_ggml.sh b/experimental/ggml/benchmark_ggml.sh
new file mode 100755
index 00000000..227af7ea
--- /dev/null
+++ b/experimental/ggml/benchmark_ggml.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Environment variables:
+#   PYTHON: Python interpreter, default: /usr/bin/python3
+#   OOBI_VENV_DIR: path to create Python virtualenv, default: ggml-benchmarks.venv
+#   OOBI_TARGET_DEVICE: target benchmark device, can also be specified as the
+#     first argument.
+#   OOBI_OUTPUT: path to output benchmark results, can also be specified as the
+#     second argument.
+#   OOBI_SCRATCH_DIR: the directory to place temporary benchmarking artifacts.
+#
+# Example usage:
+# ./benchmark_ggml.sh c2-standard-16 /tmp/results.json
+
+set -xeuo pipefail
+
+VENV_DIR="${OOBI_VENV_DIR:-ggml-benchmarks.venv}"
+ROOT_DIR="${OOBI_SCRATCH_DIR:-/tmp}"
+PYTHON="${PYTHON:-/usr/bin/python3}"
+TARGET_DEVICE="${1:-${OOBI_TARGET_DEVICE}}"
+OUTPUT_PATH="${2:-${OOBI_OUTPUT}}"
+
+TD="$(cd $(dirname $0) && pwd)"
+
+# Setup virtual environment.
+VENV_DIR="${VENV_DIR}" PYTHON="${PYTHON}" source "${TD}/setup_venv.sh"
+
+pushd "${ROOT_DIR}"
+
+# We clone a fork of ggml which includes additional benchmark logging.
+git clone --branch benchmark https://github.com/mariecwhite/ggml.git
+pushd ggml
+
+# Build.
+mkdir build
+pushd build
+cmake ..
+make -j8
+
+# Generate FP32, FP16 and INT4 versions of GPT2 117M (Small).
+GPT_VARIANT="117M"
+../examples/gpt-2/download-model.sh "${GPT_VARIANT}"
+# Generate FP32.
+python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 0
+# Generate FP16.
+python ../examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-${GPT_VARIANT}/ 1
+# Generate INT4.
+./bin/gpt-2-quantize models/gpt-2-${GPT_VARIANT}/ggml-model-f16.bin models/gpt-2-${GPT_VARIANT}/ggml-model-q4_0.bin 2
+
+# Initialize results json.
+"${TD}/../../comparative_benchmark/scripts/create_results_json.sh" "${OUTPUT_PATH}"
+
+PROMPT="Once upon a time"
+BENCHMARK_BINARY="$(realpath bin/gpt-2)"
+WARMUP_ITERATIONS=2
+NUM_ITERATIONS=10
+declare -a NUM_THREADS=(1 8 16 30)
+
+MODEL="$(realpath models/gpt-2-117M/ggml-model-f32.bin)"
+
+declare -a BENCHMARK_NAMES=(
+  "models/GPT2LMHEAD_FP32_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
+  "models/GPT2LMHEAD_FP16_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
+  "models/GPT2LMHEAD_INT4_GGML/inputs/INPUT_DATA_MODEL_DEFAULT"
+)
+declare -a MODELS=(
+  ggml-model-f32.bin
+  ggml-model-f16.bin
+  ggml-model-q4_0.bin
+)
+declare -a DATA_TYPES=(
+  "fp32"
+  "fp16"
+  "int4"
+)
+
+for i in ${!BENCHMARK_NAMES[@]}; do
+  MODEL="$(realpath models/gpt-2-117M/${MODELS[$i]})"
+
+  for threads in "${NUM_THREADS[@]}"; do
+    "${TD}/benchmark.py" \
+      --benchmark_name "${BENCHMARK_NAMES[$i]}" \
+      --warmup_iterations "${WARMUP_ITERATIONS}" \
+      --iterations "${NUM_ITERATIONS}" \
+      --benchmark_binary "${BENCHMARK_BINARY}" \
+      --model "${MODEL}" \
+      --data_type "${DATA_TYPES[$i]}" \
+      --prompt "${PROMPT}" \
+      --seed 0 \
+      --threads "${threads}" \
+      --output "${OUTPUT_PATH}" \
+      --target_device "${TARGET_DEVICE}" \
+      --verbose
+  done
+done
+
+popd # build
+popd # ggml
+popd # ROOT_DIR
diff --git a/experimental/ggml/requirements.txt b/experimental/ggml/requirements.txt
new file mode 100644
index 00000000..c2bbfa1f
--- /dev/null
+++ b/experimental/ggml/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+tensorflow
\ No newline at end of file
diff --git a/experimental/ggml/setup_venv.sh b/experimental/ggml/setup_venv.sh
new file mode 100644
index 00000000..f500452a
--- /dev/null
+++ b/experimental/ggml/setup_venv.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright 2023 The OpenXLA Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Sets up a virtual environment suitable for running GGML benchmarks.
+#
+# Environment variables:
+# VENV_DIR=ggml-benchmarks.venv
+# PYTHON=/usr/bin/python3.10
+
+set -euo pipefail
+
+TD="$(cd $(dirname $0) && pwd)"
+VENV_DIR="${VENV_DIR:-ggml-benchmarks.venv}"
+PYTHON="${PYTHON:-"$(which python3)"}"
+
+echo "Setting up venv dir: ${VENV_DIR}"
+
+"${PYTHON}" -m venv "${VENV_DIR}" || echo "Could not create venv."
+source "${VENV_DIR}/bin/activate" || echo "Could not activate venv"
+
+# Upgrade pip and install requirements. 'python' is used here in order to
+# refer to the python executable from the venv.
+python -m pip install --upgrade pip || echo "Could not upgrade pip"
+python -m pip install --upgrade -r "${TD}/requirements.txt"
+
+echo "Activate venv with:"
+echo "  source ${VENV_DIR}/bin/activate"