From 3a4f14a0b18651f6df18cca7deadcb78442919f8 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 22 Nov 2024 14:53:37 -0500 Subject: [PATCH 1/2] Base docker image (#290) * A docker file to create a base image to build vllm on * Version pin * Final touches * Building base with python 3.12 and then vllm on top of it * Installing lin2to3 for python --- Dockerfile.base | 107 +++++++++++++++++++++++++ Dockerfile.rocm | 202 ++---------------------------------------------- 2 files changed, 114 insertions(+), 195 deletions(-) create mode 100644 Dockerfile.base diff --git a/Dockerfile.base b/Dockerfile.base new file mode 100644 index 0000000000000..6d735af43a332 --- /dev/null +++ b/Dockerfile.base @@ -0,0 +1,107 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.2.4-complete +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools Cython + +FROM base AS build_hipblaslt +ARG HIPBLASLT_BRANCH="507a649" +# Unset the following on ROCm6.3+ +ARG LEGACY_HIPBLASLT_OPTION="--legacy_hipblas_direct" +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH="dfe4a3e" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH="release/3.1.x" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH="8bc4033" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="c555642" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl \ No newline at end of file diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 66cbe61905f84..a9594833c3c99 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,151 +1,21 @@ # default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.2.2_ubuntu20.04_py3.9_pytorch_release_2.1.2" - -ARG COMMON_WORKDIR=/app - -# The following ARGs should be "0" or "1". If "1", the respective component will be built and installed on top of the base image -ARG BUILD_HIPBLASLT="0" -ARG BUILD_RCCL="0" -ARG BUILD_FA="1" -ARG BUILD_TRITON="1" -ARG BUILD_PYTORCH="1" -# This ARG should also be "0" or "1". If "1", the vLLM development directory is obtained via git clone. -# If "0", it is copied in from the local working directory. 
ARG REMOTE_VLLM="0" ARG USE_CYTHON="0" ARG BUILD_RPD="1" +ARG COMMON_WORKDIR=/app +ARG BASE_IMAGE=rocm/vllm-dev:base_ubuntu22.04_py3.12_ROCm6.2.4_hipblaslt0.11_torch2.6 -# ----------------------- -# vLLM base image -FROM $BASE_IMAGE AS base -USER root - -# Import BASE_IMAGE arg from pre-FROM -ARG BASE_IMAGE -ARG COMMON_WORKDIR -# Used as ARCHes for all components -ARG ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" -ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH} +FROM ${BASE_IMAGE} AS base # Install some basic utilities -RUN apt-get update -q -y && apt-get install -q -y python3 python3-pip RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev # Remove sccache RUN python3 -m pip install --upgrade pip RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" - - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/:/opt/rocm/include/: - -WORKDIR ${COMMON_WORKDIR} - -# ----------------------- -# hipBLASLt build stages -FROM base AS build_hipblaslt -ARG HIPBLASLT_BRANCH="e6da924" -RUN apt-get purge -y hipblaslt \ - && git clone https://github.com/ROCm/hipBLASLt.git \ - && cd hipBLASLt \ - && git checkout ${HIPBLASLT_BRANCH} \ - && ./install.sh --architecture ${PYTORCH_ROCM_ARCH} --legacy_hipblas_direct \ - && cd build/release \ - && make package -FROM scratch AS export_hipblaslt_1 -ARG COMMON_WORKDIR -COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb / -FROM scratch AS export_hipblaslt_0 -FROM export_hipblaslt_${BUILD_HIPBLASLT} AS export_hipblaslt - -# ----------------------- -# RCCL build stages -FROM base AS build_rccl -ARG RCCL_BRANCH="rocm-6.2.0" -RUN git clone https://github.com/ROCm/rccl \ - && cd rccl \ - && git checkout ${RCCL_BRANCH} \ - && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} -FROM scratch AS export_rccl_1 -ARG COMMON_WORKDIR -COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb / -FROM scratch AS export_rccl_0 -FROM export_rccl_${BUILD_RCCL} AS export_rccl -# ----------------------- -# Triton build stages -FROM base AS build_triton -ARG TRITON_BRANCH="e192dba" -ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -RUN python3 -m pip install ninja cmake wheel pybind11 && git clone ${TRITON_REPO} \ - && cd triton \ - && git checkout ${TRITON_BRANCH} \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=dist -FROM scratch AS export_triton_1 ARG COMMON_WORKDIR -COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl / -FROM scratch AS export_triton_0 -FROM export_triton_${BUILD_TRITON} AS export_triton +WORKDIR ${COMMON_WORKDIR} -# AMD-SMI build stages -FROM base AS build_amdsmi -RUN cd /opt/rocm/share/amd_smi \ - && pip wheel . 
--wheel-dir=dist -FROM scratch AS export_amdsmi -COPY --from=build_amdsmi /opt/rocm/share/amd_smi/dist/*.whl / - -FROM base as build_pytorch -# A commit to fix the output scaling factor issue in _scaled_mm -# Not yet in 2.5.0-rc1 -ARG PYTORCH_BRANCH="cedc116" -ARG PYTORCH_VISION_BRANCH="v0.19.1" -ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" -ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -#RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \ -#if ls /install/*.deb; then \ -# apt-get purge -y hipblaslt \ -# && dpkg -i /install/*.deb \ -# && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ -# && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \ -#fi -RUN git clone ${PYTORCH_REPO} pytorch \ - && cd pytorch && git checkout ${PYTORCH_BRANCH} && git submodule update --init --recursive \ - && python tools/amd_build/build_amd.py \ - && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ - && pip install dist/*.whl \ - && cd .. \ - && git clone ${PYTORCH_VISION_REPO} vision \ - && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ - && python3 setup.py bdist_wheel --dist-dir=dist -FROM scratch as export_pytorch_1 -ARG COMMON_WORKDIR -COPY --from=build_pytorch ${COMMON_WORKDIR}/pytorch/dist/*.whl / -COPY --from=build_pytorch ${COMMON_WORKDIR}/vision/dist/*.whl / -FROM scratch as export_pytorch_0 -from export_pytorch_${BUILD_PYTORCH} as export_pytorch - -# ----------------------- -# flash attn build stages -FROM base AS build_flash_attn -ARG FA_BRANCH="3cea2fb" -ARG FA_REPO="https://github.com/ROCm/flash-attention.git" -RUN --mount=type=bind,from=export_pytorch,src=/,target=/install \ -if ls /install/*.whl; then \ - pip uninstall -y torch torchvision \ - && pip install /install/*.whl; \ -fi -RUN git clone ${FA_REPO} \ - && cd flash-attention \ - && git checkout ${FA_BRANCH} \ - && git submodule update --init \ - && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist -FROM scratch AS export_flash_attn_1 -ARG COMMON_WORKDIR -COPY --from=build_flash_attn ${COMMON_WORKDIR}/flash-attention/dist/*.whl / -FROM scratch AS export_flash_attn_0 -FROM export_flash_attn_${BUILD_FA} AS export_flash_attn # ----------------------- # vLLM fetch stages @@ -162,22 +32,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -ARG COMMON_WORKDIR ARG USE_CYTHON -# Install hipblaslt -RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \ -if ls /install/*.deb; then \ - apt-get purge -y hipblaslt \ - && dpkg -i /install/*.deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \ -fi -# Install pytorch -RUN --mount=type=bind,from=export_pytorch,src=/,target=/install \ -if ls /install/*.whl; then \ - pip install /install/*.whl; \ -fi - # Build vLLM RUN cd vllm \ && python3 -m pip install -r requirements-rocm.txt \ @@ -198,8 +53,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite # ----------------------- # Final vLLM image FROM base AS final -ARG COMMON_WORKDIR -ARG BUILD_FA RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
@@ -209,49 +62,6 @@ RUN case "$(which python3)" in \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ *) ;; esac -RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \ - if ls /install/*.deb; then \ - apt-get purge -y hipblaslt \ - && dpkg -i /install/*.deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \ - fi - -RUN --mount=type=bind,from=export_rccl,src=/,target=/install \ - if ls /install/*.deb; then \ - dpkg -i /install/*.deb \ - # RCCL needs to be installed twice - && dpkg -i /install/*.deb \ - && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ - && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \ - fi - -RUN --mount=type=bind,from=export_flash_attn,src=/,target=/install \ - if ls /install/*.whl; then \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y flash-attn \ - && pip install /install/*.whl; \ - fi - -RUN --mount=type=bind,from=export_triton,src=/,target=/install \ - if ls /install/*.whl; then \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y triton \ - && pip install /install/*.whl; \ - fi - -RUN --mount=type=bind,from=export_amdsmi,src=/,target=/install \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y amdsmi \ - && pip install /install/*.whl; - -RUN --mount=type=bind,from=export_pytorch,src=/,target=/install \ - if ls /install/*.whl; then \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y torch torchvision \ - && pip install /install/*.whl; \ - fi - RUN python3 -m pip install --upgrade huggingface-hub[cli] ARG BUILD_RPD RUN if [ ${BUILD_RPD} -eq "1" ]; then \ @@ -259,7 +69,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ && cd rocmProfileData/rpd_tracer \ && pip install -r requirements.txt && cd ../ \ && make && make install \ - && cd hipMarker && python setup.py install ; fi + && cd hipMarker && python3 setup.py install ; fi # Install vLLM # Make sure punica kernels are built (for LoRA) @@ -276,6 +86,8 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && pip uninstall -y vllm \ && pip install *.whl +ARG COMMON_WORKDIR + # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests From 01deb433f6936b3897ebbe990e6c60b58164dc7a Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:11:09 -0500 Subject: [PATCH 2/2] Added --output-json parameter in the P3l script. Using arg_utils to support all vllm args (#289) * Added --output-json parameter in the P3l script. 
Using arg_utils to support all vllm args * Description --- benchmarks/P3L.py | 56 +++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/benchmarks/P3L.py b/benchmarks/P3L.py index 6c9ffd9ebd599..5fcb8b5a78dbb 100755 --- a/benchmarks/P3L.py +++ b/benchmarks/P3L.py @@ -42,13 +42,16 @@ """ import argparse +import dataclasses import datetime +import json import math import os from huggingface_hub import hf_hub_download from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.logger import init_logger logger = init_logger(__name__) @@ -69,16 +72,8 @@ def get_wikitext2_text(tokenizer): def vllm_init(args): - - llm = LLM(model=args.model, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - quantization=args.quantization, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.kv_cache_scales_path - if args.kv_cache_scales_path != '' else None, - enforce_eager=args.enforce_eager) + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(n=1, temperature=0.0, @@ -186,6 +181,15 @@ def main(args: argparse.Namespace): f"{my_ppl/num_tokens_generated}" \ f"\n\tPPL={math.exp(my_ppl/num_tokens_generated)}") + if args.output_json: + results = { + "integral_cross_entropy": my_ppl, + "average_cross_entropy": my_ppl / num_tokens_generated, + "ppl": math.exp(my_ppl / num_tokens_generated), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + logger.info(MESSAGE) print(MESSAGE) return @@ -193,41 +197,21 @@ def main(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') + description='Measure the PPPL (P3L) score of a given model.') parser.add_argument( '--data', type=str, default='./wikitext/wikitext-2-v1/test-00000-of-00001.parquet') parser.add_argument('--context-size', type=int, default=4096) - parser.add_argument('--kv-cache-scales-path', type=str, default='') - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--quantization', type=str, default=None) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') parser.add_argument('--sample-size', type=int, default=512) parser.add_argument('--patch-size', type=int, default=None) - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') parser.add_argument( - "--kv-cache-dtype", + '--output-json', type=str, - choices=['auto', 'fp8_e5m2', 'fp8'], - default='auto', - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + default=None, + help='Path to save the latency results in JSON format.') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args)
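
Taken together, the first patch moves the heavy dependencies (hipBLASLt, RCCL, Triton, amdsmi, PyTorch, torchvision, flash-attention) into a standalone Dockerfile.base and leaves Dockerfile.rocm to build only vLLM on top of a prebuilt base image, while the second patch lets benchmarks/P3L.py accept any vLLM engine argument via EngineArgs and adds an --output-json path. The commands below are a minimal usage sketch, assuming a local vLLM checkout as the build context; the image tags, model name, and output path are placeholders, not values taken from the patches.

# Build the base image once (slow: it compiles PyTorch, flash-attention, etc. for gfx90a/gfx942).
docker build -f Dockerfile.base -t local/vllm-rocm-base .

# Build vLLM on top of it. BASE_IMAGE already defaults to the rocm/vllm-dev base tag
# pinned in Dockerfile.rocm, so the --build-arg is only needed for a locally built base.
docker build -f Dockerfile.rocm --build-arg BASE_IMAGE=local/vllm-rocm-base -t local/vllm-rocm .

# Inside the final image (benchmarks are copied to ${COMMON_WORKDIR}/vllm/benchmarks, i.e. /app
# by default), P3L.py now takes any EngineArgs flag (e.g. --model, --tensor-parallel-size)
# alongside its own options, and can dump the cross-entropy/perplexity results to JSON.
python3 /app/vllm/benchmarks/P3L.py --model facebook/opt-125m \
    --context-size 4096 --sample-size 512 \
    --output-json /tmp/p3l_results.json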