
Commit d852c82

Remove JAX_RANDOM_WEIGHTS

- The same functionality can be achieved with the vLLM argument `--load-format=dummy`
- It is better to remove duplicate configs to avoid confusing users

Signed-off-by: Kyuyeun Kim <[email protected]>

1 parent aa58a9c commit d852c82
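
The replacement relies on vLLM's standard dummy load format, which fills the model with random weights and skips checkpoint loading. A minimal sketch of the equivalent offline usage (assuming a working vLLM install; the model id is only an example):

```python
from vllm import LLM

# Equivalent of the removed JAX_RANDOM_WEIGHTS=True: vLLM's dummy loader
# initializes random weights instead of reading a checkpoint.
llm = LLM(model="Qwen/Qwen3-0.6B", load_format="dummy")
```

For a server, the same behavior comes from the CLI flag named in the commit message, e.g. `vllm serve <model> --load-format=dummy`.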

File tree

7 files changed: +25, -28 lines

.buildkite/pipeline_jax.yml
Lines changed: 1 addition & 2 deletions

```diff
@@ -190,13 +190,12 @@ steps:
       USE_V6E8_QUEUE: "True"
       SKIP_ACCURACY_TESTS: "True"
       VLLM_MLA_DISABLE: "1"
-      JAX_RANDOM_WEIGHTS: "True"
     agents:
       queue: tpu_v6e_8_queue
     commands:
     - |
       if [[ "$$NIGHTLY" == "1" ]]; then
-        .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528
+        .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528 --use-dummy-weight
       else
         echo "Skipping: NIGHTLY environment variable not set"
         exit 0
```

.buildkite/scripts/run_in_docker.sh
Lines changed: 0 additions & 1 deletion

```diff
@@ -108,7 +108,6 @@ exec docker run \
     ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \
     ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \
     ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \
-    ${JAX_RANDOM_WEIGHTS:+-e JAX_RANDOM_WEIGHTS="$JAX_RANDOM_WEIGHTS"} \
     ${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \
     ${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \
     "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
```

tests/e2e/benchmarking/mlperf.sh
Lines changed: 13 additions & 2 deletions

```diff
@@ -40,13 +40,12 @@ else
     echo "QUANTIZATION is False. Running without quantization."
 fi

-echo extra_serve_args: "${extra_serve_args[@]}"
-
 root_dir=/workspace
 dataset_name=mlperf
 dataset_path=""
 num_prompts=1000
 exit_code=0
+use_dummy_weight=false

 helpFunction()
 {
@@ -57,6 +56,7 @@ helpFunction()
     echo -e "\t-p The path to the processed MLPerf dataset (default: None, which will download the dataset)"
     echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-0.5B-Instruct, meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-4-Scout-17B-16E-Instruct)"
     echo -e "\t-n Number of prompts to use for the benchmark (default: 10)"
+    echo -e "\t--use-dummy-weight Boolean flag that uses dummy random weight when it's set (default: false)"
     exit 1
 }

@@ -87,6 +87,11 @@ while [[ "$#" -gt 0 ]]; do
         shift
         shift
         ;;
+    --use-dummy-weight)
+        use_dummy_weight=true
+        shift
+        shift
+        ;;
     -h|--help)
         helpFunction
         ;;
@@ -121,6 +126,12 @@ if [ -z "$dataset_path" ]; then
     fi
 fi

+if [ "$use_dummy_weight" = true ]; then
+    extra_serve_args+=("--load-format=dummy")
+fi
+
+echo Using extra_serve_args: "${extra_serve_args[@]}"
+
 echo "Using the dataset at $dataset_path"

 cd "$root_dir"/vllm || exit
```

tests/models/common/test_model_loader.py
Lines changed: 2 additions & 6 deletions

```diff
@@ -252,17 +252,13 @@ def test_get_vllm_model(mesh):
     assert callable(compute_logits_fn)


-@pytest.mark.parametrize("set_in_config", [True, False])
-def test_get_vllm_model_random_weights(mesh, set_in_config):
+def test_get_vllm_model_random_weights(mesh):
     rng = jax.random.PRNGKey(42)

     engine_args = EngineArgs(model="Qwen/Qwen3-0.6B")
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = torch.bfloat16
-    if set_in_config:
-        vllm_config.load_config.load_format = "dummy"
-    else:
-        os.environ["JAX_RANDOM_WEIGHTS"] = "True"
+    vllm_config.load_config.load_format = "dummy"

     with set_current_vllm_config(vllm_config):
         temp_file = tempfile.mkstemp()[1]
```
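
With the environment variable gone, the test exercises a single path: the load format carried in the engine config. A hedged micro-example of that flow outside the test harness (model id for illustration; config field names taken from the diff):

```python
from vllm.engine.arg_utils import EngineArgs

# The dummy format now travels through the engine config rather than
# an environment variable, so there is only one switch to test.
engine_args = EngineArgs(model="Qwen/Qwen3-0.6B", load_format="dummy")
vllm_config = engine_args.create_engine_config()
assert vllm_config.load_config.load_format == "dummy"
```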

tpu_inference/models/common/model_loader.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -102,7 +102,7 @@ def create_jit_model(
             apply_to_abstract_model=False)
         return model

-    if os.getenv("JAX_RANDOM_WEIGHTS", False):
+    if vllm_config.load_config.load_format == "dummy":
         # Create a sharded model with random inited weights.
         # TODO: currently Qwen2ForCausalLM is using legacy model implementation
         # will merge the random init logic when all model are migrated to new model implementation
```
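
Beyond removing duplication, the config check is also safer than the old environment gate: `os.getenv` returns a string, so any non-empty value was truthy. A small self-contained illustration of the difference:

```python
import os

# Old gate (removed): even "False" or "0" enabled random weights,
# because any non-empty string is truthy in Python.
os.environ["JAX_RANDOM_WEIGHTS"] = "False"
print(bool(os.getenv("JAX_RANDOM_WEIGHTS", False)))  # True (surprising)

# New gate: one explicit value in the load config.
load_format = "dummy"  # stand-in for vllm_config.load_config.load_format
print(load_format == "dummy")  # True only for the dummy loader
```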

tpu_inference/models/vllm/vllm_model_wrapper.py
Lines changed: 7 additions & 14 deletions

```diff
@@ -1,6 +1,5 @@
 import copy
 import functools
-import os
 from collections.abc import Sequence
 from contextlib import nullcontext
 from typing import Any, List, Optional, Tuple
@@ -86,22 +85,16 @@ def load_weights(self):
         assert self.vllm_config.model_config.dtype in TORCH_DTYPE_TO_JAX, "The model_config.dtype must be a PyTorch dtype."
         vllm_config_for_load.device_config.device = "cpu"

-        if os.getenv("JAX_RANDOM_WEIGHTS", False):
-            vllm_config_for_load.load_config.load_format = "dummy"
-            use_random_weights = True
-        else:
-            use_random_weights = (
-                vllm_config_for_load.load_config.load_format == "dummy")
-        if use_random_weights:
+        if vllm_config_for_load.load_config.load_format == "dummy":
             logger.info(
                 "Initializing vLLM model with random weights, weight loading skipped."
             )
-        # The DummyModelLoader in vLLM calls torch._sync for torch_xla path when
-        # it detects the tpu platform, but we don't need it and it causes crash
-        # without proper setup.
-        load_context = patch(
-            "torch._sync",
-            return_value=None) if use_random_weights else nullcontext()
+            # The DummyModelLoader in vLLM calls torch._sync for torch_xla path
+            # when it detects the tpu platform, but we don't need it and it
+            # causes crash without proper setup.
+            load_context = patch("torch._sync", return_value=None)
+        else:
+            load_context = nullcontext()

         # Load the vLLM model and wrap it into a new model whose forward
         # function can calculate the hidden_state and logits.
```
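
The refactor keeps the torch._sync workaround but scopes it to the dummy branch. A standalone sketch of the guard pattern, assuming an environment where `torch` is installed and exposes `_sync` (as the diff itself relies on):

```python
from contextlib import nullcontext
from unittest.mock import patch

use_dummy_weights = True  # stand-in for load_format == "dummy"

# While dummy loading runs, torch._sync is replaced with a no-op so
# vLLM's DummyModelLoader cannot crash on the torch_xla path.
load_context = (patch("torch._sync", return_value=None)
                if use_dummy_weights else nullcontext())
with load_context:
    pass  # model construction and (skipped) weight loading happen here
```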

tpu_inference/platforms/tpu_jax.py
Lines changed: 1 addition & 2 deletions

```diff
@@ -49,8 +49,7 @@ class TpuPlatform(Platform):
     ]

     additional_env_vars: list[str] = [
-        "JAX_RANDOM_WEIGHTS", "PHASED_PROFILING_DIR",
-        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
+        "PHASED_PROFILING_DIR", "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
         "TPU_MULTIHOST_BACKEND", "VLLM_MLA_DISABLE", "TPU_BACKEND_TYPE"
     ]
```
