
Commit bcb4c98

Remove JAX_RANDOM_WEIGHTS

Signed-off-by: Kyuyeun Kim <[email protected]>
1 parent: 5b9984d

7 files changed: +21, -21 lines changed

.buildkite/pipeline_jax.yml

Lines changed: 1 addition & 2 deletions
@@ -190,13 +190,12 @@ steps:
       USE_V6E8_QUEUE: "True"
       SKIP_ACCURACY_TESTS: "True"
       VLLM_MLA_DISABLE: "1"
-      JAX_RANDOM_WEIGHTS: "True"
     agents:
       queue: tpu_v6e_8_queue
     commands:
       - |
         if [[ "$$NIGHTLY" == "1" ]]; then
-          .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528
+          .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528 --use-dummy-weights
         else
           echo "Skipping: NIGHTLY environment variable not set"
           exit 0

.buildkite/scripts/run_in_docker.sh

Lines changed: 0 additions & 1 deletion
@@ -108,7 +108,6 @@ exec docker run \
     ${QUANTIZATION:+-e QUANTIZATION="$QUANTIZATION"} \
     ${NEW_MODEL_DESIGN:+-e NEW_MODEL_DESIGN="$NEW_MODEL_DESIGN"} \
     ${USE_V6E8_QUEUE:+-e USE_V6E8_QUEUE="$USE_V6E8_QUEUE"} \
-    ${JAX_RANDOM_WEIGHTS:+-e JAX_RANDOM_WEIGHTS="$JAX_RANDOM_WEIGHTS"} \
     ${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \
     ${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \
     "${IMAGE_NAME}:${BUILDKITE_COMMIT}" \

tests/e2e/benchmarking/mlperf.sh

Lines changed: 14 additions & 2 deletions
@@ -40,13 +40,12 @@ else
     echo "QUANTIZATION is False. Running without quantization."
 fi
 
-echo extra_serve_args: "${extra_serve_args[@]}"
-
 root_dir=/workspace
 dataset_name=mlperf
 dataset_path=""
 num_prompts=1000
 exit_code=0
+use_dummy_weights=false
 
 helpFunction()
 {
@@ -57,6 +56,7 @@ helpFunction()
     echo -e "\t-p The path to the processed MLPerf dataset (default: None, which will download the dataset)"
     echo -e "\t-m A space-separated list of HuggingFace model ids to use (default: Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-0.5B-Instruct, meta-llama/Llama-3.1-8B-Instruct and meta-llama/Llama-4-Scout-17B-16E-Instruct)"
     echo -e "\t-n Number of prompts to use for the benchmark (default: 10)"
+    echo -e "\t--use-dummy-weights Use dummy random weights (default: false)"
     exit 1
 }
 
@@ -87,6 +87,11 @@ while [[ "$#" -gt 0 ]]; do
         shift
         shift
         ;;
+    --use-dummy-weights)
+        use_dummy_weights=true
+        shift
+        shift
+        ;;
     -h|--help)
         helpFunction
         ;;
@@ -121,6 +126,13 @@ if [ -z "$dataset_path" ]; then
     fi
 fi
 
+if [ "$use_dummy_weights" = true ]; then
+    extra_serve_args+=("--load-format=dummy")
+fi
+
+echo extra_serve_args: "${extra_serve_args[@]}"
+
+
 echo "Using the dataset at $dataset_path"
 
 cd "$root_dir"/vllm || exit

tests/models/common/test_model_loader.py

Lines changed: 2 additions & 6 deletions
@@ -254,17 +254,13 @@ def test_get_vllm_model(mesh):
     assert callable(compute_logits_fn)
 
 
-@pytest.mark.parametrize("set_in_config", [True, False])
-def test_get_vllm_model_random_weights(mesh, set_in_config):
+def test_get_vllm_model_random_weights(mesh):
     rng = jax.random.PRNGKey(42)
 
     engine_args = EngineArgs(model="Qwen/Qwen3-0.6B")
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = torch.bfloat16
-    if set_in_config:
-        vllm_config.load_config.load_format = "dummy"
-    else:
-        os.environ["JAX_RANDOM_WEIGHTS"] = "True"
+    vllm_config.load_config.load_format = "dummy"
 
     with set_current_vllm_config(vllm_config):
         temp_file = tempfile.mkstemp()[1]
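With the parametrization removed, the test exercises the one remaining switch, `load_config.load_format`, which is the same switch the benchmark reaches through `--load-format=dummy`. A minimal sketch of both routes (assuming `EngineArgs` is imported from `vllm.engine.arg_utils`, as the test presumably does):

from vllm.engine.arg_utils import EngineArgs  # assumed import path

# CLI route: roughly what `--load-format dummy` on the serve command produces.
cli_config = EngineArgs(model="Qwen/Qwen3-0.6B",
                        load_format="dummy").create_engine_config()

# Programmatic route: what the updated test does after building the config.
test_config = EngineArgs(model="Qwen/Qwen3-0.6B").create_engine_config()
test_config.load_config.load_format = "dummy"

# Either way, the loader below sees load_format == "dummy".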

tpu_inference/models/common/model_loader.py

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ def create_jit_model(
             apply_to_abstract_model=False)
         return model
 
-    if os.getenv("JAX_RANDOM_WEIGHTS", False):
+    if vllm_config.load_config.load_format == "dummy":
         # Create a sharded model with randomly initialized weights.
         # TODO: Qwen2ForCausalLM currently uses the legacy model implementation;
         # merge the random-init logic once all models are migrated to the new implementation.
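When `load_format == "dummy"`, the loader builds a sharded model whose parameters are drawn from a PRNG instead of read from a checkpoint. A hypothetical sketch of that kind of dummy initialization in JAX (the parameter names and shapes here are made up for illustration):

import jax
import jax.numpy as jnp

# Draw every parameter from a split of one PRNG key, so the model is
# numerically valid (if meaningless) for performance benchmarking.
def random_params(rng, shapes):
    params = {}
    for name, shape in shapes.items():
        rng, sub = jax.random.split(rng)
        params[name] = jax.random.normal(sub, shape, dtype=jnp.bfloat16)
    return params

rng = jax.random.PRNGKey(42)  # same seeding style as the test above
params = random_params(rng, {"embed_tokens": (151936, 1024),
                             "lm_head": (1024, 151936)})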

tpu_inference/models/vllm/vllm_model_wrapper.py

Lines changed: 2 additions & 7 deletions
@@ -1,6 +1,5 @@
 import copy
 import functools
-import os
 from collections.abc import Sequence
 from contextlib import nullcontext
 from typing import Any, List, Optional, Tuple
@@ -91,12 +90,8 @@ def load_weights(self):
         # may cause errors. Therefore, we disable it during weight loading.
         vllm_config_for_load.parallel_config.enable_expert_parallel = False
 
-        if os.getenv("JAX_RANDOM_WEIGHTS", False):
-            vllm_config_for_load.load_config.load_format = "dummy"
-            use_random_weights = True
-        else:
-            use_random_weights = (
-                vllm_config_for_load.load_config.load_format == "dummy")
+        use_random_weights = (
+            vllm_config_for_load.load_config.load_format == "dummy")
         if use_random_weights:
             logger.info(
                 "Initializing vLLM model with random weights, weight loading skipped."

tpu_inference/platforms/tpu_platform.py

Lines changed: 1 addition & 2 deletions
@@ -48,8 +48,7 @@ class TpuPlatform(Platform):
     ]
 
     additional_env_vars: list[str] = [
-        "JAX_RANDOM_WEIGHTS", "PHASED_PROFILING_DIR",
-        "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
+        "PHASED_PROFILING_DIR", "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
         "TPU_MULTIHOST_BACKEND", "VLLM_MLA_DISABLE", "TPU_BACKEND_TYPE"
     ]
 
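`additional_env_vars` is the allowlist of host environment variables the TPU platform propagates, so after this change `JAX_RANDOM_WEIGHTS` no longer reaches workers even if set. A hypothetical illustration of how such an allowlist can be applied (this helper is not part of the codebase):

import os

ALLOWLIST = [
    "PHASED_PROFILING_DIR", "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
    "TPU_MULTIHOST_BACKEND", "VLLM_MLA_DISABLE", "TPU_BACKEND_TYPE",
]

def worker_env(base=None):
    # Copy only allowlisted variables into a child-process environment.
    env = dict(base or {})
    for name in ALLOWLIST:
        if name in os.environ:
            env[name] = os.environ[name]
    return env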
