diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 7302d484954f7..d01f187521fe6 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -34,7 +34,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
     expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
     t = all_tensors[rank % tp_size]
     t = tensor_model_parallel_all_reduce(t)
-    assert torch.allclose(t, expected)
+    torch.testing.assert_close(t, expected)
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -62,7 +62,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
     expected = torch.cat(all_tensors, dim=all_gather_dimension)
     t = all_tensors[rank % tp_size]
     t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-    assert torch.allclose(t, expected)
+    torch.testing.assert_close(t, expected)
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,12 +96,12 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     else:
         recv_dict = broadcast_tensor_dict(src=0)
     assert len(recv_dict) == len(test_dict)
-    assert torch.allclose(recv_dict["a"], test_dict["a"])
-    assert torch.allclose(recv_dict["b"], test_dict["b"])
+    torch.testing.assert_close(recv_dict["a"], test_dict["a"])
+    torch.testing.assert_close(recv_dict["b"], test_dict["b"])
     assert recv_dict["c"] == test_dict["c"]
     assert recv_dict["d"] == test_dict["d"]
     assert recv_dict["e"] == test_dict["e"]
-    assert torch.allclose(recv_dict["f"], test_dict["f"])
+    torch.testing.assert_close(recv_dict["f"], test_dict["f"])
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -136,12 +136,12 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 
     if not get_pp_group().is_first_rank:
         assert len(recv_dict) == len(test_dict)
-        assert torch.allclose(recv_dict["a"], test_dict["a"])
-        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        torch.testing.assert_close(recv_dict["a"], test_dict["a"])
+        torch.testing.assert_close(recv_dict["b"], test_dict["b"])
         assert recv_dict["c"] == test_dict["c"]
         assert recv_dict["d"] == test_dict["d"]
         assert recv_dict["e"] == test_dict["e"]
-        assert torch.allclose(recv_dict["f"], test_dict["f"])
+        torch.testing.assert_close(recv_dict["f"], test_dict["f"])
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -163,7 +163,7 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
         get_pp_group().send(test_tensor)
 
     if not get_pp_group().is_first_rank:
-        assert torch.allclose(test_tensor, recv_tensor)
+        torch.testing.assert_close(test_tensor, recv_tensor)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 3c281a45fcaf1..95435e753058a 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -72,8 +72,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
                     out2 = tensor_model_parallel_all_reduce(inp2)
                     dist.all_reduce(inp2, group=group)
             graph.replay()
-            assert torch.allclose(out1, inp1)
-            assert torch.allclose(out2, inp2)
+            torch.testing.assert_close(out1, inp1)
+            torch.testing.assert_close(out2, inp2)
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -96,13 +96,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
     out = inp
     for _ in range(num_communication):
         out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))
 
     inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
     out = inp
     for _ in range(num_communication):
         out = fa.all_reduce_unreg(out)
-    assert torch.allclose(out, inp * (tp_size**num_communication))
+    torch.testing.assert_close(out, inp * (tp_size**num_communication))
 
 
 @pytest.mark.parametrize("tp_size", [2])
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index cec2b05bafd21..9bdace671487f 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -69,4 +69,4 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
     ref_iscale = one / ref_scale
     ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
         fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn)
-    return ref_out, ref_scale
+    return ref_out, ref_scale.view((1, ))
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index a4b9f91c7688b..38b0477063528 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -47,7 +47,7 @@ def test_act_and_mul(
     ref_out = layer.forward_native(x)
     # The SiLU and GELU implementations are equivalent to the native PyTorch
     # implementations, so we can do exact comparison.
-    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
+    torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
 
 
 @pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@@ -73,7 +73,7 @@ def test_activation(
     layer = activation()
     out = layer(x)
     ref_out = layer.forward_native(x)
-    assert torch.allclose(out,
-                          ref_out,
-                          atol=get_default_atol(out),
-                          rtol=get_default_rtol(out))
+    torch.testing.assert_close(out,
+                               ref_out,
+                               atol=get_default_atol(out),
+                               rtol=get_default_rtol(out))
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index c7c6707461c3e..8aa2d4a53aaa0 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -276,7 +276,7 @@ def test_paged_attention(
     atol, rtol = 1e-3, 1e-5
     if kv_cache_dtype == "fp8":
         atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
 
 
 def ref_multi_query_kv_attention(
@@ -379,4 +379,4 @@ def test_multi_query_kv_attention(
     )
     atol = get_default_atol(output) if is_hip() else 1e-3
     rtol = get_default_rtol(output) if is_hip() else 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py
index b3adb152949a2..7357508751ae1 100644
--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -327,7 +327,7 @@ def test_paged_attention(
     atol, rtol = 1e-3, 1e-5
     if kv_cache_dtype == "fp8":
         atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
 
 
 def ref_multi_query_kv_attention(
@@ -441,4 +441,4 @@ def test_varlen_blocksparse_attention_prefill(
         scale,
         dtype,
     )
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2)
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py
index 3fb9b59be1701..71d18359164b1 100644
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -98,10 +98,10 @@ def test_copy_blocks(
 
     # Compare the results.
     for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
-        assert torch.allclose(key_cache, cloned_key_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
     for value_cache, cloned_value_cache in zip(value_caches,
                                                cloned_value_caches):
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -184,17 +184,17 @@ def test_reshape_and_cache(
         cloned_value_cache[block_idx, :, :, block_offset] = value[i]
 
     if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
-                              cloned_key_cache,
-                              atol=0.001,
-                              rtol=0.1)
-        assert torch.allclose(result_value_cache,
-                              cloned_value_cache,
-                              atol=0.001,
-                              rtol=0.1)
+        torch.testing.assert_close(result_key_cache,
+                                   cloned_key_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
+        torch.testing.assert_close(result_value_cache,
+                                   cloned_value_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
     else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -290,17 +290,17 @@ def test_reshape_and_cache_flash(
         cloned_value_cache[block_idx, block_offset, :, :] = value[i]
 
     if kv_cache_dtype == "fp8":
-        assert torch.allclose(result_key_cache,
-                              cloned_key_cache,
-                              atol=0.001,
-                              rtol=0.1)
-        assert torch.allclose(result_value_cache,
-                              cloned_value_cache,
-                              atol=0.001,
-                              rtol=0.1)
+        torch.testing.assert_close(result_key_cache,
+                                   cloned_key_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
+        torch.testing.assert_close(result_value_cache,
+                                   cloned_value_cache,
+                                   atol=0.001,
+                                   rtol=0.1)
     else:
-        assert torch.allclose(key_cache, cloned_key_cache)
-        assert torch.allclose(value_cache, cloned_value_cache)
+        torch.testing.assert_close(key_cache, cloned_key_cache)
+        torch.testing.assert_close(value_cache, cloned_value_cache)
 
 
 @pytest.mark.parametrize("direction", COPYING_DIRECTION)
@@ -372,10 +372,10 @@ def test_swap_blocks(
                     block_mapping_tensor)
 
     for src, dst in block_mapping:
-        assert torch.allclose(src_key_caches_clone[src].cpu(),
-                              dist_key_caches[0][dst].cpu())
-        assert torch.allclose(src_value_caches_clone[src].cpu(),
-                              dist_value_caches[0][dst].cpu())
+        torch.testing.assert_close(src_key_caches_clone[src].cpu(),
+                                   dist_key_caches[0][dst].cpu())
+        torch.testing.assert_close(src_value_caches_clone[src].cpu(),
+                                   dist_value_caches[0][dst].cpu())
 
 
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -411,4 +411,4 @@ def test_fp8_e4m3_conversion(
     converted_cache = torch.empty_like(cache)
     ops.convert_fp8(converted_cache, cache_fp8)
 
-    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)
+    torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 33c4ea8dd92e6..e818651fe9c6a 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -74,7 +74,7 @@ def cutlass_fp8_gemm_helper(m: int,
     out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
     baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    assert torch.allclose(out, baseline, rtol=1e-2, atol=5e-2)
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=5e-2)
 
 
 def cutlass_int8_gemm_helper(m: int,
@@ -106,7 +106,7 @@ def cutlass_int8_gemm_helper(m: int,
     out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
     baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
 @pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33])
@@ -252,7 +252,7 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
     azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
 
     a_dq = scale_a * (aq_i32 + azp_aq_i8).to(dtype=torch.float32)
-    assert torch.allclose(a_dq, scale_a * aq_f32 + azp_a)
+    torch.testing.assert_close(a_dq, scale_a * aq_f32 + azp_a)
 
     baseline_dq = torch.mm(a_dq, b_dq).to(out_dtype)
 
@@ -271,8 +271,8 @@ def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
                                scale_b,
                                out_dtype=out_dtype,
                                bias=azp_bias[0, :])
-    assert torch.allclose(out, baseline_dq, rtol=1e-2, atol=1e0)
-    assert torch.allclose(out, baseline_q, rtol=1e-2, atol=1e0)
+    torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0)
+    torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0)
 
 
 @pytest.mark.parametrize("m", [32, 64, 128])
@@ -302,7 +302,10 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a  # correct for rounding
 
     a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32)
-    assert torch.allclose(a_dq, scale_a * aq_f32 - azp_a, rtol=1e-4, atol=1e-3)
+    torch.testing.assert_close(a_dq,
+                               scale_a * aq_f32 - azp_a,
+                               rtol=1e-4,
+                               atol=1e-3)
 
     if use_bias:
         bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5
@@ -335,8 +338,8 @@ def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
     # float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05%
     rtol = 1e-2 if out_dtype == torch.bfloat16 else 1e-3
     atol = 1e-3
-    assert torch.allclose(out, baseline_dq, rtol=rtol, atol=atol)
-    assert torch.allclose(out, baseline_q, rtol=rtol, atol=atol)
+    torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
+    torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
 
 
 # Test working with a subset of A and B
@@ -363,7 +366,7 @@ def test_cutlass_subset():
                                   scale_b,
                                   out_dtype=torch.bfloat16)
 
-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
 # Test to make sure cuda graphs work
@@ -411,4 +414,4 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
     baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
                         scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16)
 
-    assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py
index 374ba0afb5f41..779f340b6398e 100644
--- a/tests/kernels/test_flash_attn.py
+++ b/tests/kernels/test_flash_attn.py
@@ -126,7 +126,7 @@ def test_flash_attn_with_paged_kv(
         scale=scale,
         soft_cap=soft_cap,
     )
-    assert torch.allclose(output, ref_output, atol=2e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
 
 
@@ -211,5 +211,5 @@ def test_varlen_with_paged_kv(
         sliding_window=sliding_window,
         soft_cap=soft_cap,
     )
-    assert torch.allclose(output, ref_output, atol=2e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py
index 5211be6aef009..1db47f8c8c2e0 100644
--- a/tests/kernels/test_flashinfer.py
+++ b/tests/kernels/test_flashinfer.py
@@ -144,7 +144,7 @@ def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
                                 block_tables=block_tables,
                                 scale=scale,
                                 soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
 
 
@@ -244,5 +244,5 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                 block_tables=block_tables,
                                 scale=scale,
                                 soft_cap=soft_cap)
-    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
+    torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
         f"{torch.max(torch.abs(output - ref_output))}"
diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py
index 61ff54db560da..71a92cbc8f7c7 100644
--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
@@ -37,9 +37,9 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                                                scale_ub=scale_ub,
                                                use_per_token_if_dynamic=True)
 
-    assert torch.allclose(ref_scales, ops_scales)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(ref_scales, ops_scales)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
+                               ops_out.to(dtype=torch.float32))
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -57,9 +57,9 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
     ref_out, ref_scale = ref_dynamic_per_tensor_fp8_quant(x)
     ops_out, ops_scale = ops.scaled_fp8_quant(x)
 
-    assert torch.allclose(ref_scale, ops_scale)
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
+    torch.testing.assert_close(ref_scale, ops_scale)
+    torch.testing.assert_close(ref_out.to(dtype=torch.float32),
+                               ops_out.to(dtype=torch.float32))
 
 
 # Regression test for a case with large activations where an int32 index cannot
@@ -84,4 +84,4 @@ def test_fp8_quant_large(seed: int) -> None:
     ref_out = ref_out.to(dtype=dtype)
     ops_out = ops_out.to(dtype=dtype)
 
-    assert torch.allclose(ref_out, ops_out)
+    torch.testing.assert_close(ref_out, ops_out)
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index 0b7ed26a39e1e..7376dcaf60902 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -29,9 +29,10 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
     # kernel
     ops_out, ops_scales = scaled_int8_quant(x)
 
-    assert torch.allclose(ops_scales, ref_scales)
-    assert torch.allclose(ops_out, ref_out,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(ops_scales, ref_scales)
+    torch.testing.assert_close(
+        ops_out, ref_out, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -54,5 +55,6 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                      int8_traits.max).to(torch.int8)
     out2, _ = scaled_int8_quant(x, scale)
 
-    assert torch.allclose(out1, out2,
-                          atol=1)  # big atol to account for rounding errors
+    torch.testing.assert_close(
+        out1, out2, atol=1,
+        rtol=0.0)  # big atol to account for rounding errors
diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py
index a635e6c12c594..21bc38d67b771 100644
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -48,7 +48,7 @@ def test_rms_norm(
     # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
     if add_residual:
-        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
-        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
     else:
-        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 2f58ffda21408..18b66abe7be74 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -122,7 +122,7 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     )
     torch.cuda.synchronize()
 
-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@@ -174,7 +174,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
     )
     torch.cuda.synchronize()
 
-    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index 2f9eee420f270..f526c381b3339 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -50,7 +50,7 @@ def test_fused_moe(
     score = torch.randn((m, e), device='cuda', dtype=dtype)
     triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk)
-    assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
+    torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0)
 
 
 @pytest.mark.parametrize("dtype",
@@ -95,7 +95,7 @@ def test_mixtral_moe(dtype: torch.dtype):
         torch.bfloat16: 1e-2,
     }
 
-    assert torch.allclose(hf_states.flatten(0, 1),
-                          vllm_states,
-                          rtol=mixtral_moe_tol[dtype],
-                          atol=mixtral_moe_tol[dtype])
+    torch.testing.assert_close(hf_states.flatten(0, 1),
+                               vllm_states,
+                               rtol=mixtral_moe_tol[dtype],
+                               atol=mixtral_moe_tol[dtype])
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index 4a7ad6e0fa21d..65242e275650c 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -67,14 +67,14 @@ def test_rotary_embedding(
     ref_query, ref_key = rope.forward_native(positions, query, key)
     out_query, out_key = rope.forward(positions, query, key)
     # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))
 
 
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@@ -129,14 +129,14 @@ def test_batched_rotary_embedding(
                                                           dtype=torch.long,
                                                           device=device))
     # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))
 
 
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@@ -200,14 +200,14 @@ def test_batched_rotary_embedding_multi_lora(
     out_query, out_key = rope.forward(positions, query, key,
                                       query_offsets.flatten())
     # Compare the results.
-    assert torch.allclose(out_query,
-                          ref_query,
-                          atol=get_default_atol(out_query),
-                          rtol=get_default_rtol(out_query))
-    assert torch.allclose(out_key,
-                          ref_key,
-                          atol=get_default_atol(out_key),
-                          rtol=get_default_rtol(out_key))
+    torch.testing.assert_close(out_query,
+                               ref_query,
+                               atol=get_default_atol(out_query),
+                               rtol=get_default_rtol(out_query))
+    torch.testing.assert_close(out_key,
+                               ref_key,
+                               atol=get_default_atol(out_key),
+                               rtol=get_default_rtol(out_key))
 
 
 @torch.inference_mode()
diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py
index 713e868986a5f..03844aba20f8a 100644
--- a/tests/kernels/test_sampler.py
+++ b/tests/kernels/test_sampler.py
@@ -100,11 +100,11 @@ def test_sample_decoding_only(random_sampling, max_best_of,
         if modify_greedy_probs and not request_uses_random_sampling:
             # If we are modifying greedy probs and the request is greedy,
             # we want to make sure the probs tensor is modified in place
-            assert torch.allclose(
+            torch.testing.assert_close(
                 probs[i][sampled_tokens[i]],
                 torch.full_like(probs[i][sampled_tokens[i]], 1.0))
             assert torch.sum(probs[i]) == 1.0
-            assert torch.allclose(
+            torch.testing.assert_close(
                 sampled_modified_probs[i][0],
                 torch.full_like(sampled_modified_probs[i][0], 1.0))
         elif request_uses_random_sampling:
@@ -117,8 +117,8 @@ def test_sample_decoding_only(random_sampling, max_best_of,
             # If the request is greedy and we are not modifying greedy probs,
             # we want to make sure sampled_modified_probs tensor is the same as
             # the probs tensor.
-            assert torch.allclose(sampled_modified_probs[i][0],
-                                  probs[i][sampled_tokens[i]])
+            torch.testing.assert_close(sampled_modified_probs[i],
+                                       probs[i][sampled_tokens[i]])
 
     if save_logprobs:
         assert sampled_logprobs.shape == (bs, max_best_of)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index e942336ff7fdc..3f8f6502039aa 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -924,5 +924,5 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters,
     * output_under_test: actually observed output value
     '''
     ideal_output = test_params.packed_qkvo.ideal_output
-    assert torch.allclose(ideal_output,
-                          output_under_test.view_as(ideal_output))
+    torch.testing.assert_close(ideal_output,
+                               output_under_test.view_as(ideal_output))
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index ad86f7bdf6101..effcffc5c174e 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -247,10 +247,10 @@ def create_random_embedding_layer():
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -274,10 +274,10 @@ def create_random_embedding_layer():
         expected_result = embedding(torch.cat(inputs))
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
@@ -384,10 +384,10 @@ def create_random_embedding_layer():
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -411,10 +411,10 @@ def create_random_embedding_layer():
         expected_result = expanded_embedding(torch.cat(inputs))
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
@@ -541,10 +541,10 @@ def _pretest():
                                                        embedding_bias=None)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
@@ -614,10 +614,10 @@ def create_random_linear_replicated_layer():
         expected_result = torch.cat(expected_results)
 
        rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -642,10 +642,10 @@ def create_random_linear_replicated_layer():
         expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
@@ -728,10 +728,10 @@ def create_random_linear_parallel_layer():
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
         # Check that resetting the lora weights succeeds
 
@@ -756,10 +756,10 @@ def create_random_linear_parallel_layer():
        expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
@@ -868,10 +868,10 @@ class FakeConfig:
         expected_result = torch.cat(expected_results)
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
         for slot_idx in range(max_loras):
             lora_linear.reset_lora(slot_idx)
@@ -900,10 +900,10 @@ class FakeConfig:
         expected_result = linear(torch.cat(inputs))[0]
 
         rtol, atol = TOLERANCES[lora_result.dtype]
-        assert torch.allclose(lora_result,
-                              expected_result,
-                              rtol=rtol,
-                              atol=atol)
+        torch.testing.assert_close(lora_result,
+                                   expected_result,
+                                   rtol=rtol,
+                                   atol=atol)
 
 
 @torch.inference_mode()
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 7bff9e1fbcdcc..67cf298b4df2b 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -533,21 +533,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up):
 
     packed_lora = model_lora.get_lora("gate_up_proj")
     assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
-    assert torch.allclose(packed_lora.lora_a[0],
-                          model_lora.get_lora("gate_proj").lora_a)
-    assert torch.allclose(packed_lora.lora_b[0],
-                          model_lora.get_lora("gate_proj").lora_b)
-    assert torch.allclose(packed_lora.lora_a[1],
-                          model_lora.get_lora("up_proj").lora_a)
-    assert torch.allclose(packed_lora.lora_b[1],
-                          model_lora.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(packed_lora.lora_a[0],
+                               model_lora.get_lora("gate_proj").lora_a)
+    torch.testing.assert_close(packed_lora.lora_b[0],
+                               model_lora.get_lora("gate_proj").lora_b)
+    torch.testing.assert_close(packed_lora.lora_a[1],
+                               model_lora.get_lora("up_proj").lora_a)
+    torch.testing.assert_close(packed_lora.lora_b[1],
+                               model_lora.get_lora("up_proj").lora_b)
 
     packed_lora1 = model_lora1.get_lora("gate_up_proj")
     assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
 
     assert packed_lora1.lora_a[0] is None
     assert packed_lora1.lora_b[0] is None
-    assert torch.allclose(packed_lora1.lora_a[1],
-                          model_lora1.get_lora("up_proj").lora_a)
-    assert torch.allclose(packed_lora1.lora_b[1],
-                          model_lora1.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(packed_lora1.lora_a[1],
+                               model_lora1.get_lora("up_proj").lora_a)
+    torch.testing.assert_close(packed_lora1.lora_b[1],
+                               model_lora1.get_lora("up_proj").lora_b)
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index ebb06ed20f249..58864e83173f9 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -127,16 +127,18 @@ def per_tensor_dequantize(tensor, inv_scale, dtype):
 
     # Reference dynamic quantizaton
     y = quantize_ref(x, inv_scale)
-    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y,
+                               per_tensor_dequantize(y, inv_scale, dtype))
 
     # Static quantization
     y, _ = ops.scaled_fp8_quant(x, inv_scale)
-    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y,
+                               per_tensor_dequantize(y, inv_scale, dtype))
 
     # Padding
     y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
     assert y.shape[0] == 17
-    assert torch.allclose(
+    torch.testing.assert_close(
         ref_y,
         per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
                               dtype))
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index f1370e411241c..74e7486e8012e 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -632,7 +632,7 @@ def mock_sample(probs, *args, **kwargs):
 
         hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
         hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
-        assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
+        torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
         assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
 
 
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 86148291ae6ff..30eb99f868bfc 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -161,7 +161,7 @@ def assert_logprobs_dict_allclose(
                 single_step_actual_logprobs[token_id].logprob)
             expected = torch.tensor(
                 single_step_expected_logprobs[token_id].logprob)
-            assert torch.allclose(actual, expected)
+            torch.testing.assert_close(actual, expected)
 
 
 def create_sampler_output_list(
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index 8ee2d78190cd1..7d4af963e25c5 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -90,5 +90,7 @@ def pick_ith(token_ids, logits):
     assert torch.isinf(logits_processor_output[:, 0]).all()
 
     fake_logits *= logits_processor.scale
-    assert torch.allclose(logits_processor_output[:, 1], fake_logits[:, 1],
-                          1e-4)
+    torch.testing.assert_close(logits_processor_output[:, 1],
+                               fake_logits[:, 1],
+                               rtol=1e-4,
+                               atol=0.0)
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 4d2edc02139c4..84502043cbd26 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -77,7 +77,7 @@ def test_prepare_prompt(batch_size):
     device = model_runner.device
     assert attn_metadata.num_prefills > 0
     assert attn_metadata.num_decode_tokens == 0
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.seq_lens_tensor,
         torch.tensor(seq_lens, device=device, dtype=torch.int))
     assert attn_metadata.seq_lens == seq_lens
@@ -90,7 +90,7 @@ def test_prepare_prompt(batch_size):
     for seq_len in seq_lens:
         start_idx += seq_len
         start_loc.append(start_idx)
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.query_start_loc,
         torch.tensor(start_loc, dtype=torch.int32, device=device))
 
@@ -102,10 +102,10 @@ def test_prepare_prompt(batch_size):
         start_idx += seq_len
         seq_start_loc.append(start_idx)
 
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.seq_start_loc,
         torch.tensor(start_loc, dtype=torch.int32, device=device))
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.context_lens_tensor,
         torch.zeros(attn_metadata.context_lens_tensor.shape[0],
                     dtype=torch.int,
@@ -114,7 +114,7 @@ def test_prepare_prompt(batch_size):
     expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))],
                             dtype=torch.int32,
                             device=model_runner.device)
-    assert torch.allclose(attn_metadata.block_tables, expected)
+    torch.testing.assert_close(attn_metadata.block_tables, expected)
     # Cuda graph should not be used for prerill.
     assert attn_metadata.use_cuda_graph is False
 
@@ -201,7 +201,7 @@ def test_prepare_decode_cuda_graph(batch_size):
         # decode has only 1 token for query.
         start_idx += 1
         start_loc.append(start_idx)
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.query_start_loc,
         torch.tensor(start_loc, dtype=torch.int32, device=device))
 
@@ -210,15 +210,15 @@ def test_prepare_decode_cuda_graph(batch_size):
     for seq_len in seq_lens:
         start_idx += seq_len
         seq_start_loc.append(start_idx)
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.seq_start_loc,
         torch.tensor(seq_start_loc, dtype=torch.int32, device=device))
 
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.context_lens_tensor,
         torch.tensor(context_lens, dtype=torch.int, device=device))
     assert attn_metadata.max_decode_seq_len == max(seq_lens)
-    assert torch.allclose(
+    torch.testing.assert_close(
         attn_metadata.seq_lens_tensor[:len(seq_lens)],
         torch.tensor(seq_lens, dtype=torch.int, device=device))
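For reference, a minimal standalone sketch of the behavioral difference the patch relies on, assuming only stock PyTorch (torch.testing.assert_close is available since PyTorch 1.9). A bare `assert torch.allclose(...)` fails with no diagnostic, while `torch.testing.assert_close(...)` raises an AssertionError that reports the number of mismatched elements and the greatest absolute and relative differences. The tensors below are illustrative and are not taken from the tests above.

import torch

actual = torch.tensor([1.0, 2.0, 3.0010])
expected = torch.tensor([1.0, 2.0, 3.0])

# torch.allclose only returns a bool, so a failing assert carries no detail.
print(torch.allclose(actual, expected))  # False under the default tolerances

# torch.testing.assert_close raises an AssertionError describing the mismatch
# (count of differing elements, greatest absolute/relative difference).
try:
    torch.testing.assert_close(actual, expected, rtol=0.0, atol=1e-4)
except AssertionError as err:
    print(err)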