
Commit 6eef2a6

Merge pull request #6 from aws-neuron/release_2.21.1
Neuron 2.21.1 release
2 parents: a850e4c + 6d1b8ca

4 files changed: +82 -8 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 # Copyright Amazon Web Services and its Affiliates. All Rights Reserved.
 # ==============================================================================
-__version__ = "0.1.0"
+__version__ = "0.1.1"

src/neuronx_distributed_inference/models/model_base.py

Lines changed: 4 additions & 0 deletions

@@ -38,6 +38,7 @@
     Sampler,
     prepare_sampling_params,
     rand_like,
+    validate_sampling_params,
 )
 from neuronx_distributed_inference.modules.kvcache.kv_cache_manager import (
     KVCacheManager,
@@ -1358,6 +1359,9 @@ def forward(
         sampling_params = (
             self.default_sampling_params if sampling_params is None else sampling_params
         )
+        if self.on_device_sampling:
+            validate_sampling_params(sampling_params, self.neuron_config.on_device_sampling_config)
+
         self.sampling_params = sampling_params
 
         output_attentions, output_hidden_states, return_dict = self._setup_func_config(
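
For orientation, here is a minimal sketch of how this new guard is exercised end to end. It assumes the package is installed, that prepare_sampling_params (whose full body is not shown in this diff) assembles the (batch_size, 3) tensor the validator expects, and it uses a plain dict and a local flag as stand-ins for neuron_config.on_device_sampling_config and self.on_device_sampling:

import torch

from neuronx_distributed_inference.modules.generation.sampling import (
    prepare_sampling_params,
    validate_sampling_params,
)

# Stand-in for neuron_config.on_device_sampling_config: the validator also
# accepts a plain dict with a "global_topk" key (see sampling.py below).
on_device_sampling_config = {"global_topk": 256}

# Build the (batch_size, 3) params tensor: one [top_k, top_p, temperature]
# row per batch line. A top_k of -1 is explicitly allowed by the validator.
sampling_params = prepare_sampling_params(
    batch_size=2, top_k=[50, -1], top_p=[0.9, 1.0], temperature=[0.7, 1.0]
)

# Mirrors the new guard in forward(): validate only when sampling runs on device.
on_device_sampling = True  # stand-in for self.on_device_sampling
if on_device_sampling:
    validate_sampling_params(sampling_params, on_device_sampling_config)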

src/neuronx_distributed_inference/models/model_wrapper.py

Lines changed: 19 additions & 5 deletions

@@ -300,26 +300,36 @@ def get_model_instance(self):
 
     def _forward_with_pad(self, *args):
         seq_ids = args[3]
-        if len(args) > 4:
-            medusa_args = args[4:8]
+        sampling_params = args[4]
+        if len(args) > 5:
+            medusa_args = args[5:8]
         else:
             medusa_args = None
 
         # pad the inputs up to the compiled batch size in the end
-        def pad_helper(tensor):
+        def pad_helper(tensor, pad_type="zeros"):
+            VALID_PAD_TYPES = set(["zeros", "ones", "repeat_first_batchline"])
+            assert (
+                pad_type in VALID_PAD_TYPES
+            ), f"Found {pad_type=}, but valid pad types are {VALID_PAD_TYPES}"
             if tensor is None or tensor.shape[0] == self.neuron_config.batch_size:
                 return tensor
 
             padded_shape = list(tensor.shape)
             padded_shape[0] = self.neuron_config.batch_size
-            padded_tensor = torch.zeros(padded_shape, dtype=tensor.dtype)
+            if pad_type == "repeat_first_batchline":
+                # pad with first batch line values instead of zeros, to reduce chances of NaN
+                padded_tensor = tensor[0].unsqueeze(0).repeat(padded_shape[0], 1).to(tensor.dtype)
+            else:
+                fill_value = 0 if pad_type == "zeros" else 1
+                padded_tensor = torch.full(padded_shape, fill_value=fill_value, dtype=tensor.dtype)
             padded_tensor[: tensor.shape[0]] = tensor
             return padded_tensor
 
         padded_args = []
         # pad input_ids, attn_mask and position_ids
         for arg in args[0:3]:
-            padded_args.append(pad_helper(arg))
+            padded_args.append(pad_helper(arg, pad_type="repeat_first_batchline"))
 
         # need to handle seq_ids separately, when compiled batch is 4, if we pad seq_ids from [0,2,1] to [0,2,1,
         # 0]. then the kv cache of padded input could be written into the first cache line, so we need to pad as [0,
@@ -333,6 +343,10 @@ def pad_helper(tensor):
         )
         padded_args.append(padded_seq_ids)
 
+        # pad sampling params by repeating first batchline
+        padded_sampling_params = pad_helper(sampling_params, pad_type="repeat_first_batchline")
+        padded_args.append(padded_sampling_params)
+
         if medusa_args is not None:
             for arg in medusa_args:
                 padded_args.append(pad_helper(arg))
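
The padding change is easiest to see in isolation. The sketch below re-creates the inner helper as a hypothetical standalone function pad_to_compiled_batch (compiled_batch_size stands in for self.neuron_config.batch_size) to show what repeat_first_batchline padding produces for a 2-D input:

import torch


def pad_to_compiled_batch(tensor, compiled_batch_size, pad_type="zeros"):
    # Hypothetical standalone version of the inner pad_helper above.
    assert pad_type in {"zeros", "ones", "repeat_first_batchline"}
    if tensor is None or tensor.shape[0] == compiled_batch_size:
        return tensor
    padded_shape = list(tensor.shape)
    padded_shape[0] = compiled_batch_size
    if pad_type == "repeat_first_batchline":
        # Repeat the first batch line instead of writing zeros; per the
        # in-code comment, this reduces the chance of NaNs in padded rows.
        padded = tensor[0].unsqueeze(0).repeat(padded_shape[0], 1).to(tensor.dtype)
    else:
        fill_value = 0 if pad_type == "zeros" else 1
        padded = torch.full(padded_shape, fill_value=fill_value, dtype=tensor.dtype)
    padded[: tensor.shape[0]] = tensor  # restore the real batch lines at the front
    return padded


input_ids = torch.tensor([[11, 12], [21, 22], [31, 32]])  # runtime batch of 3
padded = pad_to_compiled_batch(input_ids, compiled_batch_size=4, pad_type="repeat_first_batchline")
print(padded)
# tensor([[11, 12],
#         [21, 22],
#         [31, 32],
#         [11, 12]])  <- padded row repeats batch line 0 rather than zeros

Padding with real token values keeps the padded batch lines inside the value range the rest of the graph expects, which matters for inputs like sampling params where a zero temperature would otherwise be invalid.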

src/neuronx_distributed_inference/modules/generation/sampling.py

Lines changed: 58 additions & 2 deletions

@@ -1,12 +1,12 @@
-from typing import Union
+from typing import Any, Dict, Union
 
 import torch
 from neuronx_distributed.operators.argmax import argmax as nxd_argmax
 from neuronx_distributed.operators.topk import topk as nxd_topk
 from neuronx_distributed.parallel_layers import parallel_state
 from torch_neuronx.xla_impl.ops import xla_hlo_call
 
-from neuronx_distributed_inference.models.config import NeuronConfig
+from neuronx_distributed_inference.models.config import NeuronConfig, OnDeviceSamplingConfig
 
 
 @xla_hlo_call
@@ -18,6 +18,62 @@ def rand_like(tensor):
     return dtype[shape].Rng(minimum, maximum, distribution=1)  # Uniform distribution
 
 
+def validate_sampling_params(
+    params: torch.Tensor, on_device_sampling_config: Union[Dict[str, Any], OnDeviceSamplingConfig]
+) -> None:
+    """
+    Validates sampling parameters for language models.
+
+    Args:
+        params (torch.Tensor): Tensor of shape (batch_size, 3) containing sampling parameters
+            in the order: top-k, top-p, temperature.
+        on_device_sampling_config: OnDeviceSamplingConfig or dict providing the global_topk limit.
+
+    Raises:
+        ValueError: If any of the parameters are invalid.
+    """
+    if params.shape[1] != 3:
+        raise ValueError(f"Expected tensor of shape (batch_size, 3), but got {params.shape}")
+
+    # autocast params tensor to float32
+    params = params.to(torch.float32)
+
+    # Unpack parameters
+    top_k, top_p, temperature = params[:, 0], params[:, 1], params[:, 2]
+
+    if isinstance(on_device_sampling_config, OnDeviceSamplingConfig):
+        global_top_k = on_device_sampling_config.global_topk
+    else:
+        global_top_k = on_device_sampling_config["global_topk"]
+
+    # Validate top-k value range
+    valid_top_k = (top_k == -1) | ((top_k > 0) & (top_k <= global_top_k))
+    if not torch.all(valid_top_k):
+        raise ValueError(
+            f"Invalid top-k values found. top-k must be -1 or greater than 0 but less than or equal to {global_top_k=}. Found {top_k=}."
+        )
+
+    # Check that top-k values can be represented as integers
+    if not torch.equal(top_k, top_k.floor()):
+        raise ValueError(
+            f"Invalid top-k values found. top-k values should be able to be represented as integer values, but found decimal parts. Found {top_k=}."
+        )
+
+    # Validate top-p
+    valid_top_p = (top_p > 0.0) & (top_p <= 1.0)
+    if not torch.all(valid_top_p):
+        raise ValueError(
+            f"Invalid top-p values found. top-p must be in the range (0.0, 1.0]. Found {top_p=}."
+        )
+
+    # Validate temperature
+    valid_temp = temperature > 0.0
+    if not torch.all(valid_temp):
+        raise ValueError(
+            f"Invalid temperature values found. Temperature must be strictly greater than 0.0. Found {temperature=}."
+        )
+
+
 def prepare_sampling_params(batch_size, top_k=[1], top_p=[1.0], temperature=[1.0]):
     top_k = prepare_tensor(top_k)
     top_p = prepare_tensor(top_p)
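
A short usage sketch of the new validator, assuming the package is importable; it exercises only behavior visible in the diff, including the dict form of the config with a "global_topk" key:

import torch

from neuronx_distributed_inference.modules.generation.sampling import validate_sampling_params

config = {"global_topk": 256}  # dict form accepted alongside OnDeviceSamplingConfig

# Valid rows: top-k is -1 or in (0, global_topk], top-p in (0.0, 1.0], temperature > 0.
ok = torch.tensor([[50.0, 0.9, 0.7], [-1.0, 1.0, 1.0]])
validate_sampling_params(ok, config)  # returns None, no error raised

# Invalid row: top-p of 0.0 falls outside (0.0, 1.0].
bad = torch.tensor([[50.0, 0.0, 0.7]])
try:
    validate_sampling_params(bad, config)
except ValueError as err:
    print(err)  # "Invalid top-p values found. top-p must be in the range (0.0, 1.0]. ..."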
