From 6b5959f4e59353f04cf695df1c03618ee710f2cc Mon Sep 17 00:00:00 2001
From: huhai463127310
Date: Sat, 30 Aug 2025 00:21:01 +0800
Subject: [PATCH] fix python, cpp_ext and cuda_ext build errors for Visual Studio 2022 on Windows 11

Building the plain Python package and the cpp_ext/cuda_ext extensions with
MSVC (Visual Studio 2022 on Windows 11) currently fails. Switch the
at::empty() calls from the deprecated Tensor::type() to Tensor::options(),
use uint32_t instead of the non-standard uint typedef in the MLP CUDA
kernels, replace isfinite() with explicit !isnan()/!isinf() checks in the
multi_tensor kernels, define _ENABLE_EXTENDED_ALIGNED_STORAGE for the
extension builds, and link cublas/cublasLt explicitly in setup.py.
---
 csrc/mlp.cpp                      | 10 ++---
 csrc/mlp_cuda.cu                  | 15 +++---
 csrc/multi_tensor_axpby_kernel.cu | 12 +++---
 csrc/multi_tensor_scale_kernel.cu |  4 +-
 setup.py                          | 67 +++++++++++++++++--------------
 5 files changed, 57 insertions(+), 51 deletions(-)

diff --git a/csrc/mlp.cpp b/csrc/mlp.cpp
index 772e73091..088b7743e 100644
--- a/csrc/mlp.cpp
+++ b/csrc/mlp.cpp
@@ -61,10 +61,10 @@ std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
   auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());
 
   // create output/workspace tensors
-  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
-  auto reserved_space = at::empty({static_cast<int64_t>(reserved_size)}, inputs[0].type());
+  auto out = at::empty({batch_size, output_features.back()}, inputs[0].options());
+  auto reserved_space = at::empty({static_cast<int64_t>(reserved_size)}, inputs[0].options());
   // allocate fixed 4MB workspace for cublaslt for now, and this gets at least 4 MB
-  auto lt_workspace = at::empty({1 << 22}, inputs[0].type());
+  auto lt_workspace = at::empty({1 << 22}, inputs[0].options());
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].scalar_type(), "mlp_forward", [&] {
     std::vector<scalar_t*> w_ptr;
@@ -118,7 +118,7 @@ std::vector<at::Tensor> mlp_backward(
   // create outputs, length of inputs
   std::vector<at::Tensor> outputs;
   for (int i = 0; i < inputs.size(); i++) {
-    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type())); // clone for testing now
+    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].options())); // clone for testing now
   }
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].scalar_type(), "mlp_backward", [&] {
@@ -135,7 +135,7 @@ std::vector<at::Tensor> mlp_backward(
         get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());
 
     // auto work_space = at::empty({work_size*4}, at::kByte);
-    auto work_space = at::empty({static_cast<int64_t>(work_size / sizeof(scalar_t))}, inputs[0].type());
+    auto work_space = at::empty({static_cast<int64_t>(work_size / sizeof(scalar_t))}, inputs[0].options());
 
     auto result = mlp_bp<scalar_t>(
         inputs[0].data_ptr<scalar_t>(),
diff --git a/csrc/mlp_cuda.cu b/csrc/mlp_cuda.cu
index f93f1df1a..6aba5094f 100644
--- a/csrc/mlp_cuda.cu
+++ b/csrc/mlp_cuda.cu
@@ -1,6 +1,7 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <assert.h>
+#include <cstdint>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -434,7 +435,7 @@ CLEANUP:
 // Bias ADD. Assume input X is [features x batch size], column major.
 // Bias is one 'features' long vector, with implicit broadcast.
 template <typename T>
-__global__ void biasAdd_fprop(T *X, T *b, uint batch_size, uint features) {
+__global__ void biasAdd_fprop(T *X, T *b, uint32_t batch_size, uint32_t features) {
   T r_x[ILP];
   T r_b[ILP];
   if(is_aligned(X) && is_aligned(b) && features % ILP ==0) {
@@ -481,7 +482,7 @@ __global__ void biasAdd_fprop(T *X, T *b, uint batch_size, uint features) {
 // Bias ADD + ReLU. Assume input X is [features x batch size], column major.
 // Activation support fuesed ReLU. Safe to call in-place.
 template <typename T>
-__global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
+__global__ void biasAddRelu_fprop(T *X, T *b, uint32_t batch_size, uint32_t features) {
   T r_x[ILP];
   T r_b[ILP];
   if(is_aligned(X) && is_aligned(b) && features % ILP ==0) {
@@ -528,7 +529,7 @@ __global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
 // ReLU. Assume input X is [features x batch size], column major.
 // Safe to call in-place.
 template <typename T>
-__global__ void Relu_fprop(T *X, uint batch_size, uint features) {
+__global__ void Relu_fprop(T *X, uint32_t batch_size, uint32_t features) {
   T r_x[ILP];
   if(is_aligned(X) && features % ILP ==0) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -568,7 +569,7 @@ __global__ void Relu_fprop(T *X, uint batch_size, uint features) {
 // Sigmoid. Assume input X is [features x batch size], column major.
 // Safe to call in-place.
 template <typename T>
-__global__ void Sigmoid_fprop(T *X, uint batch_size, uint features) {
+__global__ void Sigmoid_fprop(T *X, uint32_t batch_size, uint32_t features) {
   T r_x[ILP];
   if(is_aligned(X) && features % ILP ==0) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -608,7 +609,7 @@ __global__ void Sigmoid_fprop(T *X, uint batch_size, uint features) {
 // ReLU. Assume input X is [features x batch size], column major.
 // Safe to call in-place.
 template <typename T>
-__global__ void Relu_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
+__global__ void Relu_bprop(T *dY, T *Y, uint32_t batch_size, uint32_t features, T *dX) {
   T r_dy[ILP];
   T r_y[ILP];
   if(is_aligned(dY) &&
@@ -656,7 +657,7 @@ __global__ void Relu_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
 // Sigmoid. Assume input X is [features x batch size], column major.
 // Safe to call in-place.
 template <typename T>
-__global__ void Sigmoid_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
+__global__ void Sigmoid_bprop(T *dY, T *Y, uint32_t batch_size, uint32_t features, T *dX) {
   T r_dy[ILP];
   T r_y[ILP];
   if(is_aligned(dY) &&
@@ -1324,7 +1325,7 @@ int mlp_fp(
       return 1;
     }
 
-    const uint &input_size = ofeat;
+    const uint32_t &input_size = ofeat;
     int num_blocks = 0;
     int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
     // Call biasReLU
diff --git a/csrc/multi_tensor_axpby_kernel.cu b/csrc/multi_tensor_axpby_kernel.cu
index 021df27d7..45cbbacda 100644
--- a/csrc/multi_tensor_axpby_kernel.cu
+++ b/csrc/multi_tensor_axpby_kernel.cu
@@ -72,11 +72,11 @@ struct AxpbyFunctor
         {
          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
          if(arg_to_check == -1)
-            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+            finite = finite && (!isnan(static_cast<float>(r_x[ii])) && !isinf(static_cast<float>(r_x[ii])) && !isnan(static_cast<float>(r_y[ii])) && !isinf(static_cast<float>(r_y[ii])));
          if(arg_to_check == 0)
-            finite = finite && isfinite(r_x[ii]);
+            finite = finite && !isnan(static_cast<float>(r_x[ii])) && !isinf(static_cast<float>(r_x[ii]));
          if(arg_to_check == 1)
-            finite = finite && isfinite(r_y[ii]);
+            finite = finite && !isnan(static_cast<float>(r_y[ii])) && !isinf(static_cast<float>(r_y[ii]));
         }
         // store
         load_store(out, r_out, i_start , 0);
@@ -104,11 +104,11 @@ struct AxpbyFunctor
         {
          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
          if(arg_to_check == -1)
-            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+            finite = finite && (!isnan(static_cast<float>(r_x[ii])) && !isinf(static_cast<float>(r_x[ii])) && !isnan(static_cast<float>(r_y[ii])) && !isinf(static_cast<float>(r_y[ii])));
          if(arg_to_check == 0)
-            finite = finite && isfinite(r_x[ii]);
+            finite = finite && !isnan(static_cast<float>(r_x[ii])) && !isinf(static_cast<float>(r_x[ii]));
          if(arg_to_check == 1)
-            finite = finite && isfinite(r_y[ii]);
+            finite = finite && !isnan(static_cast<float>(r_y[ii])) && !isinf(static_cast<float>(r_y[ii]));
         }
         // see note in multi_tensor_scale_kernel.cu
 #pragma unroll
diff --git a/csrc/multi_tensor_scale_kernel.cu b/csrc/multi_tensor_scale_kernel.cu
index 4fd848bcd..946a63bf9 100644
--- a/csrc/multi_tensor_scale_kernel.cu
+++ b/csrc/multi_tensor_scale_kernel.cu
@@ -66,7 +66,7 @@ struct ScaleFunctor
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = static_cast<float>(r_in[ii]) * scale;
-          finite = finite && isfinite(r_in[ii]);
+          finite = finite && !isnan(static_cast<float>(r_in[ii])) && !isinf(static_cast<float>(r_in[ii]));
         }
         // store
         load_store(out, r_out, i_start, 0);
@@ -94,7 +94,7 @@ struct ScaleFunctor
         for(int ii = 0; ii < ILP; ii++)
         {
           r_out[ii] = static_cast<float>(r_in[ii]) * scale;
-          finite = finite && isfinite(r_in[ii]);
+          finite = finite && !isnan(static_cast<float>(r_in[ii])) && !isinf(static_cast<float>(r_in[ii]));
         }
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
diff --git a/setup.py b/setup.py
index c8675ef5c..84852f441 100644
--- a/setup.py
+++ b/setup.py
@@ -184,7 +184,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             ],
             include_dirs=[os.path.join(this_dir, "csrc")],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-O3", "--use_fast_math"],
             },
         )
@@ -203,7 +203,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             ],
             include_dirs=[os.path.join(this_dir, "csrc")],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-O3", "--use_fast_math"],
             },
         )
@@ -236,10 +236,11 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "csrc/update_scale_hysteresis.cu",
             ],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": [
                     "-lineinfo",
                     "-O3",
+                    "-D_ENABLE_EXTENDED_ALIGNED_STORAGE",
                     # '--resource-usage',
                     "--use_fast_math",
                 ],
@@ -251,7 +252,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             name="syncbn",
             sources=["csrc/syncbn.cpp", "csrc/welford.cu"],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-O3"],
             },
         )
@@ -262,7 +263,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             name="fused_layer_norm_cuda",
             sources=["csrc/layer_norm_cuda.cpp", "csrc/layer_norm_cuda_kernel.cu"],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-maxrregcount=50", "-O3", "--use_fast_math"],
             },
         )
@@ -272,9 +273,10 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
         CUDAExtension(
             name="mlp_cuda",
             sources=["csrc/mlp.cpp", "csrc/mlp_cuda.cu"],
+            libraries=["cublas", "cublasLt"],
             extra_compile_args={
-                "cxx": ["-O3"],
-                "nvcc": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
+                "nvcc": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
             },
         )
     )
@@ -283,9 +285,10 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
         CUDAExtension(
             name="fused_dense_cuda",
             sources=["csrc/fused_dense.cpp", "csrc/fused_dense_cuda.cu"],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-O3"],
             },
+            libraries=["cublas", "cublasLt"],
         )
     )
@@ -298,7 +301,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             ],
             include_dirs=[os.path.join(this_dir, "csrc")],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": [
                     "-O3",
                     "-U__CUDA_NO_HALF_OPERATORS__",
@@ -397,8 +400,9 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "csrc/megatron/fused_weight_gradient_dense_cuda.cu",
                 "csrc/megatron/fused_weight_gradient_dense_16bit_prec_cuda.cu",
             ],
+            libraries=["cublas", "cublasLt"],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": [ "-O3", "-U__CUDA_NO_HALF_OPERATORS__", @@ -423,7 +427,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int CUDAExtension(name='permutation_search_cuda', sources=['apex/contrib/sparsity/permutation_search_kernels/CUDA_kernels/permutation_search_kernels.cu'], include_dirs=[os.path.join(this_dir, 'apex', 'contrib', 'sparsity', 'permutation_search_kernels', 'CUDA_kernels')], - extra_compile_args={'cxx': ['-O3'], + extra_compile_args={'cxx': ['-O3', '-D_ENABLE_EXTENDED_ALIGNED_STORAGE'], 'nvcc':['-O3'] + cc_flag})) if has_flag("--bnp", "APEX_BNP"): @@ -441,7 +445,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, "csrc")], extra_compile_args={ - "cxx": [], + "cxx": ["-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", @@ -465,7 +469,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int sources=["apex/contrib/csrc/xentropy/interface.cpp", "apex/contrib/csrc/xentropy/xentropy_kernel.cu"], include_dirs=[os.path.join(this_dir, "csrc")], extra_compile_args={ - "cxx": ["-O3"] + [f'-DXENTROPY_VER="{xentropy_ver}"'], + "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + [f'-DXENTROPY_VER="{xentropy_ver}"'], "nvcc": ["-O3"], }, ) @@ -484,7 +488,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, 'csrc')], extra_compile_args={ - 'cxx': ['-O3'], + 'cxx': ['-O3', '-D_ENABLE_EXTENDED_ALIGNED_STORAGE'], 'nvcc':['-O3', '--use_fast_math', '--ftz=false'], }, ) @@ -503,7 +507,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ] + glob.glob("apex/contrib/csrc/group_norm/*.cu"), include_dirs=[os.path.join(this_dir, 'csrc')], extra_compile_args={ - "cxx": ["-O3", "-std=c++17"], + "cxx": ["-O3", "-std=c++17", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": [ "-O3", "-std=c++17", "--use_fast_math", "--ftz=false", ], @@ -526,7 +530,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int "apex/contrib/csrc/group_norm_v2/gn_utils.cpp", ] + glob.glob("apex/contrib/csrc/group_norm_v2/gn_cuda_inst_*.cu"), extra_compile_args={ - "cxx": ["-O2"], + "cxx": ["-O2", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": [ "-O2", "--use_fast_math", "--ftz=false", "-U__CUDA_NO_HALF_CONVERSIONS__", @@ -551,7 +555,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, 'csrc')], extra_compile_args={ - 'cxx': ['-O3'], + 'cxx': ['-O3', '-D_ENABLE_EXTENDED_ALIGNED_STORAGE'], 'nvcc':['-O3', '--use_fast_math', '--ftz=false'], }, ) @@ -570,7 +574,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, "csrc")], extra_compile_args={ - "cxx": ["-O3"], + "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": ["-O3", "--use_fast_math"], }, ) @@ -590,7 +594,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, "csrc")], extra_compile_args={ - "cxx": ["-O3"], + "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"], "nvcc": ["-O3", "--use_fast_math"], }, ) @@ -617,7 +621,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int "apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu", ], extra_compile_args={ - "cxx": ["-O3"] + 
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag,
                 "nvcc": [
                     "-O3",
                     "-U__CUDA_NO_HALF_OPERATORS__",
@@ -676,7 +680,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu",
             ],
             extra_compile_args={
-                "cxx": ["-O3"] + generator_flag,
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag,
                 "nvcc": [
                     "-O3",
                     "-U__CUDA_NO_HALF_OPERATORS__",
@@ -715,7 +719,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu",
             ],
             extra_compile_args={
-                "cxx": ["-O3"] + generator_flag,
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag,
                 "nvcc": [
                     "-O3",
                     "-U__CUDA_NO_HALF_OPERATORS__",
@@ -729,6 +733,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 os.path.join(this_dir, "apex/contrib/csrc/multihead_attn/cutlass/include/"),
                 os.path.join(this_dir, "apex/contrib/csrc/multihead_attn/cutlass/tools/util/include")
             ],
+            libraries=["cublas", "cublasLt"],
         )
     )
 
@@ -744,7 +749,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "apex/contrib/csrc/transducer/transducer_joint_kernel.cu",
             ],
             extra_compile_args={
-                "cxx": ["-O3"] + generator_flag,
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag,
                 "nvcc": ["-O3"] + generator_flag,
             },
             include_dirs=[os.path.join(this_dir, "csrc"), os.path.join(this_dir, "apex/contrib/csrc/multihead_attn")],
@@ -759,7 +764,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
             ],
             include_dirs=[os.path.join(this_dir, "csrc")],
             extra_compile_args={
-                "cxx": ["-O3"],
+                "cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"],
                 "nvcc": ["-O3"],
             },
         )
@@ -779,7 +784,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "apex/contrib/csrc/cudnn_gbn/cudnn_gbn.cpp",
             ],
             include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")],
-            extra_compile_args={"cxx": ["-O3", "-g"] + generator_flag},
+            extra_compile_args={"cxx": ["-O3", "-g", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag},
         )
     )
 
@@ -794,7 +799,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                 "apex/contrib/csrc/peer_memory/peer_memory_cuda.cu",
                 "apex/contrib/csrc/peer_memory/peer_memory.cpp",
             ],
-            extra_compile_args={"cxx": ["-O3"] + generator_flag},
+            extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag},
         )
     )
 
@@ -818,7 +823,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
                     "apex/contrib/csrc/nccl_p2p/nccl_p2p_cuda.cu",
                     "apex/contrib/csrc/nccl_p2p/nccl_p2p.cpp",
                 ],
-                extra_compile_args={"cxx": ["-O3"] + generator_flag},
+                extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag},
             )
         )
     else:
@@ -838,7 +843,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
         CUDAExtension(
             name="fast_bottleneck",
             sources=["apex/contrib/csrc/bottleneck/bottleneck.cpp"],
             include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")],
-            extra_compile_args={"cxx": ["-O3"] + generator_flag},
+            extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag},
         )
     )
@@ -854,7 +859,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int
         CUDAExtension(
             name="fused_conv_bias_relu",
sources=["apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp"], include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/cudnn-frontend/include")], - extra_compile_args={"cxx": ["-O3"] + generator_flag}, + extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag}, ) ) @@ -877,7 +882,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int ], include_dirs=[os.path.join(this_dir, "apex/apex/contrib/csrc/nccl_allocator")], libraries=["nccl"], - extra_compile_args={"cxx": ["-O3"] + generator_flag}, + extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag}, ) ) else: @@ -896,7 +901,7 @@ def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int sources=["apex/contrib/csrc/gpu_direct_storage/gds.cpp", "apex/contrib/csrc/gpu_direct_storage/gds_pybind.cpp"], include_dirs=[os.path.join(this_dir, "apex/contrib/csrc/gpu_direct_storage")], libraries=["cufile"], - extra_compile_args={"cxx": ["-O3"] + generator_flag}, + extra_compile_args={"cxx": ["-O3", "-D_ENABLE_EXTENDED_ALIGNED_STORAGE"] + generator_flag}, ) )