
Commit 5e2d988

q10 authored and facebook-github-bot committed
Update the rowwise adagrad optimizer to leverage optimizer state offloading, v3, backend (pytorch#4133)
Summary:
Pull Request resolved: pytorch#4133
X-link: facebookresearch/FBGEMM#1214

This diff adds support for leveraging optimizer state offloading to make optimizer state updates, starting with the rowwise adagrad optimizer.

- Add compile-time flag `kEnableOptimizerOffloading` to the table update kernel to enable handling optimizer offloading, starting with the rowwise adagrad case
- Propagate the compile-time flag upwards to `embedding_backward_split_template.cu`, where it is a runtime user-supplied boolean argument

Differential Revision: D74827718
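To make the mechanism concrete, here is a minimal host-side C++ sketch (not FBGEMM code) of the pattern the summary describes: the update body is specialized on a compile-time boolean, and a runtime, user-supplied flag selects which instantiation runs. `RowState` and `update_row` are hypothetical stand-ins; only the rowwise adagrad arithmetic mirrors the diff below.

```cpp
// Hedged sketch of compile-time specialization selected by a runtime flag.
#include <cmath>
#include <cstdio>
#include <vector>

struct RowState {
  float momentum;  // optimizer state stored alongside the embedding row
};

template <bool kEnableOptimizerOffloading>
float update_row(float g_avg_square,
                 RowState* row_state,  // state embedded in the cache row
                 float* momentum1,     // standalone momentum tensor
                 int idx,
                 float learning_rate,
                 float eps) {
  float new_sum_square_grads = g_avg_square;
  if constexpr (kEnableOptimizerOffloading) {
    // Read and write the state that lives at the end of the row.
    new_sum_square_grads += row_state->momentum;
    row_state->momentum = new_sum_square_grads;
  } else {
    // Fall back to the separate momentum1 buffer.
    new_sum_square_grads += momentum1[idx];
    momentum1[idx] = new_sum_square_grads;
  }
  return learning_rate / (std::sqrt(new_sum_square_grads) + eps);
}

int main() {
  RowState state{0.5f};
  std::vector<float> momentum1{0.5f};
  const bool enable_optimizer_offloading = true;  // runtime, user-supplied

  // The runtime flag picks the matching compile-time instantiation.
  const float multiplier = enable_optimizer_offloading
      ? update_row<true>(0.25f, &state, momentum1.data(), 0, 0.1f, 1e-8f)
      : update_row<false>(0.25f, &state, momentum1.data(), 0, 0.1f, 1e-8f);

  std::printf("multiplier = %f\n", multiplier);
  return 0;
}
```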
1 parent e0e3e8c commit 5e2d988

7 files changed: +65 −33 lines changed

fbgemm_gpu/codegen/genscript/optimizers.py

Lines changed: 22 additions & 4 deletions
@@ -186,15 +186,32 @@ def rowwise_adagrad() -> Dict[str, Any]:
         g_local_sum_square += gx * gx + gy * gy + gz * gz + gw * gw;
     """
     )
-    split_precomputation += """
+    split_precomputation += """
+    // Define the rowwise adagrad optimizer state struct view
+    struct OptimizerState {
+        at::acc_type<cache_t, true> momentum;
+    };
+
+    // Fetch the pointer to the optimizer state along the cache row
+    [[maybe_unused]] auto* optimizer = weight_row_template.template optimizer_state_ptr<OptimizerState>();
+
     const at::acc_type<cache_t, true> g_avg_square =
         GROUP_REDUCE_ALL_SUM(g_local_sum_square, at::acc_type<cache_t, true>) / D;

     at::acc_type<cache_t, true> multiplier = 0.0;
     at::acc_type<cache_t, true> correction = 0.0;
     if (threadIdx.x == 0) {
-        at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
-        momentum1[idx] = new_sum_square_grads;
+        auto new_sum_square_grads = g_avg_square;
+
+        // Update the optimizer state. Use optimizer state offloading only if enabled
+        if (enable_optimizer_offloading) {
+            new_sum_square_grads += optimizer->momentum;
+            optimizer->momentum = new_sum_square_grads;
+        } else {
+            new_sum_square_grads += momentum1[idx];
+            momentum1[idx] = new_sum_square_grads;
+        }
+
         multiplier = learning_rate / (sqrtf(new_sum_square_grads) + eps);
         if (weight_decay_mode == 1) {
             // L2 regularization
@@ -251,9 +268,10 @@ def rowwise_adagrad() -> Dict[str, Any]:
             OptimItem(ArgType.FLOAT, "weight_decay", 0.0),
             OptimItem(ArgType.INT, "weight_decay_mode", 0),
             OptimItem(ArgType.FLOAT, "max_norm", 0.0),
+            OptimItem(ArgType.BOOL, "enable_optimizer_offloading", False),
         ],
         {
-            "v1": "Tensor momentum1, float eps = 0, float learning_rate = 0, float weight_decay = 0.0, int weight_decay_mode = 0.0, float max_norm = 0.0"
+            "v1": "Tensor momentum1, float eps = 0, float learning_rate = 0, float weight_decay = 0.0, int weight_decay_mode = 0.0, float max_norm = 0.0",
         },
     ),
     "split_precomputation": split_precomputation,

fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp

Lines changed: 2 additions & 2 deletions
@@ -431,9 +431,9 @@ for (const auto d : c10::irange(D)) {
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     {% if not dense %}
-    m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}, int output_dtype = 0) -> ()");
+    m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, bool stochastic_rounding, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host").replace("false", "False")}}, int output_dtype = 0) -> ()");
     {% else %}
-    m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host")}}) -> Tensor");
+    m.def("split_embedding_backward_codegen_{{ optimizer }}_cpu(Tensor grad_output, Tensor(a!) host_weights, Tensor weights_offsets, Tensor D_offsets, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets,int pooling_mode, Tensor indice_weights, {{ (args.split_function_args | join(", ")).replace("double", "float").replace("int64_t", "int").replace("Tensor momentum1_host", "Tensor(b!) momentum1_host").replace("false", "False")}}) -> Tensor");
     {% endif %}
     DISPATCH_TO_CPU("split_embedding_backward_codegen_{{ optimizer }}_cpu", split_embedding_backward_codegen_{{ optimizer }}_cpu);
 }

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp

Lines changed: 7 additions & 0 deletions
@@ -226,6 +226,13 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_cpu(
   // The unified PT2 interface already accepts learning rate as tensor.
   const auto learning_rate_tensor = at::tensor({learning_rate}, at::TensorOptions().dtype(at::kFloat).device(at::kCPU));
   {%- endif %}
+
+  // V1 API is frozen. New features/functionability can only be enabled in V2 API
+  // // New arguments are added here for compatibility
+  {%- if "enable_optimizer_offloading" in args.split_function_arg_names %}
+  const bool enable_optimizer_offloading = false;
+  {%- endif %}
+
   return SplitLookupFunction_{{ optimizer }}_Op::apply(
       host_weights,
       weights_placements,

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 6 additions & 0 deletions
@@ -1094,6 +1094,12 @@ Tensor {{ bwd_mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function(
   learning_rate_tensor.fill_(learning_rate);
   {%- endif %}

+  // V1 API is frozen. New features/functionability can only be enabled in V2 API
+  // New arguments are added here for compatibility
+  {%- if "enable_optimizer_offloading" in args.split_function_arg_names %}
+  const bool enable_optimizer_offloading = false;
+  {%- endif %}
+
   {%- if not dense %}
   // Load the config value from JK once
   static auto is_tbev2_enabled = config::is_feature_enabled(config::FeatureGateName::TBE_V2);
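Both host templates pin the new flag to `false` because the V1 lookup API is frozen. As a rough C++ illustration (not FBGEMM code), the pattern is sketched below: a frozen V1 entry point keeps its existing signature and hard-codes the new option's default, while a V2-style entry point would forward the caller's choice. All function names here are hypothetical.

```cpp
// Hedged sketch of a frozen V1 entry point versus a V2-style one.
#include <cstdio>

// Backend accepting the full argument set.
float lookup_backend(float learning_rate, bool enable_optimizer_offloading) {
  std::printf("offloading=%d\n", enable_optimizer_offloading);
  return learning_rate;
}

// V1 API is frozen: new arguments cannot be added, so the flag is fixed here.
float lookup_v1(float learning_rate) {
  const bool enable_optimizer_offloading = false;  // default until exposed in V2
  return lookup_backend(learning_rate, enable_optimizer_offloading);
}

// A V2-style API would forward the caller's choice.
float lookup_v2(float learning_rate, bool enable_optimizer_offloading) {
  return lookup_backend(learning_rate, enable_optimizer_offloading);
}

int main() {
  lookup_v1(0.1f);
  lookup_v2(0.1f, true);
  return 0;
}
```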

fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu

Lines changed: 1 addition & 2 deletions
@@ -454,8 +454,7 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row
         ph_type_combo,
         kFixedMaxVecsPerThread,
         kThreadGroupSize,
-        kUseVecBlocking
-    )
+        kUseVecBlocking)
   }}
 {%- endfor %}
 {%- endfor %}

fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 23 additions & 25 deletions
@@ -969,7 +969,6 @@ Tensor {{ embedding_cuda_op }}(
   {%- endif %}

   DISPATCH_OPTIMAL_KERNEL(max_D, [&] {
-
     auto long_run_ids = at::empty({indices.numel()}, sorted_linear_indices_run_lengths.options());
     auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));

@@ -1032,18 +1031,17 @@ Tensor {{ embedding_cuda_op }}(
       )
       %}

-      const auto backward_cta_per_row_kernel =
-          {{ cta_kernel }}
-          <emb_t,
-           grad_t,
-           cache_t,
-           index_t,
-           {%- for ph_name in args.placeholder_tensor_names %}
-           {{ ph_name + "_ph_t" }},
-           {%- endfor %}
-           kFixedMaxVecsPerThread,
-           kThreadGroupSize,
-           kUseVecBlocking>;
+      const auto backward_cta_per_row_kernel = {{ cta_kernel }}<
+          emb_t,
+          grad_t,
+          cache_t,
+          index_t,
+          {%- for ph_name in args.placeholder_tensor_names %}
+          {{ ph_name + "_ph_t" }},
+          {%- endfor %}
+          kFixedMaxVecsPerThread,
+          kThreadGroupSize,
+          kUseVecBlocking>;

       // Compute shared memory size for cta_per_row
       constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
@@ -1150,18 +1148,18 @@ Tensor {{ embedding_cuda_op }}(
       desc_suffix,
       )
       %}
-      auto backward_warp_per_row_kernel =
-          {{ warp_kernel }}
-          <emb_t,
-           grad_t,
-           cache_t,
-           index_t,
-           {%- for ph_name in args.placeholder_tensor_names %}
-           {{ ph_name + "_ph_t" }},
-           {%- endfor %}
-           kFixedMaxVecsPerThread,
-           kThreadGroupSize,
-           kUseVecBlocking>;
+
+      const auto backward_warp_per_row_kernel = {{ warp_kernel }}<
+          emb_t,
+          grad_t,
+          cache_t,
+          index_t,
+          {%- for ph_name in args.placeholder_tensor_names %}
+          {{ ph_name + "_ph_t" }},
+          {%- endfor %}
+          kFixedMaxVecsPerThread,
+          kThreadGroupSize,
+          kUseVecBlocking>;

       // Compute shared memory size for warp_per_row
       int32_t num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;

fbgemm_gpu/codegen/training/python/split_embedding_codegen_lookup_invoker.template

Lines changed: 4 additions & 0 deletions
@@ -352,7 +352,11 @@ def invoke(
     {%- if "optim_bool" in args_pt2.unified_pt2.split_function_arg_names %}
     optim_bool: List[bool] = []
     {%- for name in args_pt2.unified_pt2.split_args_dict["optim_bool"] %}
+    {%- if name == "enable_optimizer_offloading" %} # TODO: Remove this when the frontend lands
+    optim_bool.append(False)
+    {%- else %}
     optim_bool.append(dict_optim_bool["{{ name }}"])
+    {%- endif %}
     {%- endfor %}
     {%- endif %}
