Inspect all IDs instead of just loop in ParallelDimensionMap (#3376)

This is important for Hopper MMA (see #3278) in which we only parallelize TIDx on the allocation domain of the MmaOp output. Currently this leads to us generating a usable kernel but we are not able to launch it properly because we can't infer the x dimension of the block size. This PR fixes that by replacing `tv->getLoopDomain()` with `tv->domain()->allIDs()` which will inspect the root, logical, loop, allocation domains and even intermediate IterDomains to try and find parallelized dimensions.
NVIDIA · Nov 8, 2024 · 2aacfd7 · 2aacfd7
1 parent 114e9a1
commit 2aacfd7
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 1 deletion.
diff --git a/csrc/parallel_dimension_map.cpp b/csrc/parallel_dimension_map.cpp
@@ -41,7 +41,7 @@ void ParallelDimensionMap::build(Fusion* fusion) {
   VectorOfUniqueEntries<PAndID> all_concrete_ids;
   auto all_vals = fusion->usedMathVals();
   for (auto tv : ir_utils::filterByType<TensorView>(all_vals)) {
-    for (auto id : tv->getLoopDomain()) {
+    for (auto id : tv->domain()->allIDs()) {
       auto ptype = id->getParallelType();
       if (!isParallelTypeThread(ptype)) {
         continue;

diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
@@ -54,6 +54,7 @@
 #include <algorithm>
 #include <cmath>
 #include <sstream>
+#include "parallel_dimension_map.h"
 
 namespace nvfuser {
 
@@ -8991,6 +8992,32 @@ TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) {
   }
 }
 
+// Test that we are able to infer parallel dimensions even if they are not
+// provided in loop domains. This is important for Hopper MMA since we
+// parallelize TIDx on an allocation domain for the MmaOp output that is not in
+// its loop domain.
+TEST_F(NVFuserTest, ParallelDimensionsInAllocation) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  auto tv0 = makeConcreteTensor({4, 8});
+  fusion.addInput(tv0);
+  auto tv1 = neg(tv0);
+  auto tv2 = exp(tv1);
+  fusion.addOutput(tv2);
+
+  IterDomain* merged_id = IterDomain::merge(tv1->axis(0), tv1->axis(1));
+  tv1->setAllocationDomain({merged_id}, true);
+  merged_id->parallelize(ParallelType::TIDx);
+
+  GpuLower gpulw(&fusion);
+  gpulw.run();
+
+  Val* tidx_dim = gpulw.parallelDimensionMap().get(ParallelType::TIDx);
+  ASSERT_TRUE(tidx_dim != nullptr);
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser