Skip to content

Commit

Permalink
Inspect all IDs instead of just loop in ParallelDimensionMap (#3376)
Browse files Browse the repository at this point in the history
This is important for Hopper MMA (see #3278), where we parallelize TIDx
only on the allocation domain of the MmaOp output. Currently we generate
a usable kernel in that case, but we cannot launch it properly because
we are unable to infer the x dimension of the block size. This PR fixes
that by replacing `tv->getLoopDomain()` with `tv->domain()->allIDs()`,
which inspects the root, logical, loop, and allocation domains, as well
as any intermediate IterDomains, to find parallelized dimensions.
  • Loading branch information
jacobhinkle authored Nov 8, 2024
1 parent 114e9a1 commit 2aacfd7
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 1 deletion.
2 changes: 1 addition & 1 deletion csrc/parallel_dimension_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void ParallelDimensionMap::build(Fusion* fusion) {
VectorOfUniqueEntries<PAndID> all_concrete_ids;
auto all_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(all_vals)) {
for (auto id : tv->getLoopDomain()) {
for (auto id : tv->domain()->allIDs()) {
auto ptype = id->getParallelType();
if (!isParallelTypeThread(ptype)) {
continue;
Expand Down
27 changes: 27 additions & 0 deletions tests/cpp/test_gpu3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include <algorithm>
#include <cmath>
#include <sstream>
#include "parallel_dimension_map.h"

namespace nvfuser {

Expand Down Expand Up @@ -8991,6 +8992,32 @@ TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) {
}
}

// Checks that parallel dimensions can still be inferred when the parallelized
// IterDomain is not part of a tensor's loop domain. Hopper MMA depends on
// this: the MmaOp output is parallelized on TIDx through an allocation domain
// that does not appear in its loop domain.
TEST_F(NVFuserTest, ParallelDimensionsInAllocation) {
  auto owned_fusion = std::make_unique<Fusion>();
  Fusion& fusion = *owned_fusion;
  FusionGuard guard(owned_fusion.get());

  // Build the fusion: input -> neg -> exp -> output.
  auto in_tv = makeConcreteTensor({4, 8});
  fusion.addInput(in_tv);
  auto neg_tv = neg(in_tv);
  auto out_tv = exp(neg_tv);
  fusion.addOutput(out_tv);

  // Merge the two axes of the intermediate tensor into a single IterDomain,
  // use it as the allocation domain, and only then bind it to TIDx.
  IterDomain* flat_id = IterDomain::merge(neg_tv->axis(0), neg_tv->axis(1));
  neg_tv->setAllocationDomain({flat_id}, true);
  flat_id->parallelize(ParallelType::TIDx);

  GpuLower lowering(&fusion);
  lowering.run();

  // After lowering, the parallel dimension map must hold an entry for TIDx.
  Val* tidx_entry = lowering.parallelDimensionMap().get(ParallelType::TIDx);
  ASSERT_TRUE(tidx_entry != nullptr);
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser

0 comments on commit 2aacfd7

Please sign in to comment.