WIP: basic lowering of for-loop to scf.for is working #2. 1) need to fix some issues with lowering of nested loops, 2) need to add support for sparse ops
rizwanashraf committed Dec 7, 2022
1 parent 6145d20 commit 43054e2
Showing 4 changed files with 377 additions and 0 deletions.
1 change: 1 addition & 0 deletions frontends/comet_dsl/comet.cpp
@@ -338,6 +338,7 @@ int loadAndProcessMLIR(mlir::MLIRContext &context,
// =============================================================================
// Lowering of other operations such as transpose, sum, etc. to SCF dialect
// =============================================================================
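// lower for-loop marker ops (ta.ForLoopBeginOp/ForLoopEndOp) to scf.for; presumably
// ordered before the index-tree lowering below so that index-tree ops cloned into
// the new loop bodies are still lowered to SCF.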
optPM.addPass(mlir::tensorAlgebra::createPCToLoopsLoweringPass());
optPM.addPass(mlir::IndexTree::createLowerIndexTreeIRToSCFPass());
optPM.addPass(mlir::tensorAlgebra::createSUMLowerToSCFPass());
// If it is a transpose of dense tensor, the rewrites rules replaces ta.transpose with linalg.copy.
3 changes: 3 additions & 0 deletions include/comet/Dialect/TensorAlgebra/Passes.h
@@ -91,6 +91,9 @@ namespace mlir
/// Create a pass for lowering sparse TA operations to SCFDimAttrGPU
std::unique_ptr<Pass> createSCFToSCFParallelPass();

/// Create a pass for lowering programming constructs to SCF ops
std::unique_ptr<Pass> createPCToLoopsLoweringPass();

} // end namespace tensorAlgebra
} // end namespace mlir

1 change: 1 addition & 0 deletions lib/Dialect/TensorAlgebra/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(COMETTensorAlgebra
Conversion/SUMLowerToSCF.cpp
Conversion/TransposeLowering.cpp
Conversion/LateLowering.cpp
Conversion/LowerPCToLoops.cpp

Transforms/Transforms.cpp
Transforms/LinalgTransforms.cpp
372 changes: 372 additions & 0 deletions lib/Dialect/TensorAlgebra/Conversion/LowerPCToLoops.cpp
@@ -0,0 +1,372 @@
//===- LowerPCToLoops.cpp - Lower programming constructs to loops --------===//
//
// Copyright 2022 Battelle Memorial Institute
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions
// and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
// and the following disclaimer in the documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//===----------------------------------------------------------------------===//
//
// This file implements the lowering of programming constructs, such as for-loops, to SCF ops.
//===----------------------------------------------------------------------===//

#include "comet/Dialect/TensorAlgebra/IR/TADialect.h"
#include "comet/Dialect/TensorAlgebra/Passes.h"
#include "comet/Dialect/IndexTree/IR/ITDialect.h"
#include "comet/Dialect/Utils/Utils.h"

#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"

#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"

#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/Sequence.h"

#include "mlir/EDSC/Builders.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"

#include "llvm/Support/Debug.h"

#include <queue>
#include <vector>

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using namespace mlir::linalg;

using namespace mlir::tensorAlgebra;
using namespace mlir::indexTree;

#define DEBUG_TYPE "PC-lowering"

// *********** For debug purpose *********//
// #ifndef DEBUG_MODE_PCLoweringPass
// #define DEBUG_MODE_PCLoweringPass
// #endif

#ifdef DEBUG_MODE_PCLoweringPass
#define comet_debug() llvm::errs() << __FILE__ << " " << __LINE__ << " "
#define comet_pdump(n) \
llvm::errs() << __FILE__ << " " << __LINE__ << " "; \
n->dump()
#define comet_vdump(n) \
llvm::errs() << __FILE__ << " " << __LINE__ << " "; \
n.dump()
#else
#define comet_debug() llvm::nulls()
#define comet_pdump(n)
#define comet_vdump(n)
#endif
// *********** For debug purpose *********//

namespace
{
struct PCToLoopsLoweringPass
: public PassWrapper<PCToLoopsLoweringPass, FunctionPass>
{
void runOnFunction() final;

/// Lowers a ForLoopBeginOp/ForLoopEndOp pair to scf.for, one loop at a time.
/// Pre-condition: listOps is the output of ProcessLoopOps().
void PCToLoopsLowering(tensorAlgebra::ForLoopBeginOp op, tensorAlgebra::ForLoopEndOp op_end, std::vector<Operation *> &listOps);

/// Finds all ops to be placed inside the loop body.
std::vector<Operation *> ProcessLoopOps(tensorAlgebra::ForLoopBeginOp op_start, tensorAlgebra::ForLoopEndOp op_end);

/// Replicates ops for the loop body.
/// Pre-condition: some ops depend on previously replicated ops, which are
/// passed in and updated via the Value reference parameters.
void replicateOpsForLoopBody(Location loc, OpBuilder &builder, Operation *op, Value &rhs, Value &lhs, Value &compute, Value &indices, Value &transpose);
};
} // end anonymous namespace.
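
// Overall flow: runOnFunction() collects matching ForLoopBeginOp/ForLoopEndOp
// marker pairs, ProcessLoopOps() gathers the ops between a matched pair, and
// PCToLoopsLowering() builds an scf.for, replicates the gathered ops into its
// body via replicateOpsForLoopBody(), then erases the originals and the markers.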

// find all ops to be placed inside the loop body.
std::vector<Operation *> PCToLoopsLoweringPass::ProcessLoopOps(tensorAlgebra::ForLoopBeginOp op_start, tensorAlgebra::ForLoopEndOp op_end)
{
comet_debug() << "START: Pre-processing to detect ops to be placed inside loop body.\n";
comet_pdump(op_start);

std::vector<Operation *> loop_blk;

auto *B = op_start.getOperation()->getBlock();
bool match = false;
// collect all ops that form `one` loop body
for (Operation &op : *B)
{
if (isa<tensorAlgebra::ForLoopBeginOp>(&op))
{
// check for match
if (cast<tensorAlgebra::ForLoopBeginOp>(op).iterator() == op_start.iterator())
{
match = true;
continue; // skip the loop-begin marker itself
}
}

if (isa<tensorAlgebra::ForLoopEndOp>(&op))
{
if (cast<tensorAlgebra::ForLoopEndOp>(op) == op_end)
{
match = false;
break; // done collecting `one` loop body
}
}

if (match)
{ // add to list of ops to be replicated
loop_blk.push_back(&op);
}
}

comet_debug() << "END: Pre-processing to detect ops to be placed inside loop body.\n";
return loop_blk;
}
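
// Conceptually (marker syntax simplified), given a block containing
//   ta.ForLoopBeginOp  <- op_start
//   <op A>
//   <op B>
//   ta.ForLoopEndOp    <- op_end
// loop_blk comes back holding <op A> and <op B>; the markers themselves are
// excluded.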

/// Replicates ops for the loop body.
/// Pre-condition: some ops depend on previously replicated ops, which are
/// passed in and updated via the Value reference parameters.
void PCToLoopsLoweringPass::replicateOpsForLoopBody(Location loc, OpBuilder &builder, Operation *op,
Value &rhs, Value &lhs, Value &compute, Value &indices, Value &transpose)
{
IntegerType i64Type = IntegerType::get(builder.getContext(), 64);

if (isa<tensorAlgebra::TransposeOp>(op))
{
comet_debug() << "creating TransposeOp\n";
tensorAlgebra::TransposeOp ta_transpose_op = llvm::dyn_cast<tensorAlgebra::TransposeOp>(op);
std::vector<Value> lhs_lbls_value = {ta_transpose_op.getOperand(1), ta_transpose_op.getOperand(2)};
transpose = builder.create<tensorAlgebra::TransposeOp>(loc, ta_transpose_op.getOperand(0).getType(), ta_transpose_op.getOperand(0),
lhs_lbls_value, ta_transpose_op.indexing_maps(), ta_transpose_op.formats());
}

// TensorSetOp goes with TransposeOp at this stage of the lowering.
if (isa<tensorAlgebra::TensorSetOp>(op) && transpose != NULL)
{
comet_debug() << "creating SetOp after TransposeOp\n";
tensorAlgebra::TensorSetOp ta_set_op = llvm::dyn_cast<tensorAlgebra::TensorSetOp>(op);
builder.create<TensorSetOp>(loc, transpose, ta_set_op.getOperand(1));
}

// create IndexTreeComputeRHSOp, no dependency on earlier replications
if (isa<indexTree::IndexTreeComputeRHSOp>(op))
{
indexTree::IndexTreeComputeRHSOp it_compute_rhs_op = llvm::dyn_cast<indexTree::IndexTreeComputeRHSOp>(op);
ArrayAttr op_formats_ArrayAttr = it_compute_rhs_op.allFormats();
ArrayAttr op_perms_ArrayAttr = it_compute_rhs_op.allPerms();

rhs = builder.create<indexTree::IndexTreeComputeRHSOp>(loc, mlir::UnrankedTensorType::get(builder.getF64Type()),
it_compute_rhs_op->getOperands(), // tensors
op_perms_ArrayAttr, op_formats_ArrayAttr);
}

// create IndexTreeComputeLHSOp, no dependency on earlier replications
if (isa<indexTree::IndexTreeComputeLHSOp>(op))
{
indexTree::IndexTreeComputeLHSOp it_compute_lhs_op = llvm::dyn_cast<indexTree::IndexTreeComputeLHSOp>(op);
ArrayAttr op_formats_ArrayAttr = it_compute_lhs_op.allFormats();
ArrayAttr op_perms_ArrayAttr = it_compute_lhs_op.allPerms();

lhs = builder.create<indexTree::IndexTreeComputeLHSOp>(loc, mlir::UnrankedTensorType::get(builder.getF64Type()),
it_compute_lhs_op->getOperands(), // tensors
op_perms_ArrayAttr, op_formats_ArrayAttr);
}

// create IndexTreeComputeOp only if rhs and lhs are ready
if (isa<indexTree::IndexTreeComputeOp>(op) && rhs != NULL && lhs != NULL)
{
indexTree::IndexTreeComputeOp it_compute_op = llvm::dyn_cast<indexTree::IndexTreeComputeOp>(op);
compute = builder.create<indexTree::IndexTreeComputeOp>(loc, i64Type, rhs, lhs,
it_compute_op.comp_worksp_opt(), it_compute_op.semiring());
}

// create IndexTreeIndicesOp from existing IndexTreeIndicesOp (checks condition: indices != NULL)
if (isa<indexTree::IndexTreeIndicesOp>(op) && indices != NULL)
{
indexTree::IndexTreeIndicesOp it_indices_op = llvm::dyn_cast<indexTree::IndexTreeIndicesOp>(op);
Value indices_op_new = builder.create<indexTree::IndexTreeIndicesOp>(loc, i64Type, indices, it_indices_op.indices());

indices = indices_op_new; // for subsequent IndexTreeIndicesOp creation
}

// create the first instance of IndexTreeIndicesOp from IndexTreeComputeOp
if (isa<indexTree::IndexTreeIndicesOp>(op) && compute != NULL && indices == NULL)
{
indexTree::IndexTreeIndicesOp it_indices_op = llvm::dyn_cast<indexTree::IndexTreeIndicesOp>(op);
indices = builder.create<indexTree::IndexTreeIndicesOp>(loc, i64Type, compute, it_indices_op.indices());
}

if (isa<indexTree::IndexTreeOp>(op) && indices != NULL)
{
builder.create<indexTree::IndexTreeOp>(loc, i64Type, indices);
}
}
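
// Note the ordering the null checks above encode: the RHS/LHS ops replicate
// unconditionally, an IndexTreeComputeOp only once both rhs and lhs exist,
// an IndexTreeIndicesOp only once the compute op (or a previous indices op)
// exists, and the root IndexTreeOp last. This relies on the caller visiting
// ops in their original program order.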

/// Lowers a ForLoopBeginOp/ForLoopEndOp pair to scf.for, one loop at a time.
void PCToLoopsLoweringPass::PCToLoopsLowering(tensorAlgebra::ForLoopBeginOp op, tensorAlgebra::ForLoopEndOp op_end, std::vector<Operation *> &listOps)
{
comet_debug() << "PCToLoopsLowering start\n";
OpBuilder builder(op);

comet_pdump(op);
comet_pdump(op_end);

auto loc = op->getLoc();
auto ForLoopStart = cast<tensorAlgebra::ForLoopBeginOp>(op);

// get info of loop
auto upperBound = ForLoopStart.max();
auto lowerBound = ForLoopStart.min();
auto step = ForLoopStart.step();

auto loop = builder.create<scf::ForOp>(op_end->getLoc(), lowerBound, upperBound, step);
comet_vdump(loop);

auto insertPt = builder.saveInsertionPoint();
builder.setInsertionPointToStart(loop.getBody());
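// from here on, new ops are created inside the body of the freshly built scf.for.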

// loop-body: listOps contains all ops obtained through ProcessLoopOps()
// to be placed inside 'one' loop-body.
// TODO: verify all ops are covered here!
Value new_rhs_op, new_lhs_op, new_compute_op, indices_op;
Value transpose_op;
std::vector<Value> loop_bounds; // collects cloned bound constants; consumed as (lb, ub, step) of a nested loop
for (unsigned int i = 0; i < listOps.size(); i++)
{
comet_pdump(listOps[i]);

replicateOpsForLoopBody (loc, builder, listOps[i], new_rhs_op, new_lhs_op, new_compute_op, indices_op, transpose_op);

// nested loop
if (isa<mlir::ConstantIndexOp>(listOps[i]))
{
mlir::ConstantIndexOp constant_index = llvm::dyn_cast<mlir::ConstantIndexOp>(listOps[i]);
loop_bounds.push_back(builder.create<mlir::ConstantIndexOp>(loc, constant_index.getValue()));
}

// scf::for op
// the inner loop bodies have already been created. preserve them.
// the loop_bounds array should have lowerBound, upperBound and step Values.
if (isa<scf::ForOp>(listOps[i]) && loop_bounds.size() == 3)
{
scf::ForOp scf_for_op = llvm::dyn_cast<scf::ForOp>(listOps[i]);
scf::ForOp nested_scf_for_op = builder.create<scf::ForOp>(loc, loop_bounds[0], loop_bounds[1], loop_bounds[2]);

auto insertPt_nested = builder.saveInsertionPoint();
builder.setInsertionPointToStart(nested_scf_for_op.getBody());

Block *B = scf_for_op.getBody();
Value nested_rhs_op, nested_lhs_op, nested_compute_op, nested_indices_op;
Value nested_transpose_op;
comet_debug() << "going to replicate ops inside nested loop...\n";
for (Operation &op_for : *B)
{
replicateOpsForLoopBody (loc, builder, &op_for, nested_rhs_op, nested_lhs_op, nested_compute_op, nested_indices_op, nested_transpose_op);
}
// need to restore the insertion point to the previous point
builder.restoreInsertionPoint(insertPt_nested);
builder.setInsertionPoint(op_end); // TODO: need to re-visit this for nested loops.
}
}

// remove the old ops now that they have been cloned inside the loop body.
// erase in reverse order so that users are erased before their producers.
comet_debug() << "Removing ops that have been cloned\n";
for (unsigned int i = 0; i < listOps.size(); i++)
{
comet_pdump(listOps[listOps.size()-i-1]);
listOps[listOps.size()-i-1]->erase();
}

// need to restore the insertion point to the previous point
builder.restoreInsertionPoint(insertPt);
builder.setInsertionPoint(op_end);

// remove ForLoopBeginOp and ForLoopEndOp
op->erase();
op_end->erase();

comet_debug() << "PCToLoopsLowering end\n";
}

void PCToLoopsLoweringPass::runOnFunction()
{
comet_debug() << "start PCToLoopsLoweringPass\n";

auto function = getFunction();

std::vector<tensorAlgebra::ForLoopBeginOp> startOps;
std::vector<tensorAlgebra::ForLoopEndOp> endOps;

// collect all the loop begin/end markers into vectors, e.g.,
// for-start1 ():
// for-start2 ():
// do_work2();
// end2
// do_work1();
// end1

// vector: for-start1, for-start2
// vector: end2, end1
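// with this ordering, pairing startOps[size-1-i] with endOps[i] in the loop
// further below visits the innermost loop first: (for-start2, end2), then
// (for-start1, end1). (This pairing assumes properly nested loops; two
// sequential sibling loops would pair incorrectly, consistent with the
// nested-loop issues noted in the commit message.)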

for (Block &B : function.body())
{
for (Operation &op : B)
{
if (isa<tensorAlgebra::ForLoopBeginOp>(&op))
{
startOps.push_back(cast<tensorAlgebra::ForLoopBeginOp>(op));
}
if (isa<tensorAlgebra::ForLoopEndOp>(&op))
{
endOps.push_back(cast<tensorAlgebra::ForLoopEndOp>(op));
}
}
}

// if there are no for-loops, quit.
if (startOps.empty())
return;

// the two vectors must be the same size
assert(startOps.size() == endOps.size() && "the for-begins must match the ends");
std::vector<Operation *> opList;
for (unsigned int i = 0; i < startOps.size(); i++)
{
// start with the innermost loop and move outwards.
opList = ProcessLoopOps(startOps[startOps.size()-i-1], endOps[i]); // e.g., (for-start2, end2) first
PCToLoopsLowering(startOps[startOps.size()-i-1], endOps[i], opList);
opList.clear(); // clear for next round.
}
startOps.clear();
endOps.clear();

// debug
auto module = function.getOperation()->getParentOfType<ModuleOp>();
module->dump();

comet_debug() << "end PCToLoopsLoweringPass\n";
}

// Create a pass for lowering programming constructs
std::unique_ptr<Pass> mlir::tensorAlgebra::createPCToLoopsLoweringPass()
{
return std::make_unique<PCToLoopsLoweringPass>();
}
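
For reference, a minimal sketch (not part of this commit) of driving just this pass from C++. The helper name lowerLoopConstructs is illustrative, and the includes assume the FuncOp-nested PassManager API of the MLIR version this code targets:

#include "comet/Dialect/TensorAlgebra/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Runs only the PC-to-loops lowering over every function in `module`.
mlir::LogicalResult lowerLoopConstructs(mlir::MLIRContext &context, mlir::ModuleOp module)
{
  mlir::PassManager pm(&context);
  mlir::OpPassManager &optPM = pm.nest<mlir::FuncOp>();
  optPM.addPass(mlir::tensorAlgebra::createPCToLoopsLoweringPass());
  return pm.run(module); // returns failure() if the pass signals failure
}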
