From 113e82bce687d19f9ad5e5224f89ac49c677ebd3 Mon Sep 17 00:00:00 2001 From: Vassily Litvinov Date: Thu, 11 Jan 2024 22:24:12 -0800 Subject: [PATCH 1/4] Engin's context-phase-2 : 98f9c70465..eccaf0d7f8 Signed-off-by: Vassily Litvinov --- compiler/AST/primitive.cpp | 9 + compiler/include/driver.h | 1 + compiler/include/lowerLoopContexts.h | 2 + compiler/main/driver.cpp | 2 + compiler/optimizations/gpuTransforms.cpp | 13 +- compiler/passes/normalize.cpp | 40 + compiler/resolution/CMakeLists.txt | 1 + compiler/resolution/lowerIterators.cpp | 3 + compiler/resolution/lowerLoopContexts.cpp | 979 ++++++++++++++++++ frontend/include/chpl/uast/PragmaList.h | 2 + frontend/include/chpl/uast/prim-ops-list.h | 4 + frontend/lib/resolution/prims.cpp | 3 + modules/internal/ChapelBase.chpl | 13 + modules/internal/DefaultRectangular.chpl | 18 +- modules/standard/Collectives.chpl | 6 + modules/standard/GPU.chpl | 6 + runtime/include/chpl-gen-includes.h | 4 + runtime/include/gpu/chpl-gpu-gen-common.h | 3 - .../engin/context/ChapelContextSupport.chpl | 29 + .../engin/context/ChapelContextSupport.notest | 0 test/users/engin/context/Iterators.chpl | 309 ++++++ test/users/engin/context/Iterators.notest | 0 test/users/engin/context/basic2D.chpl | 23 + test/users/engin/context/basic2D.execenv | 1 + test/users/engin/context/basic2D.good | 64 ++ test/users/engin/context/basic2D.lm-gpu.good | 65 ++ test/users/engin/context/basic2D.numlocales | 1 + test/users/engin/context/basic2D.prediff | 4 + test/users/engin/context/basic2D.skipif | 4 + .../context/basicArrayHoist-hoisted.good | 1 + .../basicArrayHoist-hoisted.lm-gpu.good | 1 + .../basicArrayHoist-link-for-memleaks.chpl | 1 + ...basicArrayHoist-link-for-memleaks.execopts | 1 + .../basicArrayHoist-link-for-memleaks.good | 20 + ...icArrayHoist-link-for-memleaks.lm-gpu.good | 21 + ...sicArrayHoist-link-for-memleaks.numlocales | 1 + .../context/basicArrayHoist-not-hoisted.good | 1 + test/users/engin/context/basicArrayHoist.chpl | 25 + .../engin/context/basicArrayHoist.compopts | 2 + .../engin/context/basicArrayHoist.execopts | 1 + .../engin/context/basicArrayHoist.numlocales | 1 + .../engin/context/basicArrayHoist.prediff | 4 + test/users/engin/context/basicBarrier.chpl | 42 + test/users/engin/context/basicBarrier.good | 10 + .../engin/context/basicBarrier.lm-gpu.good | 11 + .../engin/context/basicBarrier.numlocales | 1 + test/users/engin/context/basicBarrier.prediff | 4 + test/users/engin/context/basicTaskIds.chpl | 23 + .../users/engin/context/basicTaskIds.execopts | 1 + test/users/engin/context/basicTaskIds.good | 40 + .../engin/context/basicTaskIds.lm-gpu.good | 41 + .../engin/context/basicTaskIds.numlocales | 1 + test/users/engin/context/basicTaskIds.prediff | 4 + test/users/engin/context/gpuSharedMem.chpl | 59 ++ test/users/engin/context/gpuSharedMem.good | 1 + test/users/engin/context/gpuSharedMem.prediff | 8 + test/users/engin/context/gpuSharedMem.skipif | 1 + test/users/engin/context/transpose.chpl | 85 ++ test/users/engin/context/transpose.compopts | 1 + test/users/engin/context/transpose.execopts | 1 + test/users/engin/context/transpose.good | 6 + .../users/engin/context/transpose.lm-gpu.good | 4 + test/users/engin/context/transpose.numlocales | 1 + util/chpl-completion.bash | 1 + 64 files changed, 2026 insertions(+), 9 deletions(-) create mode 100644 compiler/include/lowerLoopContexts.h create mode 100644 compiler/resolution/lowerLoopContexts.cpp create mode 100644 test/users/engin/context/ChapelContextSupport.chpl create mode 100644 test/users/engin/context/ChapelContextSupport.notest create mode 100644 test/users/engin/context/Iterators.chpl create mode 100644 test/users/engin/context/Iterators.notest create mode 100644 test/users/engin/context/basic2D.chpl create mode 100644 test/users/engin/context/basic2D.execenv create mode 100644 test/users/engin/context/basic2D.good create mode 100644 test/users/engin/context/basic2D.lm-gpu.good create mode 100644 test/users/engin/context/basic2D.numlocales create mode 100755 test/users/engin/context/basic2D.prediff create mode 100644 test/users/engin/context/basic2D.skipif create mode 100644 test/users/engin/context/basicArrayHoist-hoisted.good create mode 100644 test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good create mode 120000 test/users/engin/context/basicArrayHoist-link-for-memleaks.chpl create mode 100644 test/users/engin/context/basicArrayHoist-link-for-memleaks.execopts create mode 100644 test/users/engin/context/basicArrayHoist-link-for-memleaks.good create mode 100644 test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good create mode 120000 test/users/engin/context/basicArrayHoist-link-for-memleaks.numlocales create mode 100644 test/users/engin/context/basicArrayHoist-not-hoisted.good create mode 100644 test/users/engin/context/basicArrayHoist.chpl create mode 100644 test/users/engin/context/basicArrayHoist.compopts create mode 100644 test/users/engin/context/basicArrayHoist.execopts create mode 100644 test/users/engin/context/basicArrayHoist.numlocales create mode 100755 test/users/engin/context/basicArrayHoist.prediff create mode 100644 test/users/engin/context/basicBarrier.chpl create mode 100644 test/users/engin/context/basicBarrier.good create mode 100644 test/users/engin/context/basicBarrier.lm-gpu.good create mode 100644 test/users/engin/context/basicBarrier.numlocales create mode 100755 test/users/engin/context/basicBarrier.prediff create mode 100644 test/users/engin/context/basicTaskIds.chpl create mode 100644 test/users/engin/context/basicTaskIds.execopts create mode 100644 test/users/engin/context/basicTaskIds.good create mode 100644 test/users/engin/context/basicTaskIds.lm-gpu.good create mode 100644 test/users/engin/context/basicTaskIds.numlocales create mode 100755 test/users/engin/context/basicTaskIds.prediff create mode 100644 test/users/engin/context/gpuSharedMem.chpl create mode 100644 test/users/engin/context/gpuSharedMem.good create mode 100755 test/users/engin/context/gpuSharedMem.prediff create mode 100644 test/users/engin/context/gpuSharedMem.skipif create mode 100644 test/users/engin/context/transpose.chpl create mode 100644 test/users/engin/context/transpose.compopts create mode 100644 test/users/engin/context/transpose.execopts create mode 100644 test/users/engin/context/transpose.good create mode 100644 test/users/engin/context/transpose.lm-gpu.good create mode 100644 test/users/engin/context/transpose.numlocales diff --git a/compiler/AST/primitive.cpp b/compiler/AST/primitive.cpp index dc02c5964859..d4198d12557c 100644 --- a/compiler/AST/primitive.cpp +++ b/compiler/AST/primitive.cpp @@ -175,6 +175,11 @@ returnInfoFirst(CallExpr* call) { return call->get(1)->qualType(); } +static QualifiedType +returnInfoFirstAsValue(CallExpr* call) { + return QualifiedType(Qualifier::QUAL_CONST_VAL, call->get(1)->qualType().type()); +} + static QualifiedType returnInfoFirstDeref(CallExpr* call) { QualifiedType tmp = call->get(1)->qualType(); @@ -711,6 +716,10 @@ initPrimitive() { // use for any primitives not in this list primitives[PRIM_UNKNOWN] = NULL; + prim_def(PRIM_INNERMOST_CONTEXT, "innermost context", returnInfoFirstAsValue); + prim_def(PRIM_OUTER_CONTEXT, "outer context", returnInfoFirst); + prim_def(PRIM_HOIST_TO_CONTEXT, "hoist to context", returnInfoVoid); + prim_def(PRIM_ACTUALS_LIST, "actuals list", returnInfoVoid); prim_def(PRIM_NOOP, "noop", returnInfoVoid); // dst, src. PRIM_MOVE can set a reference. diff --git a/compiler/include/driver.h b/compiler/include/driver.h index d2f0e4da1246..e6f27def681d 100644 --- a/compiler/include/driver.h +++ b/compiler/include/driver.h @@ -253,6 +253,7 @@ extern bool fReportOptimizedOn; extern bool fReportPromotion; extern bool fReportScalarReplace; extern bool fReportGpu; +extern bool fReportContextAdj; extern bool fReportDeadBlocks; extern bool fReportDeadModules; extern bool fReportGpuTransformTime; diff --git a/compiler/include/lowerLoopContexts.h b/compiler/include/lowerLoopContexts.h new file mode 100644 index 000000000000..bed653b2aa08 --- /dev/null +++ b/compiler/include/lowerLoopContexts.h @@ -0,0 +1,2 @@ + +void lowerContexts(); diff --git a/compiler/main/driver.cpp b/compiler/main/driver.cpp index 23ba3e67cb94..ee91f8a3a9f5 100644 --- a/compiler/main/driver.cpp +++ b/compiler/main/driver.cpp @@ -295,6 +295,7 @@ bool fReportOptimizeForallUnordered = false; bool fReportPromotion = false; bool fReportScalarReplace = false; bool fReportGpu = false; +bool fReportContextAdj = false; bool fReportDeadBlocks = false; bool fReportDeadModules = false; bool fReportGpuTransformTime = false; @@ -1470,6 +1471,7 @@ static ArgumentDescription arg_desc[] = { {"report-promotion", ' ', NULL, "Print information about scalar promotion", "F", &fReportPromotion, NULL, NULL}, {"report-scalar-replace", ' ', NULL, "Print scalar replacement stats", "F", &fReportScalarReplace, NULL, NULL}, {"report-gpu", ' ', NULL, "Print information about what loops are and are not GPU eligible", "F", &fReportGpu, NULL, NULL}, + {"report-context-adjustments", ' ', NULL, "Print debugging information while handling contexts", "F", &fReportContextAdj, NULL, NULL}, {"", ' ', NULL, "Developer Flags -- Miscellaneous", NULL, NULL, NULL, NULL}, {"allow-noinit-array-not-pod", ' ', NULL, "Allow noinit for arrays of records", "N", &fAllowNoinitArrayNotPod, "CHPL_BREAK_ON_CODEGEN", NULL}, diff --git a/compiler/optimizations/gpuTransforms.cpp b/compiler/optimizations/gpuTransforms.cpp index 0f3daf2b715e..d95f4618a175 100644 --- a/compiler/optimizations/gpuTransforms.cpp +++ b/compiler/optimizations/gpuTransforms.cpp @@ -616,6 +616,7 @@ GpuizableLoop::GpuizableLoop(BlockStmt *blk) { INT_ASSERT(blk->getFunction()); this->loop_ = toCForLoop(blk); + this->parentFn_ = toFnSymbol(blk->getFunction()); this->assertionReporter_.noteGpuizableAssertion(findCompileTimeGpuAssertions()); this->isEligible_ = evaluateLoop(); @@ -1024,6 +1025,7 @@ class GpuKernel { static bool isCallToPrimitiveWeShouldNotCopyIntoKernel(CallExpr *call); void populateBody(FnSymbol *outlinedFunction); void normalizeOutlinedFunction(); + void setLateGpuizationFailure(bool flag); void finalize(); void generateIndexComputation(); @@ -1306,7 +1308,7 @@ void GpuKernel::populateBody(FnSymbol *outlinedFunction) { addKernelArgument(sym); } else { - INT_FATAL("Malformed PRIM_GET_MEMBER_*"); + this->setLateGpuizationFailure(true); } } else if (parent->isPrimitive()) { @@ -1322,7 +1324,7 @@ void GpuKernel::populateBody(FnSymbol *outlinedFunction) { } } else { - INT_FATAL("Unexpected call expression"); + this->setLateGpuizationFailure(true); } } else if (CondStmt* cond = toCondStmt(symExpr->parentExpr)) { // Parent is a conditional statement. @@ -1330,7 +1332,7 @@ void GpuKernel::populateBody(FnSymbol *outlinedFunction) { addKernelArgument(sym); } } else { - INT_FATAL("Unexpected symbol expression"); + this->setLateGpuizationFailure(true); } } } @@ -1344,6 +1346,9 @@ void GpuKernel::populateBody(FnSymbol *outlinedFunction) { update_symbols(outlinedFunction->body, ©Map_); } +void GpuKernel::setLateGpuizationFailure(bool flag) { + this->lateGpuizationFailure_ = flag; +} void GpuKernel::normalizeOutlinedFunction() { normalize(fn_); @@ -1355,7 +1360,7 @@ void GpuKernel::normalizeOutlinedFunction() { collectDefExprs(fn_, defExprsInBody); for_vector (DefExpr, def, defExprsInBody) { if(def->sym->type == dtUnknown) { - this->lateGpuizationFailure_ = true; + this->setLateGpuizationFailure(true); } } diff --git a/compiler/passes/normalize.cpp b/compiler/passes/normalize.cpp index 18e015190355..72ef7cd9ffab 100644 --- a/compiler/passes/normalize.cpp +++ b/compiler/passes/normalize.cpp @@ -130,6 +130,44 @@ static TypeSymbol* expandTypeAlias(SymExpr* se); * * ************************************** | *************************************/ +static void earlyGpuTransforms() { + forv_expanding_Vec(CallExpr, call, gCallExprs) { + if (!call->isPrimitive(PRIM_HOIST_TO_CONTEXT)) continue; + + // The particular definition we expect is a default-init c_array, which is: + // + // unknown myArray; + // unknown call_tmp; + // call_tmp = c_array(t, k); + // __primitive("default init var", myArray, call_tmp); + + auto hoistDefExpr = toSymExpr(call->get(2))->symbol()->defPoint; + if (!isDefExpr(hoistDefExpr->next)) continue; + auto typeDefExpr = toDefExpr(hoistDefExpr->next); + if (!isCallExpr(typeDefExpr->next)) continue; + auto typeAssign = toCallExpr(typeDefExpr->next); + if (!typeAssign->isPrimitive(PRIM_MOVE) || + !isCallExpr(typeAssign->get(2))) continue; + auto typeCall = toCallExpr(typeAssign->get(2)); + if (!isCallExpr(typeAssign->next)) continue; + auto initCall = toCallExpr(typeAssign->next); + if (!initCall->isPrimitive(PRIM_DEFAULT_INIT_VAR)) continue; + + auto typeConstructor = toSymExpr(typeCall->baseExpr); + if (!typeConstructor) continue; + if (typeConstructor->symbol()->name != astr("c_array")) continue; + + SET_LINENO(hoistDefExpr); + auto newBlock = new BlockStmt(); + auto newArr = new VarSymbol(astr("shared_", hoistDefExpr->sym->name)); + newArr->qual = Qualifier::QUAL_REF; + newBlock->insertAtTail(new DefExpr(newArr)); + newBlock->insertAtTail(new CallExpr(PRIM_MOVE, new SymExpr(newArr), new CallExpr("createSharedCArray", new SymExpr(typeDefExpr->sym)))); + initCall->insertAfter(newBlock); + } +} + + void normalize() { insertModuleInit(); @@ -264,6 +302,8 @@ void normalize() { } } + earlyGpuTransforms(); + find_printModuleInit_stuff(); } diff --git a/compiler/resolution/CMakeLists.txt b/compiler/resolution/CMakeLists.txt index 1a77735f204e..25eec29db045 100644 --- a/compiler/resolution/CMakeLists.txt +++ b/compiler/resolution/CMakeLists.txt @@ -35,6 +35,7 @@ set(SRCS loopDetails.cpp lowerForalls.cpp lowerIterators.cpp + lowerLoopContexts.cpp nilChecking.cpp postFold.cpp preFold.cpp diff --git a/compiler/resolution/lowerIterators.cpp b/compiler/resolution/lowerIterators.cpp index 17e3e24565f9..d1a042aad426 100644 --- a/compiler/resolution/lowerIterators.cpp +++ b/compiler/resolution/lowerIterators.cpp @@ -27,6 +27,7 @@ #include "ForallStmt.h" #include "ForLoop.h" #include "iterator.h" +#include "lowerLoopContexts.h" #include "optimizations.h" #include "passes.h" #include "resolution.h" @@ -3198,6 +3199,8 @@ void lowerIterators() { reconstructIRautoCopyAutoDestroy(); + lowerContexts(); + cleanupTemporaryVectors(); cleanupIteratorBreakToken(); cleanupPrimIRFieldValByFormal(); diff --git a/compiler/resolution/lowerLoopContexts.cpp b/compiler/resolution/lowerLoopContexts.cpp new file mode 100644 index 000000000000..5406bcbd690a --- /dev/null +++ b/compiler/resolution/lowerLoopContexts.cpp @@ -0,0 +1,979 @@ +#include + +// probably too much: +#include "astutil.h" +#include "AstVisitorTraverse.h" +#include "CForLoop.h" +#include "driver.h" +#include "errorHandling.h" +#include "expr.h" +#include "ForallStmt.h" +#include "ForLoop.h" +#include "iterator.h" +#include "lowerLoopContexts.h" +#include "optimizations.h" +#include "passes.h" +#include "resolution.h" +#include "resolveIntents.h" +#include "stlUtil.h" +#include "stmt.h" +#include "stringutil.h" +#include "symbol.h" +#include "view.h" +#include "wellknown.h" + +#include "global-ast-vecs.h" + +static bool isInUserCode(BaseAST* node) { + if (ModuleSymbol* mod = node->getModule()) { + return mod->modTag == MOD_USER; + } + return false; +} + +static bool fnCanHaveContext(FnSymbol* fn) { + // TODO make sure it is not cobegin ? + return (fn->hasFlag(FLAG_COBEGIN_OR_COFORALL) || + fn->hasFlag(FLAG_ON)) && + fn->singleInvocation() != NULL; + +} + +static void CONTEXT_DEBUG(int indent, std::string msg, BaseAST* node) { + if (fReportContextAdj) { + for(int i=0 ; iid << "] " << msg << std::endl; + } +} + +static int findFormalIndex(FnSymbol* fn, ArgSymbol* arg) { + int ret = -1; + + int i = 1; + for_formals (formal, fn) { + if (formal == arg) { + ret = i; + } + i++; + } + + return ret; +} + + +class Context { + public: + Symbol* localHandle_ = NULL; + + // TODO we can probably populate the bottom two together + std::vector localHandleAutoDestroys_; + Expr* endOfLocalHandleSetup_ = NULL; + Expr* upEndCount_ = NULL; + + // this'll need to be differentiated between LoopContext and IteratorContext + // when we have a proper syntax. The current implementation is more suitable + // for IteratorContext + bool findLoopContextHandle() { + const int debugDepth = 1; + std::vector defExprs; + collectDefExprs(this->node(), defExprs); + + for_vector (DefExpr, def, defExprs) { + //CONTEXT_DEBUG(debugDepth+1, "looking at DefExpr", def); + + if (defExprIsLocalHandle(def)) { + if (localHandle_ == NULL) { + CONTEXT_DEBUG(debugDepth+2, "found a context handle", def->sym); + localHandle_ = def->sym; + } + else { + // For now, ignore other context handles. Slight challenge: the innermost + // coforall_fn also contains the loop in question. So when trying to + // find the coforall_fn's context handles, we should avoid looking at + // loop's. + + //CONTEXT_DEBUG(debugDepth+2, "found another context handle?", def->sym); + //INT_FATAL("found another context handle?"); + } + } + } + + return localHandle_ != NULL; + } + + void collectLocalHandleAutoDestroys() { + std::vector calls; + collectCallExprs(this->node(), calls); + + for_vector(CallExpr, call, calls) { + if (FnSymbol* fn = call->resolvedFunction()) { + if (fn->hasFlag(FLAG_AUTO_DESTROY_FN)) { + if (toSymExpr(call->get(1))->symbol() == localHandle_) { + this->localHandleAutoDestroys_.push_back(call); + } + } + } + } + } + + std::vector& getLocalHandleAutoDestroys() { + if (localHandleAutoDestroys_.size() == 0) { + collectLocalHandleAutoDestroys(); + } + return localHandleAutoDestroys_; + } + + Expr* getEndOfLocalHandleSetup() { + if (endOfLocalHandleSetup_ == NULL) { + // probably can iterate over the body + std::vector calls; + collectCallExprs(this->node(), calls); + + for_vector (CallExpr, call, calls) { + if (call->isPrimitive(PRIM_MOVE)) { + if (Symbol* lhs = toSymExpr(call->get(1))->symbol()) { + if (lhs == localHandle_) { + endOfLocalHandleSetup_ = call; + break; + } + } + } + } + } + + return endOfLocalHandleSetup_; + } + + Expr* getUpEndCount() { + if (upEndCount_ == NULL) { + std::vector calls; + collectCallExprs(this->node(), calls); + + for_vector (CallExpr, call, calls) { + if (call->isNamed("_upEndCount")) { + upEndCount_ = call; + break; + } + } + } + return upEndCount_; + } + + virtual BaseAST* node() = 0; + + bool defExprIsLocalHandle(DefExpr* def) { + if (isArgSymbol(def->sym)) return false; + + return !def->sym->hasFlag(FLAG_TEMP) && + !isLabelSymbol(def->sym) && + !def->sym->hasFlag(FLAG_INDEX_VAR) && // avoid re-finding loop's + !def->sym->hasFlag(FLAG_EPILOGUE_LABEL) && // same as !isLabelSymbol? + def->sym->getValType()->symbol->hasFlag(FLAG_CONTEXT_TYPE); + } + +}; + +class LoopContext: public Context { + public: + CForLoop* loop_; + LoopContext(CForLoop* loop): loop_(loop) {} + + BaseAST* node() override { return loop_; }; + + + +}; + +class CoforallOnContext; +class VectorizedLoopContext; + +class IteratorContext: public Context { + protected: + enum class Kind { + CoforallOn, + VectorizedLoop, + }; + + /** Entry-point information for the context inside this one. */ + struct { + /** + If the inner context is a coforall loop, it's represented as a call to + a coforall_fn inside of a plain for-loop. This represents the for-loop. + */ + CForLoop* innerLoop_ = NULL; + + /** + If the inner context is a coforall loop or an on statement, it is represented + using a call to some function, either coforall_fn or on_fn. This represents + the call to that function. + */ + CallExpr* callToInner_ = NULL; + + /** + If inner context is the initial serial for-loop that triggered the context + lowering, this represents that loop. + */ + CForLoop* innermostLoop_ = NULL; + }; + + /** What kind of context this is, for casting. */ + Kind kind_; + + IteratorContext(Kind kind) : kind_(kind) {} + + public: + void setInnerLoop(CForLoop* loop) { innerLoop_ = loop; } + void setCallToInner(CallExpr* expr) { callToInner_ = expr; } + void setInnermostLooop(CForLoop* loop) { innermostLoop_ = loop; } + + CoforallOnContext* toCoforallOnContext() { + if (kind_ == Kind::CoforallOn) return (CoforallOnContext*) this; + return nullptr; + } + + VectorizedLoopContext* toVectorizedLoopContext() { + if (kind_ == Kind::VectorizedLoop) return (VectorizedLoopContext*) this; + return nullptr; + } + + /** + Determine the point right before the inner context, which hoisting should + use as an anchor. + */ + Expr* getInsertBeforeCallToInnerAnchor() const { + if (innerLoop_) { + return innerLoop_; + } else if (callToInner_) { + return callToInner_; + } else { + return innermostLoop_; + } + } + + /** + If a new symbol has been introduced outside, and if the inner context + is accessed via a call, add a new actual to the call to make that symbol + available there. This also modifies the function being called to + introduce the formal. + + Returns the symbol made available within the inner context. + */ + Symbol* insertActualForOuterSymbol(Symbol* sym) { + if (callToInner_) { + callToInner_->insertAtTail(new SymExpr(sym)); + // TODO this is probably the right intent for now. But maybe we want + // `const ref`? + auto formal = new ArgSymbol(INTENT_CONST_IN, sym->name, sym->getValType()); + callToInner_->resolvedFunction()->insertFormalAtTail(formal); + return formal; + } + // TODO: what do here + return nullptr; + } + + /** + If a symbol is being hoisted past the inner context, and the context is + accessed via a call, adds a new actual to the call to make the symbol available + inside. This also modifies the function being called to introduce the formal. + + Returns the symbol made available within the inner context. + */ + Symbol* insertActualForHoistedSymbol(Symbol* outerSym, const char* name, Type* type) { + if (CallExpr* toAdjust = callToInner_) { + toAdjust->insertAtTail(new SymExpr(outerSym)); + + auto formal = new ArgSymbol(INTENT_REF, name, type); + toAdjust->resolvedFunction()->insertFormalAtTail(formal); + return formal; + } + return outerSym; + } + public: + virtual ~IteratorContext() = default; + + virtual void dump(int depth) const = 0; + virtual void recomputeActualSymbol(int& actualIdx, Symbol*& actual) const = 0; +}; + +class CoforallOnContext : public IteratorContext { + private: + FnSymbol* fn_; + public: + CoforallOnContext(FnSymbol* fn) : + IteratorContext(Kind::CoforallOn), fn_(fn) {} + + BaseAST* node() override { return fn_; }; + + void dump(int depth) const override { + std::string msg = ""; + if (fn_->hasFlag(FLAG_ON)) { + msg += "on function with handle "; + } + else if (fn_->hasFlag(FLAG_COBEGIN_OR_COFORALL)) { + msg += "coforall function with handle "; + } + msg += "[" + std::to_string(localHandle_->id) + "]"; + CONTEXT_DEBUG(depth, msg, fn_); + } + + void recomputeActualSymbol(int& actualIdx, Symbol*& actual) const override { + auto callToFn = fn_->singleInvocation(); + // what is the symbol that was passed to this function? + actual = toSymExpr(callToFn->get(actualIdx))->symbol(); + + // is it also an argument here? + if (ArgSymbol* curFormal = toArgSymbol(actual)) { + auto parentFn = toFnSymbol(callToFn->parentSymbol); + INT_ASSERT(parentFn); + int formalIdx = findFormalIndex(parentFn, curFormal); + if (formalIdx >= 0) { + // CONTEXT_DEBUG(debugDepth+2, + // "which is actually the formal at idx " + std::to_string(formalIdx), + // curFormal); + + actualIdx = formalIdx; // update to this function's formal idx + } else { + INT_FATAL("how come?"); + } + } else { + // CONTEXT_DEBUG(debugDepth+2, + // "which is this symbol", + // curActual); + + actualIdx = -1; // signal that this variable is local here + } + } +}; + +class VectorizedLoopContext : public IteratorContext { + private: + CForLoop* loop_; + + public: + VectorizedLoopContext(CForLoop* loop) : + IteratorContext(Kind::VectorizedLoop), loop_(loop) {} + + BaseAST* node() override { return loop_; } + + void dump(int depth) const override { + std::string msg = "vectorized loop with handle "; + msg += "[" + std::to_string(localHandle_->id) + "]"; + CONTEXT_DEBUG(depth, msg, loop_); + } + + void recomputeActualSymbol(int& actualIdx, Symbol*& actual) const override { + // Do nothing, because vectorized loops aren't implemented as functions. + } +}; + +using IterContextPtr = std::unique_ptr; + +class ContextHandler { + public: + LoopContext loopCtx_; + std::vector contextStack_; + + // map between any handle used within user's loop body and the indices to + // contexts within contextStack + std::map handleMap_; + + ContextHandler(CForLoop* loop): loopCtx_(loop) { + const int debugDepth = 1; + + + CONTEXT_DEBUG(debugDepth, "looking for the handle", loop); + if (loopCtx_.findLoopContextHandle()) { + CONTEXT_DEBUG(debugDepth+1, "found loop context handle", loopCtx_.localHandle_); + + CONTEXT_DEBUG(debugDepth, "collecting context chain", loop); + + if (this->collectOuterContexts()) { } + else { + CONTEXT_DEBUG(debugDepth, "couldn't find any context chain", loop); + } + } + else { + CONTEXT_DEBUG(debugDepth+1, "couldn't find context handle", loop); + } + } + + CForLoop* findInnermostLoop(Expr* curParent) { + // TODO: this will find loops that aren't coforall lowering. + // how to fix? + + while (curParent) { + if (CForLoop* loop = toCForLoop(curParent)) { + return loop; + } + curParent = curParent->parentExpr; + } + return nullptr; + } + + bool collectOuterContexts() { + const int debugDepth = 2; + + // If the current context was implemented using a plain loop, that loop. + CForLoop* plainLoop = this->loop(); + // If the current context was implemented using an fn, the call to that fn. + CallExpr* callToCurCtx = NULL; + + Expr* cur = plainLoop; + do { + if (cur->parentExpr) { + if (CForLoop* cfl = toCForLoop(cur)) { + if (cfl->isOrderIndependent()) { + // Found a vectorization context. + CONTEXT_DEBUG(debugDepth, "found a candidate vectorized loop", cfl); + + auto outerCtx = std::make_unique(cfl); + outerCtx->setInnermostLooop(plainLoop); + outerCtx->setCallToInner(callToCurCtx); + outerCtx->setInnerLoop(findInnermostLoop(callToCurCtx)); + + if (!outerCtx->findLoopContextHandle()) break; + + contextStack_.push_back(std::move(outerCtx)); + cur = cur->parentExpr; + callToCurCtx = NULL; // vectorized loops aren't impl'ed as fns. + plainLoop = cfl; + + continue; + } + } + + // Not any expression we currently care about; keep looking upwards. + cur = cur->parentExpr; + continue; + } else if (auto parentFn = toFnSymbol(cur->parentSymbol)) { + // Functions that aren't coforall or on functions don't fit the + // pattern. Stop building chain. + if (!fnCanHaveContext(parentFn)) break; + + // Found a coforall/on context. + CONTEXT_DEBUG(debugDepth, "found a candidate parent fn", parentFn); + + auto outerCtx = std::make_unique(parentFn); + outerCtx->setInnermostLooop(plainLoop); + outerCtx->setCallToInner(callToCurCtx); + outerCtx->setInnerLoop(findInnermostLoop(callToCurCtx)); + + if (!outerCtx->findLoopContextHandle()) break; + + contextStack_.push_back(std::move(outerCtx)); + cur = callToCurCtx = parentFn->singleInvocation(); + plainLoop = NULL; + } else { + // Nothing fits the pattern, done searching. + break; + } + } while (true); + + return contextStack_.size() > 0; + } + + void dumpOuterContexts() { + int debugDepth = 3; + CONTEXT_DEBUG(debugDepth, "found the following context chain:", this->loop()); + for (auto& i : contextStack_) { + i->dump(++debugDepth); + } + } + + CForLoop* loop() { return toCForLoop(this->loopCtx_.loop_); } + Symbol* loopHandle() { return this->loopCtx_.localHandle_; } + + void removeHoistToContextCall(CallExpr* call) { + call->remove(); + } + + void removeOuterContextCallAndInitShadowHandle(CallExpr* call, + Symbol* symToSet) { + INT_ASSERT(symToSet); + + if (CallExpr* parent = toCallExpr(call->parentExpr)) { + // TODO this may be a little too reckless. At least do some INT_ASSERTS + Symbol* lhs = toSymExpr(parent->get(1))->symbol(); + + parent->remove(); + std::vector symExprs; + collectSymExprsFor(loop(), lhs, symExprs); + + for_vector (SymExpr, symExpr, symExprs) { + if (CallExpr* call = toCallExpr(symExpr->parentExpr)) { + if (call->isNamed(astrInitEquals) || // do we need init=? + call->isPrimitive(PRIM_MOVE)) { + symExpr->replace(new SymExpr(symToSet)); + continue; + } + } + symExpr->parentExpr->remove(); + } + + lhs->defPoint->remove(); + } + call->remove(); + } + + Symbol* handleOuterContextCall(CallExpr* call) { + const int debugDepth = 4; + + Symbol* outerCtxHandle = NULL; + // + // TODO better/less fragile pattern matching? + CallExpr* parentCall = toCallExpr(call->parentExpr); + INT_ASSERT(parentCall); + INT_ASSERT(parentCall->isPrimitive(PRIM_MOVE)); + + SymExpr* lhsSe = toSymExpr(parentCall->get(1)); + Symbol* lhs = lhsSe->symbol(); + + DefExpr* nextDef = toDefExpr(parentCall->next); + INT_ASSERT(nextDef); + + outerCtxHandle = nextDef->sym; + if (call->numActuals() == 1) { + INT_ASSERT(outerCtxHandle->getValType() == + this->loopCtx_.localHandle_->getValType()); + } + else { + INT_ASSERT(call->numActuals() == 2); + INT_ASSERT(outerCtxHandle->getValType() == call->get(1)->getValType()); + } + + CallExpr* callAfterDef = toCallExpr(nextDef->next); + INT_ASSERT(callAfterDef); + INT_ASSERT(callAfterDef->isNamed(astrInitEquals) || + callAfterDef->isPrimitive(PRIM_MOVE)); + INT_ASSERT(toSymExpr(callAfterDef->get(1))->symbol() == outerCtxHandle); + INT_ASSERT(toSymExpr(callAfterDef->get(2))->symbol() == lhs); + + CONTEXT_DEBUG(debugDepth, "found an outer context handle", + outerCtxHandle); + INT_ASSERT(handleMap_.count(outerCtxHandle) == 0); + + int outerCtxIdx = -1; + Symbol* argToCall = toSymExpr(call->argList.last())->symbol(); + + if (argToCall == this->loopHandle()) { + // immediate outer context + outerCtxIdx = 0; + } + else { + // farther away context + int j=0; + for (auto i = contextStack_.begin(); i != contextStack_.end(); i++, j++) { + if (this->handleMap_[argToCall] == j) { + outerCtxIdx = j+1; + break; + } + } + } + + handleMap_[outerCtxHandle] = outerCtxIdx; + + CONTEXT_DEBUG(debugDepth+1, + "mapped to ["+std::to_string(contextStack_[outerCtxIdx]->localHandle_->id)+"]", + outerCtxHandle); + + // TODO refactor this into a common helper + SET_LINENO(nextDef); + int curAdjustmentIdx = outerCtxIdx; + Symbol* handle = contextStack_[curAdjustmentIdx]->localHandle_; + + while (auto newHandle = contextStack_[curAdjustmentIdx]->insertActualForOuterSymbol(handle)) { + curAdjustmentIdx--; + handle = newHandle; + } + + removeOuterContextCallAndInitShadowHandle(call, handle); + + return outerCtxHandle; + } + + enum class HoistingKind { + Array, CArray, Barrier, Other + }; + + void handleHoistToCoforallOnContextCall(CallExpr* call, + Symbol* toHoist, + CoforallOnContext* context, + Symbol* usedHandle, + Symbol* targetHandle, + int targetCtxIdx, + HoistingKind kind) { + const int debugDepth = 3; + CONTEXT_DEBUG(debugDepth, "will hoist to ["+std::to_string(context->localHandle_->id)+"]", + call); + + if (kind == HoistingKind::CArray) { + // The early GPU transformations would've added a block here with + // additional code to support CArray hoisting to GPU. But it's not + // being hoisted to the GPU, so remove that block. + auto block = toBlockStmt(call->prev); + INT_ASSERT(block && "Expected a generated block before the hoist"); + CONTEXT_DEBUG(debugDepth, "removing block ["+std::to_string(block->id)+"]", block); + block->remove(); + } + + std::vector& autoDestroyAnchors = + context->getLocalHandleAutoDestroys(); + + // TODO cache this stuff somewhere, but we remove some below. Is that a problem? + std::vector callsInLoop; + collectCallExprs(this->loop(), callsInLoop); + std::map> autoDestroysInLoop; + for_vector (CallExpr, call, callsInLoop) { + if (FnSymbol* fn = call->resolvedFunction()) { + if (fn->hasFlag(FLAG_AUTO_DESTROY_FN)) { + Symbol* sym = toSymExpr(call->get(1))->symbol(); + autoDestroysInLoop[sym].push_back(call); + } + } + } + + DefExpr* def = toHoist->defPoint; + Expr* defPrev = def->prev; + + // this is only needed for `if hoist then __primitive("hoist"....)` + // when that if param-folds, we leave the block in the AST. This is probably + // irrelevant when this feature is production-ready + if (BlockStmt* parentBlock = toBlockStmt(call->parentExpr)) { + if (parentBlock->length() == 1) { + parentBlock->flattenAndRemove(); + } + else if (parentBlock->prev == def) { + parentBlock->flattenAndRemove(); + } + } + + SET_LINENO(def); + + // if we are using local shadows of target contexts' handle, we'll need to + // update the symbol to use what's local in the target + SymbolMap handleUpdateMap; + handleUpdateMap.put(usedHandle, targetHandle); + + CallExpr* mulCall = NULL; // only meaningful if isBarrier + Expr* cur = call->prev; + FnSymbol* parentFn = toFnSymbol(call->parentSymbol); + INT_ASSERT(parentFn); + + std::map outerActuals; + + while (cur != defPrev) { + CONTEXT_DEBUG(debugDepth+1, "hoisting", cur); + + // if we cross any defExpr, hoist/remove its autoDestroys + if (DefExpr* defExpr = toDefExpr(cur)) { + Symbol* sym = defExpr->sym; + std::vector& symsAutoDestroys = autoDestroysInLoop[sym]; + + if (symsAutoDestroys.size() > 0) { + // put auto destroys in all the right places + for_vector (CallExpr, anchor, autoDestroyAnchors) { + CONTEXT_DEBUG(debugDepth+2, "inserting new autoDestroy before", + anchor); + + anchor->insertBefore(symsAutoDestroys[0]->copy()); + } + // remove all the existing ones + for_vector (CallExpr, autoDestroy, symsAutoDestroys) { + CONTEXT_DEBUG(debugDepth+2, "removing autoDestroy", autoDestroy); + + autoDestroy->remove(); + } + } + } + + // replace shadow handles with actual local handles + update_symbols(cur, &handleUpdateMap); + + // keep track of indices of ArgSymbols here that we are hoisting. They'll + // need to be updated in that given context. Or we need to error if the + // user hoisted things way too far. + std::vector symExprsInCur; // why can't I just collect symbols? + collectSymExprs(cur, symExprsInCur); + for_vector (SymExpr, symExpr, symExprsInCur) { + Symbol* sym = symExpr->symbol(); + + if (ArgSymbol* argSym = toArgSymbol(sym)) { + if (outerActuals.count(argSym) == 1) continue; + + int formalIdx = findFormalIndex(parentFn, argSym); + if (formalIdx >= 0) { + CONTEXT_DEBUG(debugDepth+1, + "found a formal at idx " + std::to_string(formalIdx), + argSym); + + outerActuals[argSym] = formalIdx; + } + else { + INT_FATAL("how come?"); + } + } + } + + // if we are hoisting a barrier, try to find its multiply call + if (kind == HoistingKind::Barrier) { + if (CallExpr* call = findMultiplyCallForBarrier(cur, toHoist)) { + CONTEXT_DEBUG(debugDepth+1, "found multiply call", call); + if (mulCall != NULL) { + CONTEXT_DEBUG(debugDepth+2, "WARNING: there was another one", mulCall); + } + mulCall = call; + } + } + + targetHandle->defPoint->insertAfter(cur->remove()); + + cur = call->prev; + } + + // we have some symbols declared elsewhere in the block that we are hoisting + // walk up the context stack to the target to find where they are + SymbolMap newContextUpdateMap; + for (auto& outerActualToIdx : outerActuals) { + int actualIdx = outerActualToIdx.second; + Symbol* curActual = NULL; + + for (int i = 0; i < targetCtxIdx ; i++) { + if (actualIdx == -1) { + // we are currently looking for a symbol that was defined in an inner + // context. IOW, the user wants to hoist a block that has a symbol, + // to a block where the symbol wasn't defined yet. This is a user + // error. + USR_FATAL("Attempt to hoist symbols from an inner context to an outer context"); + } + + contextStack_[i]->recomputeActualSymbol(actualIdx, curActual); + } + + newContextUpdateMap.put(outerActualToIdx.first, curActual); + } + + std::string hoistedName = "hoisted_" + std::string(toHoist->name); + + Symbol* refToSym = new VarSymbol(hoistedName.c_str(), toHoist->getRefType()); + DefExpr* refToSymDef = new DefExpr(refToSym); + CallExpr* setRef = new CallExpr(PRIM_MOVE, refToSym, new CallExpr(PRIM_ADDR_OF, toHoist)); + + context->getInsertBeforeCallToInnerAnchor()->insertBefore(refToSymDef); + context->getInsertBeforeCallToInnerAnchor()->insertBefore(setRef); + + + for (int curCtx = targetCtxIdx ; curCtx >= 0 ; curCtx--) { + auto& ctx = contextStack_[curCtx]; + + auto innerSym = ctx->insertActualForHoistedSymbol(refToSym, hoistedName.c_str(), toHoist->getRefType()); + + if (kind == HoistingKind::Barrier) { + if (Expr* upEndCount = ctx->getUpEndCount()) { + CONTEXT_DEBUG(debugDepth+1, "multiply block will be inserted after here", upEndCount); + Symbol* numTasks = toSymExpr(toCallExpr(upEndCount)->get(2))->symbol(); + + INT_ASSERT(mulCall); + CallExpr* newMulCall = mulCall->copy(); + newMulCall->get(1)->replace(new SymExpr(refToSym)); + newMulCall->get(2)->replace(new SymExpr(numTasks)); + + ctx->getInsertBeforeCallToInnerAnchor()->insertBefore(newMulCall); + } + } + + refToSym = innerSym; + } + + removeHoistToContextCall(call); + if (mulCall) { + mulCall->remove(); + } + + // we want to use the last added symbol after the loop to adjust the loop body + SymbolMap updateMap; + updateMap.put(toHoist, refToSym); + + update_symbols(loop(), &updateMap); + update_symbols(context->node(), &newContextUpdateMap); + } + + void handleHoistToVectorContextCall(CallExpr* call, + Symbol* toHoist, + VectorizedLoopContext* context, + HoistingKind kind) { + if (kind == HoistingKind::CArray) { + // The ref-ified version of the array will not need autodestroys; + // get rid of them. + std::vector callsInLoop; + collectCallExprs(this->loop(), callsInLoop); + for_vector (CallExpr, call, callsInLoop) { + if (FnSymbol* fn = call->resolvedFunction()) { + if (fn->hasFlag(FLAG_AUTO_DESTROY_FN) && + toSymExpr(call->get(1))->symbol() == toHoist) { + call->remove(); + } + } + } + // Remove the initializer. + call->prev->prev->remove(); + // Unpack the pre-generated initialization code for the shared array + auto replacementBlock = toBlockStmt(call->prev); + auto replacementDefExpr = toDefExpr(replacementBlock->getFirstExpr()); + replacementBlock->flattenAndRemove(); + // Replace the old, non-shared array with the new shared array. + SymbolMap newArrayUpdateMap; + newArrayUpdateMap.put(toHoist, replacementDefExpr->sym); + toHoist->defPoint->remove(); + update_symbols(loop(), &newArrayUpdateMap); + // Remove the call-to-context call. + call->remove(); + } else { + INT_FATAL("currently only array hoisting to vector contexts is supported"); + } + } + + void handleHoistToContextCall(CallExpr* call) { + const int debugDepth = 3; + + Symbol* handle = toSymExpr(call->get(1))->symbol(); + Symbol* sym = toSymExpr(call->get(2))->symbol(); + int targetCtxIdx = handleMap_[handle]; + auto& target = contextStack_[targetCtxIdx]; + Symbol* targetHandle = target->localHandle_; + + bool isBarrier = astr(sym->type->symbol->name) == astr("barrier"); + bool isArray = strncmp(sym->type->symbol->name, "_array(", sizeof("_array(") - 1) == 0; + bool isCArray = strncmp(sym->type->symbol->name, "c_array(", sizeof("c_array(") - 1) == 0; + HoistingKind kind; + if (isBarrier) { + kind = HoistingKind::Barrier; + CONTEXT_DEBUG(debugDepth, "this is a barrier", sym); + } else if (isArray) { + kind = HoistingKind::Array; + CONTEXT_DEBUG(debugDepth, "this is an array", sym); + } else if (isCArray) { + kind = HoistingKind::CArray; + CONTEXT_DEBUG(debugDepth, "this is a C array", sym); + } else { + kind = HoistingKind::Other; + CONTEXT_DEBUG(debugDepth, "this is something else", sym); + } + + if (auto coforallOnCtx = target->toCoforallOnContext()) { + handleHoistToCoforallOnContextCall(call, sym, coforallOnCtx, handle, + targetHandle, targetCtxIdx, kind); + } else if (auto vectorizedLoopCtx = target->toVectorizedLoopContext()) { + handleHoistToVectorContextCall(call, sym, vectorizedLoopCtx, kind); + } + } + + CallExpr* findMultiplyCallForBarrier(Expr* e, Symbol* barrierSym) { + if (BlockStmt* block = toBlockStmt(e)) { + for_alist(expr, block->body) { + if (CallExpr* found = findMultiplyCallForBarrier(expr, barrierSym)) { + return found; + } + } + } + else if (CallExpr* call = toCallExpr(e)) { + if (call->isNamed("multiply")) { + Symbol* callReceiver = toSymExpr(call->get(1))->symbol(); + if (callReceiver == barrierSym) { + return call; + } + } + } + return NULL; + } + + void handleContextUsesWithinLoopBody() { + std::vector handles; + handles.push_back(this->loopHandle()); + + std::vector::size_type curIdx = 0; + while (curIdx < handles.size()) { + const int debugDepth = 1; + std::vector handleUses; + + collectSymExprsFor(this->loop(), handles[curIdx], handleUses); + + for_vector (SymExpr, use, handleUses) { + if (CallExpr* call = toCallExpr(use->parentExpr)) { + CONTEXT_DEBUG(debugDepth+1, "found a call that uses handle", call); + + if (call->isPrimitive(PRIM_OUTER_CONTEXT)) { + CONTEXT_DEBUG(debugDepth+2, "PRIM_OUTER_CONTEXT", call); + if (Symbol* newCtx = handleOuterContextCall(call)) { + handles.push_back(newCtx); + } + } + else if (call->isPrimitive(PRIM_HOIST_TO_CONTEXT)) { + CONTEXT_DEBUG(debugDepth+2, "PRIM_HOIST_TO_CONTEXT", call); + + handleHoistToContextCall(call); + } + } + else { + CONTEXT_DEBUG(debugDepth, "illegal use of context handle", use); + INT_FATAL("illegal use of context handle"); + } + } + + curIdx++; + } + } + + void handleContextUsesWithinLoopBody(Symbol* handle) { + const int debugDepth = 1; + std::vector handleUses; + collectSymExprsFor(this->loop(), handle, handleUses); + + for_vector (SymExpr, use, handleUses) { + if (CallExpr* call = toCallExpr(use->parentExpr)) { + CONTEXT_DEBUG(debugDepth+1, "found a call that uses handle", call); + + if (call->isPrimitive(PRIM_OUTER_CONTEXT)) { + handleOuterContextCall(call); + } + else if (call->isPrimitive(PRIM_MOVE)) { + CONTEXT_DEBUG(debugDepth+2, "ignoring call", call); + } + else { + CONTEXT_DEBUG(debugDepth+2, "call is illegal", call); + INT_FATAL("call is illegal"); + } + } + else { + CONTEXT_DEBUG(debugDepth, "illegal use of context handle", use); + INT_FATAL("illegal use of context handle"); + } + } + } +}; + + +void lowerContexts() { + forv_Vec(FnSymbol*, fn, gFnSymbols) { + if (!isInUserCode(fn)) continue; + + // TODO do something lighter + std::vector asts; + collect_asts(fn, asts); + + for_vector(BaseAST, ast, asts) { + if (CForLoop* loop = toCForLoop(ast)) { + const int debugDepth = 0; + + CONTEXT_DEBUG(debugDepth, "encountered a C loop", loop); + + ContextHandler h(loop); + + h.dumpOuterContexts(); + + h.handleContextUsesWithinLoopBody(); + } + } + } +} diff --git a/frontend/include/chpl/uast/PragmaList.h b/frontend/include/chpl/uast/PragmaList.h index 4eed836d91e5..1c62746919e6 100644 --- a/frontend/include/chpl/uast/PragmaList.h +++ b/frontend/include/chpl/uast/PragmaList.h @@ -187,6 +187,8 @@ PRAGMA(DEFAULT_INTENT_IS_REF_MAYBE_CONST, ypr, PRAGMA(NO_PROMOTION_WHEN_BY_REF, ypr, "no promotion when by ref", ncm) +PRAGMA(CONTEXT_TYPE, ypr, "context type", ncm) + PRAGMA(COPY_INIT, npr, "copy initializer", ncm) PRAGMA(DEFAULT_INIT, npr, "default initializer", ncm) PRAGMA(DESTRUCTOR, npr, diff --git a/frontend/include/chpl/uast/prim-ops-list.h b/frontend/include/chpl/uast/prim-ops-list.h index b0bf8645c8ab..61806d7a6817 100644 --- a/frontend/include/chpl/uast/prim-ops-list.h +++ b/frontend/include/chpl/uast/prim-ops-list.h @@ -39,6 +39,10 @@ PRIMITIVE_G(UNKNOWN, "") +PRIMITIVE_R(INNERMOST_CONTEXT, "innermost context") +PRIMITIVE_R(OUTER_CONTEXT, "outer context") +PRIMITIVE_R(HOIST_TO_CONTEXT, "hoist to context") + PRIMITIVE_R(ACTUALS_LIST, "actuals list") PRIMITIVE_G(NOOP, "noop") PRIMITIVE_G(MOVE, "move") diff --git a/frontend/lib/resolution/prims.cpp b/frontend/lib/resolution/prims.cpp index 8740c21b164b..5deb0938bbb6 100644 --- a/frontend/lib/resolution/prims.cpp +++ b/frontend/lib/resolution/prims.cpp @@ -1716,6 +1716,9 @@ CallResolutionResult resolvePrimCall(Context* context, case PRIM_REF_DESERIALIZE: case PRIM_UNKNOWN: + case PRIM_INNERMOST_CONTEXT: + case PRIM_OUTER_CONTEXT: + case PRIM_HOIST_TO_CONTEXT: case NUM_KNOWN_PRIMS: case PRIM_BREAKPOINT: case PRIM_CONST_ARG_HASH: diff --git a/modules/internal/ChapelBase.chpl b/modules/internal/ChapelBase.chpl index 8e0154e83a77..09c1461c3e7f 100644 --- a/modules/internal/ChapelBase.chpl +++ b/modules/internal/ChapelBase.chpl @@ -1633,6 +1633,19 @@ module ChapelBase { return ret; } + pragma "llvm return noalias" + proc _ddata_allocate_noinit_gpu_shared(type eltType, size: integral, + out callPostAlloc: bool, + subloc = c_sublocid_none) { + if CHPL_LOCALE_MODEL != "gpu" then + compilerError("_ddata_allocate_noinit_gpu_shared can't be called in this config"); + + var ret: _ddata(eltType); + // TODO why fixed size? + ret = __primitive("cast", ret.type, __primitive("gpu allocShared", 4096*8)); + return ret; + } + inline proc _ddata_allocate_postalloc(data:_ddata, size: integral) { pragma "fn synchronization free" pragma "insert line file info" diff --git a/modules/internal/DefaultRectangular.chpl b/modules/internal/DefaultRectangular.chpl index d02cd7955705..2bbb2e3adfd2 100644 --- a/modules/internal/DefaultRectangular.chpl +++ b/modules/internal/DefaultRectangular.chpl @@ -158,6 +158,7 @@ module DefaultRectangular { class DefaultRectangularDom: BaseRectangularDom(?) { var dist: unmanaged DefaultDist; var ranges : rank*range(idxType,boundKind.both,strides); + var useGpuSharedMemory : bool = false; override proc linksDistribution() param do return false; override proc dsiLinksDistribution() do return false; @@ -700,7 +701,8 @@ module DefaultRectangular { idxType=idxType, strides=strides, dom=_to_unmanaged(this), - initElts=initElts); + initElts=initElts, + useGpuSharedMemory=useGpuSharedMemory); } proc doiTryCreateArray(type eltType) throws { @@ -1049,6 +1051,9 @@ module DefaultRectangular { pragma "local field" var shiftedData : _ddata(eltType); + // For allocating arrays using GPU block shared memory + var useGpuSharedMemory: bool = false; + // note: used for external array support var externFreeFunc: c_ptr(void); var externArr: bool = false; @@ -1068,6 +1073,7 @@ module DefaultRectangular { param initElts = true, param deinitElts = initElts, data:_ddata(eltType) = nil, + useGpuSharedMemory = false, externArr = false, _borrowed = false, externFreeFunc: c_ptr(void) = nil) { @@ -1075,6 +1081,7 @@ module DefaultRectangular { idxType=idxType, strides=strides); this.dom = dom; this.data = data; + this.useGpuSharedMemory = useGpuSharedMemory; this.externFreeFunc = externFreeFunc; this.externArr = externArr; this._borrowed = _borrowed; @@ -1297,7 +1304,10 @@ module DefaultRectangular { chpl_debug_writeln("*** DR alloc ", eltType:string, " ", size); } - if !localeModelPartitionsIterationOnSublocales { + import ChplConfig; + if ChplConfig.CHPL_LOCALE_MODEL == "gpu" && useGpuSharedMemory { + data = _ddata_allocate_noinit_gpu_shared(eltType, size, callPostAlloc); + } else if !localeModelPartitionsIterationOnSublocales { data = _ddata_allocate_noinit(eltType, size, callPostAlloc); } else { data = _ddata_allocate_noinit(eltType, size, @@ -1494,6 +1504,10 @@ module DefaultRectangular { // Should have been checked above. param initElts = isDefaultInitializable(eltType); + if this.useGpuSharedMemory { + halt("Arrays based on GPU shared memory cannot be reallocated"); + } + var copy = new unmanaged DefaultRectangularArr(eltType=eltType, rank=rank, diff --git a/modules/standard/Collectives.chpl b/modules/standard/Collectives.chpl index 5c27104792e7..799f709ddbec 100644 --- a/modules/standard/Collectives.chpl +++ b/modules/standard/Collectives.chpl @@ -137,6 +137,12 @@ module Collectives { inline proc reset(nTasks: int) { bar.reset(nTasks); } + + @chpldoc.nodoc + proc multiply(n: int) { + try! reset((bar:(unmanaged aBarrier)).n*n); + } + } /* The BarrierBaseType class provides an abstract base type for barriers diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index ee232f302975..37ab5ad67e25 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -244,6 +244,12 @@ module GPU } } + inline proc createSharedCArray(type theType : c_array(?t, ?k)) ref { + var voidPtr = __primitive("gpu allocShared", numBytes(t) * k); + var arrayPtr = voidPtr : c_ptr(theType); + return arrayPtr.deref(); + } + /* Set the block size for kernels launched on the GPU. */ diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h index 59d67fd17cfb..a9970908a03d 100644 --- a/runtime/include/chpl-gen-includes.h +++ b/runtime/include/chpl-gen-includes.h @@ -73,6 +73,10 @@ chpl_localeID_t chpl_gen_getLocaleID(void) return localeID; } +static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { + chpl_error("assertOnGpu() failed", lineno, filenameIdx); +} + #ifdef __cplusplus } #endif diff --git a/runtime/include/gpu/chpl-gpu-gen-common.h b/runtime/include/gpu/chpl-gpu-gen-common.h index f763ddf2e1c7..336e55f65436 100644 --- a/runtime/include/gpu/chpl-gpu-gen-common.h +++ b/runtime/include/gpu/chpl-gpu-gen-common.h @@ -117,9 +117,6 @@ MAYBE_GPU static inline void chpl_gpu_printf8(const char *fmt, } __device__ static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { /* no op */ } -__host__ static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { - chpl_error("assertOnGpu() failed", lineno, filenameIdx); -} __device__ static inline unsigned int chpl_gpu_clock(void) { return (unsigned int)clock(); diff --git a/test/users/engin/context/ChapelContextSupport.chpl b/test/users/engin/context/ChapelContextSupport.chpl new file mode 100644 index 000000000000..2c9c9c1a240c --- /dev/null +++ b/test/users/engin/context/ChapelContextSupport.chpl @@ -0,0 +1,29 @@ +module ChapelContextSupport { + + pragma "context type" + record Context { + // TODO: we probably want different types for the loop's context and the + // iterator's. Stuff here doesn't make much sense for the loop's context. + // OTOH, we may want to use loop's context to set blockSize, for example. + // Prototype challenge: PRIM_OUTER_CONTEXT's return type maybe a little + // difficult to handle. But maybe return a c_void_ptr and cast it? + param rank: int; + + type idType = if rank==1 then int else rank*int; + var taskId: idType; + var numTasks: idType; + + proc init() { + this.rank = 1; + this.complete(); + // this is a loop context and not an iterator context + } + + proc init(param rank, taskId, numTasks) { + this.rank=rank; + this.complete(); + this.taskId = taskId; + this.numTasks = numTasks; + } + } +} diff --git a/test/users/engin/context/ChapelContextSupport.notest b/test/users/engin/context/ChapelContextSupport.notest new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/users/engin/context/Iterators.chpl b/test/users/engin/context/Iterators.chpl new file mode 100644 index 000000000000..90dda36f8055 --- /dev/null +++ b/test/users/engin/context/Iterators.chpl @@ -0,0 +1,309 @@ +module Iterators { + + config const taskPerLoc = 2; + config const doDebug = false; + + proc debug(args...) { + if doDebug then writeln((...args)); + } + + module SimpleOneDim { + use Iterators; + use ChapelContextSupport; + + + iter simpleOneDim(n) { + const iterRange = 0.. $2.prediffed +mv $2.prediffed $2 diff --git a/test/users/engin/context/basic2D.skipif b/test/users/engin/context/basic2D.skipif new file mode 100644 index 000000000000..ec5ce3d91b2b --- /dev/null +++ b/test/users/engin/context/basic2D.skipif @@ -0,0 +1,4 @@ +# I want to be able to control threads per locale for this test +# I can't do that with fifo, however + +CHPL_TASKS!=qthreads diff --git a/test/users/engin/context/basicArrayHoist-hoisted.good b/test/users/engin/context/basicArrayHoist-hoisted.good new file mode 100644 index 000000000000..0cfbf08886fc --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-hoisted.good @@ -0,0 +1 @@ +2 diff --git a/test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good b/test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good new file mode 100644 index 000000000000..d00491fd7e5b --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good @@ -0,0 +1 @@ +1 diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.chpl b/test/users/engin/context/basicArrayHoist-link-for-memleaks.chpl new file mode 120000 index 000000000000..c55c89131c07 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.chpl @@ -0,0 +1 @@ +basicArrayHoist.chpl \ No newline at end of file diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.execopts b/test/users/engin/context/basicArrayHoist-link-for-memleaks.execopts new file mode 100644 index 000000000000..51cc9f6c9049 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.execopts @@ -0,0 +1 @@ +--memLeaks --doVerboseMem=false diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.good b/test/users/engin/context/basicArrayHoist-link-for-memleaks.good new file mode 100644 index 000000000000..d677b495ec35 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.good @@ -0,0 +1,20 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good b/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good new file mode 100644 index 000000000000..320797059f91 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good @@ -0,0 +1,21 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.numlocales b/test/users/engin/context/basicArrayHoist-link-for-memleaks.numlocales new file mode 120000 index 000000000000..2eb595130631 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.numlocales @@ -0,0 +1 @@ +basicArrayHoist.numlocales \ No newline at end of file diff --git a/test/users/engin/context/basicArrayHoist-not-hoisted.good b/test/users/engin/context/basicArrayHoist-not-hoisted.good new file mode 100644 index 000000000000..209e3ef4b624 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-not-hoisted.good @@ -0,0 +1 @@ +20 diff --git a/test/users/engin/context/basicArrayHoist.chpl b/test/users/engin/context/basicArrayHoist.chpl new file mode 100644 index 000000000000..e7a06a322a5f --- /dev/null +++ b/test/users/engin/context/basicArrayHoist.chpl @@ -0,0 +1,25 @@ +use MemDiagnostics; + +use ChapelContextSupport; +use Iterators.SimpleOneDim; + +config param hoistArray = true; + +config const n = 20; +config const doVerboseMem = true; + +if doVerboseMem then startVerboseMem(); +forall i in simpleOneDim(n) { // context should be coming from a new syntax + const context = new Context(); + const vectorContext = __primitive("outer context", context); + const localTaskContext = __primitive("outer context", vectorContext); + const localeContext = __primitive("outer context", localTaskContext); + const preLocaleTaskContext = __primitive("outer context", localeContext); + + var a: [0.. $2.prediffed +mv $2.prediffed $2 diff --git a/test/users/engin/context/basicBarrier.chpl b/test/users/engin/context/basicBarrier.chpl new file mode 100644 index 000000000000..28ed170e6db2 --- /dev/null +++ b/test/users/engin/context/basicBarrier.chpl @@ -0,0 +1,42 @@ +use IO; +use Collectives; +use MemDiagnostics; + +use ChapelContextSupport; +use Iterators.SimpleOneDim; + +config param hoistArray = true; + +config const n = 20; +config const doVerboseMem = true; + +if doVerboseMem then startVerboseMem(); +forall i in simpleOneDim(n) { // context should be coming from a new syntax + const context = new Context(); + const vectorContext = __primitive("outer context", context); + const localTaskContext = __primitive("outer context", vectorContext); + const localeContext = __primitive("outer context", localTaskContext); + const preLocaleTaskContext = __primitive("outer context", localeContext); + + var a: [0.. $2.prediff +mv $2.prediff $2 diff --git a/test/users/engin/context/basicTaskIds.chpl b/test/users/engin/context/basicTaskIds.chpl new file mode 100644 index 000000000000..a6f17fc6bdef --- /dev/null +++ b/test/users/engin/context/basicTaskIds.chpl @@ -0,0 +1,23 @@ +use IO; +use MemDiagnostics; + +use ChapelContextSupport; +use Iterators.SimpleOneDim; + +config const n = 20; +config const doVerboseMem = true; + +if doVerboseMem then startVerboseMem(); +forall i in simpleOneDim(n) { // context should be coming from a new syntax + const context = new Context(); + const vectorContext = __primitive("outer context", context); + const localTaskContext = __primitive("outer context", vectorContext); + const localeContext = __primitive("outer context", localTaskContext); + const preLocaleTaskContext = __primitive("outer context", localeContext); + + serial { + writeln(here, " %2i ".format(i), localTaskContext, " ", localeContext, " ", + preLocaleTaskContext); + } +} +if doVerboseMem then stopVerboseMem(); diff --git a/test/users/engin/context/basicTaskIds.execopts b/test/users/engin/context/basicTaskIds.execopts new file mode 100644 index 000000000000..fab2fd391c97 --- /dev/null +++ b/test/users/engin/context/basicTaskIds.execopts @@ -0,0 +1 @@ +--n=40 --taskPerLoc=4 diff --git a/test/users/engin/context/basicTaskIds.good b/test/users/engin/context/basicTaskIds.good new file mode 100644 index 000000000000..c64e08052c63 --- /dev/null +++ b/test/users/engin/context/basicTaskIds.good @@ -0,0 +1,40 @@ +LOCALE0 0 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 1 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 2 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 3 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 4 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 5 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 6 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 7 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 8 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 9 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 10 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 11 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 12 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 13 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 14 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 15 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 16 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 17 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 18 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE0 19 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 2) +LOCALE1 20 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 21 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 22 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 23 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 24 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 25 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 26 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 27 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 28 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 29 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 30 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 31 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 32 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 33 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 34 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 35 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 36 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 37 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 38 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) +LOCALE1 39 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 1, numTasks = 2) diff --git a/test/users/engin/context/basicTaskIds.lm-gpu.good b/test/users/engin/context/basicTaskIds.lm-gpu.good new file mode 100644 index 000000000000..f99c9c1281a7 --- /dev/null +++ b/test/users/engin/context/basicTaskIds.lm-gpu.good @@ -0,0 +1,41 @@ +LOCALE0 0 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +LOCALE0 1 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 2 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 3 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 4 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 5 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 6 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 7 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 8 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 9 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 10 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 11 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 12 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 13 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 14 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 15 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 16 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 17 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 18 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 19 (taskId = 1, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 20 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 21 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 22 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 23 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 24 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 25 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 26 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 27 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 28 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 29 (taskId = 2, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 30 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 31 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 32 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 33 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 34 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 35 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 36 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 37 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 38 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) +LOCALE0 39 (taskId = 3, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) diff --git a/test/users/engin/context/basicTaskIds.numlocales b/test/users/engin/context/basicTaskIds.numlocales new file mode 100644 index 000000000000..0cfbf08886fc --- /dev/null +++ b/test/users/engin/context/basicTaskIds.numlocales @@ -0,0 +1 @@ +2 diff --git a/test/users/engin/context/basicTaskIds.prediff b/test/users/engin/context/basicTaskIds.prediff new file mode 100755 index 000000000000..876d9895cc36 --- /dev/null +++ b/test/users/engin/context/basicTaskIds.prediff @@ -0,0 +1,4 @@ +#!/bin/sh + +sort -k2 -n $2 > $2.prediffed +mv $2.prediffed $2 diff --git a/test/users/engin/context/gpuSharedMem.chpl b/test/users/engin/context/gpuSharedMem.chpl new file mode 100644 index 000000000000..cd0e9435d719 --- /dev/null +++ b/test/users/engin/context/gpuSharedMem.chpl @@ -0,0 +1,59 @@ +use CTypes; +use GPU; + +use ChapelContextSupport; +use Iterators.SimpleOneDim; + +config param N=16; +config param BLOCK_SIZE=32; + +config const n = 20; + +on here.gpus[0] { + var A : [0.. 0 && i % BLOCK_SIZE == 0 { + write(" | "); + } else if i > 0 { + write(" "); + } + write(A[i]); + } + writeln("]"); + + // Print stable values: + write(" A (stable) = ["); + for i in 0.. 0 then write(" "); + if i % BLOCK_SIZE != BLOCK_SIZE-1 { + write(A[i]); + } + } + writeln("]"); +} diff --git a/test/users/engin/context/gpuSharedMem.good b/test/users/engin/context/gpuSharedMem.good new file mode 100644 index 000000000000..571a9ccf1fbb --- /dev/null +++ b/test/users/engin/context/gpuSharedMem.good @@ -0,0 +1 @@ + A (stable) = [200 301 402 503 604 705 806 907 1008 1109 1210 1311 1412 1513 1614 115] diff --git a/test/users/engin/context/gpuSharedMem.prediff b/test/users/engin/context/gpuSharedMem.prediff new file mode 100755 index 000000000000..45d901ee0025 --- /dev/null +++ b/test/users/engin/context/gpuSharedMem.prediff @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +testname=$1 +outfile=$2 + +tmpfile=$outfile.prediff.tmp +grep 'A (stable)' $outfile > $tmpfile +mv $tmpfile $outfile diff --git a/test/users/engin/context/gpuSharedMem.skipif b/test/users/engin/context/gpuSharedMem.skipif new file mode 100644 index 000000000000..14aa578afbd0 --- /dev/null +++ b/test/users/engin/context/gpuSharedMem.skipif @@ -0,0 +1 @@ +CHPL_LOCALE_MODEL!=gpu diff --git a/test/users/engin/context/transpose.chpl b/test/users/engin/context/transpose.chpl new file mode 100644 index 000000000000..eddadd1dc2ef --- /dev/null +++ b/test/users/engin/context/transpose.chpl @@ -0,0 +1,85 @@ +use BlockDist; +use Collectives; +use CommDiagnostics; +use ChapelContextSupport; +use Time; + +use Iterators.TertiaryDRDomIterators; + +config type dataType = int; +config param assertLocal = false; + +config const n = 8; +config const printOutput = false; +config const reportPerf = true; +config const commDiag = false; +config const verboseComm = false; +config const verify = false; + +const Space = {0.. Date: Fri, 12 Jan 2024 19:06:54 -0800 Subject: [PATCH 2/4] Update deprecated/removed features Signed-off-by: Vassily Litvinov --- test/users/engin/context/ChapelContextSupport.chpl | 4 ++-- test/users/engin/context/Iterators.chpl | 2 +- test/users/engin/context/basic2D.chpl | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/users/engin/context/ChapelContextSupport.chpl b/test/users/engin/context/ChapelContextSupport.chpl index 2c9c9c1a240c..91da905247f6 100644 --- a/test/users/engin/context/ChapelContextSupport.chpl +++ b/test/users/engin/context/ChapelContextSupport.chpl @@ -15,13 +15,13 @@ module ChapelContextSupport { proc init() { this.rank = 1; - this.complete(); + init this; // this is a loop context and not an iterator context } proc init(param rank, taskId, numTasks) { this.rank=rank; - this.complete(); + init this; this.taskId = taskId; this.numTasks = numTasks; } diff --git a/test/users/engin/context/Iterators.chpl b/test/users/engin/context/Iterators.chpl index 90dda36f8055..250ff7ed90be 100644 --- a/test/users/engin/context/Iterators.chpl +++ b/test/users/engin/context/Iterators.chpl @@ -125,7 +125,7 @@ module Iterators { // not checking here whether the new low and high fit into idxType var low = (stride * followThis(i).lowBound:strType):idxType; var high = (stride * followThis(i).highBound:strType):idxType; - t(i) = ((low..high by stride:strType) + whole.dim(i).low by followThis(i).stride:strType).safeCast(t(i).type); + t(i) = ((low..high by stride:strType) + whole.dim(i).low by followThis(i).stride:strType) : (t(i).type); } for i in {(...t)} { yield i; diff --git a/test/users/engin/context/basic2D.chpl b/test/users/engin/context/basic2D.chpl index c7e75c1c380b..282628675147 100644 --- a/test/users/engin/context/basic2D.chpl +++ b/test/users/engin/context/basic2D.chpl @@ -7,7 +7,7 @@ config const n = 8; -var dom = Block.createDomain(0.. Date: Sat, 27 Jan 2024 18:28:02 -0800 Subject: [PATCH 3/4] Vass's changes Signed-off-by: Vassily Litvinov --- compiler/include/driver.h | 1 + compiler/include/lowerLoopContexts.h | 24 ++ compiler/main/driver.cpp | 4 +- compiler/passes/checkResolved.cpp | 13 +- compiler/resolution/lowerLoopContexts.cpp | 235 +++++++++--------- modules/internal/DefaultRectangular.chpl | 18 +- runtime/include/chpl-gen-includes.h | 4 - runtime/include/gpu/chpl-gpu-gen-common.h | 3 + .../native/distArray/blockOutsideOn.compopts | 2 + .../distArray/blockUseInFunction.compopts | 2 + test/gpu/native/noGpu/basicMem.prediff | 10 +- test/users/engin/context/COMPOPTS | 1 + .../engin/context/ChapelContextSupport.chpl | 2 - test/users/engin/context/Iterators.chpl | 21 +- test/users/engin/context/SKIPIF | 6 + test/users/engin/context/basic2D.chpl | 3 +- ...c2D.lm-gpu.good => basic2D.comm-none.good} | 1 - test/users/engin/context/basic2D.prediff | 2 +- .../basicArrayHoist-hoisted.comm-none.good | 2 + ...cArrayHoist-hoisted.comm-none.lm-gpu.good} | 0 ...rrayHoist-link-for-memleaks.comm-none.good | 1 + ...st-link-for-memleaks.comm-none.lm-gpu.good | 1 + ...icArrayHoist-link-for-memleaks.lm-gpu.good | 21 -- ...basicArrayHoist-not-hoisted.comm-none.good | 2 + ...rayHoist-not-hoisted.comm-none.lm-gpu.good | 1 + .../engin/context/basicArrayHoist.prediff | 3 +- .../engin/context/basicBarrier.comm-none.good | 1 + ...ood => basicBarrier.comm-none.lm-gpu.good} | 1 - test/users/engin/context/basicBarrier.prediff | 4 +- .../engin/context/basicTaskIds.comm-none.good | 1 + ...ood => basicTaskIds.comm-none.lm-gpu.good} | 1 - test/users/engin/context/gpuSharedMem.prediff | 2 +- test/users/engin/context/gpuSharedMem.skipif | 3 + test/users/engin/context/transpose.chpl | 2 +- .../engin/context/transpose.comm-none.good | 3 + test/users/engin/context/transpose.skipif | 2 + util/chpl-completion.bash | 1 + util/cron/common-native-gpu.bash | 2 +- util/cron/test-gpu-cuda.um.bash | 1 - util/cron/test-gpu-ex-cuda-11.bash | 3 - util/cron/test-gpu-ex-cuda-12.bash | 3 - util/cron/test-gpu-ex-rocm-54.bash | 3 - 42 files changed, 218 insertions(+), 198 deletions(-) create mode 100644 test/gpu/native/distArray/blockOutsideOn.compopts create mode 100644 test/gpu/native/distArray/blockUseInFunction.compopts create mode 100644 test/users/engin/context/COMPOPTS create mode 100644 test/users/engin/context/SKIPIF rename test/users/engin/context/{basic2D.lm-gpu.good => basic2D.comm-none.good} (97%) create mode 100644 test/users/engin/context/basicArrayHoist-hoisted.comm-none.good rename test/users/engin/context/{basicArrayHoist-hoisted.lm-gpu.good => basicArrayHoist-hoisted.comm-none.lm-gpu.good} (100%) create mode 100644 test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.good create mode 120000 test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.lm-gpu.good delete mode 100644 test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good create mode 100644 test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.good create mode 100644 test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.lm-gpu.good create mode 100644 test/users/engin/context/basicBarrier.comm-none.good rename test/users/engin/context/{basicBarrier.lm-gpu.good => basicBarrier.comm-none.lm-gpu.good} (74%) create mode 100644 test/users/engin/context/basicTaskIds.comm-none.good rename test/users/engin/context/{basicTaskIds.lm-gpu.good => basicTaskIds.comm-none.lm-gpu.good} (96%) create mode 100644 test/users/engin/context/transpose.comm-none.good create mode 100644 test/users/engin/context/transpose.skipif diff --git a/compiler/include/driver.h b/compiler/include/driver.h index e6f27def681d..ebb2b4160833 100644 --- a/compiler/include/driver.h +++ b/compiler/include/driver.h @@ -253,6 +253,7 @@ extern bool fReportOptimizedOn; extern bool fReportPromotion; extern bool fReportScalarReplace; extern bool fReportGpu; +extern bool fIteratorContexts; extern bool fReportContextAdj; extern bool fReportDeadBlocks; extern bool fReportDeadModules; diff --git a/compiler/include/lowerLoopContexts.h b/compiler/include/lowerLoopContexts.h index bed653b2aa08..99bd4699cc18 100644 --- a/compiler/include/lowerLoopContexts.h +++ b/compiler/include/lowerLoopContexts.h @@ -1,2 +1,26 @@ +/* + * Copyright 2020-2024 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _LOWER_LOOP_CONTEXTS_H_ +#define _LOWER_LOOP_CONTEXTS_H_ void lowerContexts(); + +#endif diff --git a/compiler/main/driver.cpp b/compiler/main/driver.cpp index ee91f8a3a9f5..0c532cb49e24 100644 --- a/compiler/main/driver.cpp +++ b/compiler/main/driver.cpp @@ -295,6 +295,7 @@ bool fReportOptimizeForallUnordered = false; bool fReportPromotion = false; bool fReportScalarReplace = false; bool fReportGpu = false; +bool fIteratorContexts = false; bool fReportContextAdj = false; bool fReportDeadBlocks = false; bool fReportDeadModules = false; @@ -1471,7 +1472,8 @@ static ArgumentDescription arg_desc[] = { {"report-promotion", ' ', NULL, "Print information about scalar promotion", "F", &fReportPromotion, NULL, NULL}, {"report-scalar-replace", ' ', NULL, "Print scalar replacement stats", "F", &fReportScalarReplace, NULL, NULL}, {"report-gpu", ' ', NULL, "Print information about what loops are and are not GPU eligible", "F", &fReportGpu, NULL, NULL}, - {"report-context-adjustments", ' ', NULL, "Print debugging information while handling contexts", "F", &fReportContextAdj, NULL, NULL}, + {"iterator-contexts", ' ', NULL, "Handle iterator contexts", "F", &fIteratorContexts, NULL, NULL}, + {"report-context-adjustments", ' ', NULL, "Print debugging information while handling iterator contexts", "F", &fReportContextAdj, NULL, NULL}, {"", ' ', NULL, "Developer Flags -- Miscellaneous", NULL, NULL, NULL, NULL}, {"allow-noinit-array-not-pod", ' ', NULL, "Allow noinit for arrays of records", "N", &fAllowNoinitArrayNotPod, "CHPL_BREAK_ON_CODEGEN", NULL}, diff --git a/compiler/passes/checkResolved.cpp b/compiler/passes/checkResolved.cpp index 6772a2f6cacc..dff49b34ee5e 100644 --- a/compiler/passes/checkResolved.cpp +++ b/compiler/passes/checkResolved.cpp @@ -585,6 +585,14 @@ checkReturnPaths(FnSymbol* fn) { } } +static void checkIteratorContextPrimitives(CallExpr* call) { + if (call->isPrimitive(PRIM_INNERMOST_CONTEXT) || + call->isPrimitive(PRIM_OUTER_CONTEXT) || + call->isPrimitive(PRIM_HOIST_TO_CONTEXT) ) + USR_FATAL_CONT(call, + "use of this feature requires compiling with --iterator-contexts"); +} + static void checkBadAddrOf(CallExpr* call) { @@ -633,8 +641,11 @@ checkBadAddrOf(CallExpr* call) static void checkCalls() { - forv_Vec(CallExpr, call, gCallExprs) + forv_Vec(CallExpr, call, gCallExprs) { checkBadAddrOf(call); + if (! fIteratorContexts) + checkIteratorContextPrimitives(call); + } } // This function checks that the passed type is an acceptable diff --git a/compiler/resolution/lowerLoopContexts.cpp b/compiler/resolution/lowerLoopContexts.cpp index 5406bcbd690a..770584700658 100644 --- a/compiler/resolution/lowerLoopContexts.cpp +++ b/compiler/resolution/lowerLoopContexts.cpp @@ -1,6 +1,26 @@ +/* + * Copyright 2020-2024 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include -// probably too much: +// todo: can any of these be removed? #include "astutil.h" #include "AstVisitorTraverse.h" #include "CForLoop.h" @@ -39,30 +59,35 @@ static bool fnCanHaveContext(FnSymbol* fn) { } -static void CONTEXT_DEBUG(int indent, std::string msg, BaseAST* node) { +static std::string showid(BaseAST* node, const char* endstring="") { + std::string result; + if (developer) result = "[" + std::to_string(node->id) + "]" + endstring; + return result; +} +static void CONTEXT_DEBUG(int indent, std::string msg, + BaseAST* node, Symbol* sym = nullptr) { if (fReportContextAdj) { - for(int i=0 ; iid << "] " << msg << std::endl; + for (int i=0 ; iname << "' " << msg; + else + std::cout << showid(node, " ") << msg; + if (sym != nullptr) std::cout << " '" << sym->name << showid(sym) << "'"; + std::cout << std::endl; } } static int findFormalIndex(FnSymbol* fn, ArgSymbol* arg) { - int ret = -1; - int i = 1; for_formals (formal, fn) { - if (formal == arg) { - ret = i; - } + if (formal == arg) return i; i++; } - - return ret; + return -1; } - +// This class factors out code related to handles, esp. findLoopContextHandle. +// Todo: perhaps switch its subclasses from "is a Context" to "has a Context"? class Context { public: Symbol* localHandle_ = NULL; @@ -72,10 +97,14 @@ class Context { Expr* endOfLocalHandleSetup_ = NULL; Expr* upEndCount_ = NULL; + bool hasLocalHandle() { + return localHandle_ != nullptr; + } + // this'll need to be differentiated between LoopContext and IteratorContext // when we have a proper syntax. The current implementation is more suitable // for IteratorContext - bool findLoopContextHandle() { + void findLoopContextHandle() { const int debugDepth = 1; std::vector defExprs; collectDefExprs(this->node(), defExprs); @@ -99,8 +128,6 @@ class Context { } } } - - return localHandle_ != NULL; } void collectLocalHandleAutoDestroys() { @@ -166,6 +193,8 @@ class Context { bool defExprIsLocalHandle(DefExpr* def) { if (isArgSymbol(def->sym)) return false; + //todo: the check for FLAG_CONTEXT_TYPE should suffice + // and perhaps FLAG_TEMP - ?? return !def->sym->hasFlag(FLAG_TEMP) && !isLabelSymbol(def->sym) && !def->sym->hasFlag(FLAG_INDEX_VAR) && // avoid re-finding loop's @@ -173,7 +202,7 @@ class Context { def->sym->getValType()->symbol->hasFlag(FLAG_CONTEXT_TYPE); } -}; +}; // class Context class LoopContext: public Context { public: @@ -181,9 +210,6 @@ class LoopContext: public Context { LoopContext(CForLoop* loop): loop_(loop) {} BaseAST* node() override { return loop_; }; - - - }; class CoforallOnContext; @@ -295,7 +321,7 @@ class IteratorContext: public Context { virtual void dump(int depth) const = 0; virtual void recomputeActualSymbol(int& actualIdx, Symbol*& actual) const = 0; -}; +}; // class IteratorContext class CoforallOnContext : public IteratorContext { private: @@ -314,7 +340,10 @@ class CoforallOnContext : public IteratorContext { else if (fn_->hasFlag(FLAG_COBEGIN_OR_COFORALL)) { msg += "coforall function with handle "; } - msg += "[" + std::to_string(localHandle_->id) + "]"; + msg += localHandle_->name; + if (developer) { + msg += "[" + std::to_string(localHandle_->id) + "]"; + } CONTEXT_DEBUG(depth, msg, fn_); } @@ -328,24 +357,13 @@ class CoforallOnContext : public IteratorContext { auto parentFn = toFnSymbol(callToFn->parentSymbol); INT_ASSERT(parentFn); int formalIdx = findFormalIndex(parentFn, curFormal); - if (formalIdx >= 0) { - // CONTEXT_DEBUG(debugDepth+2, - // "which is actually the formal at idx " + std::to_string(formalIdx), - // curFormal); - - actualIdx = formalIdx; // update to this function's formal idx - } else { - INT_FATAL("how come?"); - } + INT_ASSERT(formalIdx >= 0); + actualIdx = formalIdx; // update to this function's formal idx } else { - // CONTEXT_DEBUG(debugDepth+2, - // "which is this symbol", - // curActual); - actualIdx = -1; // signal that this variable is local here } } -}; +}; // class CoforallOnContext class VectorizedLoopContext : public IteratorContext { private: @@ -359,14 +377,17 @@ class VectorizedLoopContext : public IteratorContext { void dump(int depth) const override { std::string msg = "vectorized loop with handle "; - msg += "[" + std::to_string(localHandle_->id) + "]"; + msg += localHandle_->name; + if (developer) { + msg += "[" + std::to_string(localHandle_->id) + "]"; + } CONTEXT_DEBUG(depth, msg, loop_); } void recomputeActualSymbol(int& actualIdx, Symbol*& actual) const override { // Do nothing, because vectorized loops aren't implemented as functions. } -}; +}; // class VectorizedLoopContext using IterContextPtr = std::unique_ptr; @@ -376,27 +397,30 @@ class ContextHandler { std::vector contextStack_; // map between any handle used within user's loop body and the indices to - // contexts within contextStack + // contexts within contextStack std::map handleMap_; - ContextHandler(CForLoop* loop): loopCtx_(loop) { - const int debugDepth = 1; + ContextHandler(CForLoop* loop): loopCtx_(loop) {} + void collectHandleAndOuterContexts() { + const int debugDepth = 1; + CForLoop* loop = loopCtx_.loop_; CONTEXT_DEBUG(debugDepth, "looking for the handle", loop); - if (loopCtx_.findLoopContextHandle()) { - CONTEXT_DEBUG(debugDepth+1, "found loop context handle", loopCtx_.localHandle_); + loopCtx_.findLoopContextHandle(); + + if (loopCtx_.hasLocalHandle()) { + CONTEXT_DEBUG(debugDepth+1, "found loop context handle", + loopCtx_.localHandle_); CONTEXT_DEBUG(debugDepth, "collecting context chain", loop); - if (this->collectOuterContexts()) { } - else { - CONTEXT_DEBUG(debugDepth, "couldn't find any context chain", loop); - } + this->collectOuterContexts(); } else { CONTEXT_DEBUG(debugDepth+1, "couldn't find context handle", loop); } + dumpOuterContexts(); } CForLoop* findInnermostLoop(Expr* curParent) { @@ -412,7 +436,7 @@ class ContextHandler { return nullptr; } - bool collectOuterContexts() { + void collectOuterContexts() { const int debugDepth = 2; // If the current context was implemented using a plain loop, that loop. @@ -432,8 +456,8 @@ class ContextHandler { outerCtx->setInnermostLooop(plainLoop); outerCtx->setCallToInner(callToCurCtx); outerCtx->setInnerLoop(findInnermostLoop(callToCurCtx)); - - if (!outerCtx->findLoopContextHandle()) break; + outerCtx->findLoopContextHandle(); + if (!outerCtx->hasLocalHandle()) break; contextStack_.push_back(std::move(outerCtx)); cur = cur->parentExpr; @@ -459,8 +483,8 @@ class ContextHandler { outerCtx->setInnermostLooop(plainLoop); outerCtx->setCallToInner(callToCurCtx); outerCtx->setInnerLoop(findInnermostLoop(callToCurCtx)); - - if (!outerCtx->findLoopContextHandle()) break; + outerCtx->findLoopContextHandle(); + if (!outerCtx->hasLocalHandle()) break; contextStack_.push_back(std::move(outerCtx)); cur = callToCurCtx = parentFn->singleInvocation(); @@ -470,15 +494,17 @@ class ContextHandler { break; } } while (true); - - return contextStack_.size() > 0; } void dumpOuterContexts() { - int debugDepth = 3; - CONTEXT_DEBUG(debugDepth, "found the following context chain:", this->loop()); - for (auto& i : contextStack_) { - i->dump(++debugDepth); + int debugDepth = 2; + if (contextStack_.size() == 0) { + CONTEXT_DEBUG(debugDepth, "couldn't find any context chain", this->loop()); + } else { + CONTEXT_DEBUG(debugDepth, "found the following context chain:", this->loop()); + for (auto& i : contextStack_) { + i->dump(++debugDepth); + } } } @@ -550,8 +576,7 @@ class ContextHandler { INT_ASSERT(toSymExpr(callAfterDef->get(1))->symbol() == outerCtxHandle); INT_ASSERT(toSymExpr(callAfterDef->get(2))->symbol() == lhs); - CONTEXT_DEBUG(debugDepth, "found an outer context handle", - outerCtxHandle); + CONTEXT_DEBUG(debugDepth, "found an outer context handle", outerCtxHandle); INT_ASSERT(handleMap_.count(outerCtxHandle) == 0); int outerCtxIdx = -1; @@ -572,11 +597,15 @@ class ContextHandler { } } + if (outerCtxIdx >= (int)contextStack_.size()) { + USR_FATAL(call, "could not find the %d-th outer context" + " in the iterator for the enclosing loop", outerCtxIdx+1); + } + handleMap_[outerCtxHandle] = outerCtxIdx; - CONTEXT_DEBUG(debugDepth+1, - "mapped to ["+std::to_string(contextStack_[outerCtxIdx]->localHandle_->id)+"]", - outerCtxHandle); + CONTEXT_DEBUG(debugDepth+1, "mapped to", outerCtxHandle, + contextStack_[outerCtxIdx]->localHandle_); // TODO refactor this into a common helper SET_LINENO(nextDef); @@ -605,8 +634,7 @@ class ContextHandler { int targetCtxIdx, HoistingKind kind) { const int debugDepth = 3; - CONTEXT_DEBUG(debugDepth, "will hoist to ["+std::to_string(context->localHandle_->id)+"]", - call); + CONTEXT_DEBUG(debugDepth, "will hoist to", call, context->localHandle_); if (kind == HoistingKind::CArray) { // The early GPU transformations would've added a block here with @@ -614,7 +642,7 @@ class ContextHandler { // being hoisted to the GPU, so remove that block. auto block = toBlockStmt(call->prev); INT_ASSERT(block && "Expected a generated block before the hoist"); - CONTEXT_DEBUG(debugDepth, "removing block ["+std::to_string(block->id)+"]", block); + CONTEXT_DEBUG(debugDepth, "removing this block", block); block->remove(); } @@ -703,16 +731,11 @@ class ContextHandler { if (outerActuals.count(argSym) == 1) continue; int formalIdx = findFormalIndex(parentFn, argSym); - if (formalIdx >= 0) { - CONTEXT_DEBUG(debugDepth+1, - "found a formal at idx " + std::to_string(formalIdx), - argSym); - - outerActuals[argSym] = formalIdx; - } - else { - INT_FATAL("how come?"); - } + INT_ASSERT(formalIdx >= 0); + CONTEXT_DEBUG(debugDepth+1, + "found a formal at idx " + std::to_string(formalIdx), + argSym); + outerActuals[argSym] = formalIdx; } } @@ -732,7 +755,7 @@ class ContextHandler { cur = call->prev; } - // we have some symbols declared elsewhere in the block that we are hoisting + // we have some symbols declared elsewhere in the block that we are hoisting // walk up the context stack to the target to find where they are SymbolMap newContextUpdateMap; for (auto& outerActualToIdx : outerActuals) { @@ -745,7 +768,7 @@ class ContextHandler { // context. IOW, the user wants to hoist a block that has a symbol, // to a block where the symbol wasn't defined yet. This is a user // error. - USR_FATAL("Attempt to hoist symbols from an inner context to an outer context"); + USR_FATAL(call, "Attempt to hoist symbols from an inner context to an outer context"); } contextStack_[i]->recomputeActualSymbol(actualIdx, curActual); @@ -754,20 +777,19 @@ class ContextHandler { newContextUpdateMap.put(outerActualToIdx.first, curActual); } - std::string hoistedName = "hoisted_" + std::string(toHoist->name); - - Symbol* refToSym = new VarSymbol(hoistedName.c_str(), toHoist->getRefType()); + const char* hoistedName = astr("hoisted_", toHoist->name); + Symbol* refToSym = new VarSymbol(hoistedName, toHoist->getRefType()); DefExpr* refToSymDef = new DefExpr(refToSym); CallExpr* setRef = new CallExpr(PRIM_MOVE, refToSym, new CallExpr(PRIM_ADDR_OF, toHoist)); context->getInsertBeforeCallToInnerAnchor()->insertBefore(refToSymDef); context->getInsertBeforeCallToInnerAnchor()->insertBefore(setRef); - for (int curCtx = targetCtxIdx ; curCtx >= 0 ; curCtx--) { auto& ctx = contextStack_[curCtx]; - auto innerSym = ctx->insertActualForHoistedSymbol(refToSym, hoistedName.c_str(), toHoist->getRefType()); + auto innerSym = ctx->insertActualForHoistedSymbol(refToSym, hoistedName, + toHoist->getRefType()); if (kind == HoistingKind::Barrier) { if (Expr* upEndCount = ctx->getUpEndCount()) { @@ -896,7 +918,6 @@ class ContextHandler { while (curIdx < handles.size()) { const int debugDepth = 1; std::vector handleUses; - collectSymExprsFor(this->loop(), handles[curIdx], handleUses); for_vector (SymExpr, use, handleUses) { @@ -917,44 +938,19 @@ class ContextHandler { } else { CONTEXT_DEBUG(debugDepth, "illegal use of context handle", use); - INT_FATAL("illegal use of context handle"); + USR_FATAL(use, "illegal use of context handle %s", + loopHandle()->name); } } curIdx++; } } - - void handleContextUsesWithinLoopBody(Symbol* handle) { - const int debugDepth = 1; - std::vector handleUses; - collectSymExprsFor(this->loop(), handle, handleUses); - - for_vector (SymExpr, use, handleUses) { - if (CallExpr* call = toCallExpr(use->parentExpr)) { - CONTEXT_DEBUG(debugDepth+1, "found a call that uses handle", call); - - if (call->isPrimitive(PRIM_OUTER_CONTEXT)) { - handleOuterContextCall(call); - } - else if (call->isPrimitive(PRIM_MOVE)) { - CONTEXT_DEBUG(debugDepth+2, "ignoring call", call); - } - else { - CONTEXT_DEBUG(debugDepth+2, "call is illegal", call); - INT_FATAL("call is illegal"); - } - } - else { - CONTEXT_DEBUG(debugDepth, "illegal use of context handle", use); - INT_FATAL("illegal use of context handle"); - } - } - } -}; - +}; // class ContextHandler void lowerContexts() { + if (!fIteratorContexts) return; + forv_Vec(FnSymbol*, fn, gFnSymbols) { if (!isInUserCode(fn)) continue; @@ -964,14 +960,11 @@ void lowerContexts() { for_vector(BaseAST, ast, asts) { if (CForLoop* loop = toCForLoop(ast)) { - const int debugDepth = 0; - - CONTEXT_DEBUG(debugDepth, "encountered a C loop", loop); + CONTEXT_DEBUG(0, "encountered a C loop", loop); + if (loop->id == breakOnResolveID) gdbShouldBreakHere(); ContextHandler h(loop); - - h.dumpOuterContexts(); - + h.collectHandleAndOuterContexts(); h.handleContextUsesWithinLoopBody(); } } diff --git a/modules/internal/DefaultRectangular.chpl b/modules/internal/DefaultRectangular.chpl index 2bbb2e3adfd2..d02cd7955705 100644 --- a/modules/internal/DefaultRectangular.chpl +++ b/modules/internal/DefaultRectangular.chpl @@ -158,7 +158,6 @@ module DefaultRectangular { class DefaultRectangularDom: BaseRectangularDom(?) { var dist: unmanaged DefaultDist; var ranges : rank*range(idxType,boundKind.both,strides); - var useGpuSharedMemory : bool = false; override proc linksDistribution() param do return false; override proc dsiLinksDistribution() do return false; @@ -701,8 +700,7 @@ module DefaultRectangular { idxType=idxType, strides=strides, dom=_to_unmanaged(this), - initElts=initElts, - useGpuSharedMemory=useGpuSharedMemory); + initElts=initElts); } proc doiTryCreateArray(type eltType) throws { @@ -1051,9 +1049,6 @@ module DefaultRectangular { pragma "local field" var shiftedData : _ddata(eltType); - // For allocating arrays using GPU block shared memory - var useGpuSharedMemory: bool = false; - // note: used for external array support var externFreeFunc: c_ptr(void); var externArr: bool = false; @@ -1073,7 +1068,6 @@ module DefaultRectangular { param initElts = true, param deinitElts = initElts, data:_ddata(eltType) = nil, - useGpuSharedMemory = false, externArr = false, _borrowed = false, externFreeFunc: c_ptr(void) = nil) { @@ -1081,7 +1075,6 @@ module DefaultRectangular { idxType=idxType, strides=strides); this.dom = dom; this.data = data; - this.useGpuSharedMemory = useGpuSharedMemory; this.externFreeFunc = externFreeFunc; this.externArr = externArr; this._borrowed = _borrowed; @@ -1304,10 +1297,7 @@ module DefaultRectangular { chpl_debug_writeln("*** DR alloc ", eltType:string, " ", size); } - import ChplConfig; - if ChplConfig.CHPL_LOCALE_MODEL == "gpu" && useGpuSharedMemory { - data = _ddata_allocate_noinit_gpu_shared(eltType, size, callPostAlloc); - } else if !localeModelPartitionsIterationOnSublocales { + if !localeModelPartitionsIterationOnSublocales { data = _ddata_allocate_noinit(eltType, size, callPostAlloc); } else { data = _ddata_allocate_noinit(eltType, size, @@ -1504,10 +1494,6 @@ module DefaultRectangular { // Should have been checked above. param initElts = isDefaultInitializable(eltType); - if this.useGpuSharedMemory { - halt("Arrays based on GPU shared memory cannot be reallocated"); - } - var copy = new unmanaged DefaultRectangularArr(eltType=eltType, rank=rank, diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h index a9970908a03d..59d67fd17cfb 100644 --- a/runtime/include/chpl-gen-includes.h +++ b/runtime/include/chpl-gen-includes.h @@ -73,10 +73,6 @@ chpl_localeID_t chpl_gen_getLocaleID(void) return localeID; } -static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { - chpl_error("assertOnGpu() failed", lineno, filenameIdx); -} - #ifdef __cplusplus } #endif diff --git a/runtime/include/gpu/chpl-gpu-gen-common.h b/runtime/include/gpu/chpl-gpu-gen-common.h index 336e55f65436..f763ddf2e1c7 100644 --- a/runtime/include/gpu/chpl-gpu-gen-common.h +++ b/runtime/include/gpu/chpl-gpu-gen-common.h @@ -117,6 +117,9 @@ MAYBE_GPU static inline void chpl_gpu_printf8(const char *fmt, } __device__ static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { /* no op */ } +__host__ static inline void chpl_assert_on_gpu(int32_t lineno, int32_t filenameIdx) { + chpl_error("assertOnGpu() failed", lineno, filenameIdx); +} __device__ static inline unsigned int chpl_gpu_clock(void) { return (unsigned int)clock(); diff --git a/test/gpu/native/distArray/blockOutsideOn.compopts b/test/gpu/native/distArray/blockOutsideOn.compopts new file mode 100644 index 000000000000..015b94f2ff3b --- /dev/null +++ b/test/gpu/native/distArray/blockOutsideOn.compopts @@ -0,0 +1,2 @@ +# This test currently crashes the compiler with --verify. To be fixed. +--no-verify diff --git a/test/gpu/native/distArray/blockUseInFunction.compopts b/test/gpu/native/distArray/blockUseInFunction.compopts new file mode 100644 index 000000000000..015b94f2ff3b --- /dev/null +++ b/test/gpu/native/distArray/blockUseInFunction.compopts @@ -0,0 +1,2 @@ +# This test currently crashes the compiler with --verify. To be fixed. +--no-verify diff --git a/test/gpu/native/noGpu/basicMem.prediff b/test/gpu/native/noGpu/basicMem.prediff index 8e4f8a901090..c03e809ecfc9 100755 --- a/test/gpu/native/noGpu/basicMem.prediff +++ b/test/gpu/native/noGpu/basicMem.prediff @@ -1,5 +1,9 @@ #!/bin/sh -sed -i -e 's/0x.*/0xPREDIFFED/' $2 -sed -i -e '//d' $2 -sed -i -e '/basicMem.chpl:5:.*\[domain(1,int(64),one)\] int(64) at/s/ 1[0-9][0-9]B / 1nnB /' $2 +cat $2 | sed \ + -e 's/0x.*/0xPREDIFFED/' \ + -e '//d' \ + -e '/basicMem.chpl:5:.*\[domain(1,int(64),one)\] int(64) at/s/ 1[0-9][0-9]B / 1nnB /' \ + $2 > $2.tmp + +mv $2.tmp $2 diff --git a/test/users/engin/context/COMPOPTS b/test/users/engin/context/COMPOPTS new file mode 100644 index 000000000000..c9e99f2dc0bf --- /dev/null +++ b/test/users/engin/context/COMPOPTS @@ -0,0 +1 @@ +--iterator-contexts diff --git a/test/users/engin/context/ChapelContextSupport.chpl b/test/users/engin/context/ChapelContextSupport.chpl index 91da905247f6..59ab1175939d 100644 --- a/test/users/engin/context/ChapelContextSupport.chpl +++ b/test/users/engin/context/ChapelContextSupport.chpl @@ -15,13 +15,11 @@ module ChapelContextSupport { proc init() { this.rank = 1; - init this; // this is a loop context and not an iterator context } proc init(param rank, taskId, numTasks) { this.rank=rank; - init this; this.taskId = taskId; this.numTasks = numTasks; } diff --git a/test/users/engin/context/Iterators.chpl b/test/users/engin/context/Iterators.chpl index 250ff7ed90be..bf95353884b7 100644 --- a/test/users/engin/context/Iterators.chpl +++ b/test/users/engin/context/Iterators.chpl @@ -55,6 +55,13 @@ module Iterators { use DSIUtil; use ChapelContextSupport; + /////////// BlockDom.customThese() : serial, leader, follower /////////// + + iter BlockDom.customThese() { + for i in whole do + yield i; + } + iter BlockDom.customThese(param tag: iterKind) where tag == iterKind.leader { const maxTasks = dist.dataParTasksPerLocale; const ignoreRunning = dist.dataParIgnoreRunningTasks; @@ -132,6 +139,8 @@ module Iterators { } } + /////////// DefaultRectangularDom.customThese() leader /////////// + iter DefaultRectangularDom.customThese(param tag: iterKind, tasksPerLocale = dataParTasksPerLocale, ignoreRunning = dataParIgnoreRunningTasks, @@ -139,7 +148,6 @@ module Iterators { offset=createTuple(rank, chpl_integralIdxType, 0:chpl_integralIdxType)) where tag == iterKind.leader { - const numSublocs = here._getChildCount(); if localeModelPartitionsIterationOnSublocales && numSublocs != 0 { @@ -264,10 +272,7 @@ module Iterators { } } - iter BlockDom.customThese() { - for i in whole do - yield i; - } + /////////// _domain.customThese() : serial, s/a, leader, follower /////////// /* Yield the domain indices */ iter _domain.customThese() { @@ -279,10 +284,11 @@ module Iterators { @chpldoc.nodoc iter _domain.customThese(param tag: iterKind) where tag == iterKind.standalone && - __primitive("resolves", _value.these(tag=tag)) { + __primitive("resolves", _value.customThese(tag=tag)) { for i in _value.customThese(tag) do yield i; } + @chpldoc.nodoc iter _domain.customThese(param tag: iterKind) where tag == iterKind.leader { @@ -292,6 +298,7 @@ module Iterators { for followThis in _value.customThese(tag) do yield followThis; } + @chpldoc.nodoc iter _domain.customThese(param tag: iterKind, followThis, param fast: bool = false) where tag == iterKind.follower { @@ -305,5 +312,5 @@ module Iterators { } } - } + } // module TertiaryDRDomIterators } diff --git a/test/users/engin/context/SKIPIF b/test/users/engin/context/SKIPIF new file mode 100644 index 000000000000..473b9a0f1999 --- /dev/null +++ b/test/users/engin/context/SKIPIF @@ -0,0 +1,6 @@ +# Upon --no-local the tests behave as if under gasnet, +# however the test system runs diff against .comm-none.good. +# Skip --no-local to avoid meaningless failures. +COMPOPTS <= --no-local +# Skip also upon --baseline, as a precaution. +COMPOPTS <= --baseline diff --git a/test/users/engin/context/basic2D.chpl b/test/users/engin/context/basic2D.chpl index 282628675147..918cc764700b 100644 --- a/test/users/engin/context/basic2D.chpl +++ b/test/users/engin/context/basic2D.chpl @@ -5,8 +5,6 @@ use Iterators.TertiaryDRDomIterators; config const n = 8; - - var dom = blockDist.createDomain(0.. $2.prediffed +grep -v 'NUMA domains but only .* Qthreads shepherds' $2 | sort > $2.prediffed mv $2.prediffed $2 diff --git a/test/users/engin/context/basicArrayHoist-hoisted.comm-none.good b/test/users/engin/context/basicArrayHoist-hoisted.comm-none.good new file mode 100644 index 000000000000..0b22070123a5 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-hoisted.comm-none.good @@ -0,0 +1,2 @@ +0 +basicArrayHoist.chpl:17: error: could not find the 4-th outer context in the iterator for the enclosing loop diff --git a/test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good b/test/users/engin/context/basicArrayHoist-hoisted.comm-none.lm-gpu.good similarity index 100% rename from test/users/engin/context/basicArrayHoist-hoisted.lm-gpu.good rename to test/users/engin/context/basicArrayHoist-hoisted.comm-none.lm-gpu.good diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.good b/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.good new file mode 100644 index 000000000000..7110375c5402 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.good @@ -0,0 +1 @@ +basicArrayHoist-link-for-memleaks.chpl:17: error: could not find the 4-th outer context in the iterator for the enclosing loop diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.lm-gpu.good b/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.lm-gpu.good new file mode 120000 index 000000000000..9f5d790f9f91 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-link-for-memleaks.comm-none.lm-gpu.good @@ -0,0 +1 @@ +basicArrayHoist-link-for-memleaks.good \ No newline at end of file diff --git a/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good b/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good deleted file mode 100644 index 320797059f91..000000000000 --- a/test/users/engin/context/basicArrayHoist-link-for-memleaks.lm-gpu.good +++ /dev/null @@ -1,21 +0,0 @@ -warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 diff --git a/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.good b/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.good new file mode 100644 index 000000000000..0b22070123a5 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.good @@ -0,0 +1,2 @@ +0 +basicArrayHoist.chpl:17: error: could not find the 4-th outer context in the iterator for the enclosing loop diff --git a/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.lm-gpu.good b/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.lm-gpu.good new file mode 100644 index 000000000000..209e3ef4b624 --- /dev/null +++ b/test/users/engin/context/basicArrayHoist-not-hoisted.comm-none.lm-gpu.good @@ -0,0 +1 @@ +20 diff --git a/test/users/engin/context/basicArrayHoist.prediff b/test/users/engin/context/basicArrayHoist.prediff index 86859dc6438c..110f9fdc1d6e 100755 --- a/test/users/engin/context/basicArrayHoist.prediff +++ b/test/users/engin/context/basicArrayHoist.prediff @@ -1,4 +1,5 @@ #!/bin/sh -grep "allocate.*array elements" $2 | wc -l > $2.prediffed +grep -c "allocate.*array elements" $2 > $2.prediffed +grep -i 'error\|warning' $2 >> $2.prediffed mv $2.prediffed $2 diff --git a/test/users/engin/context/basicBarrier.comm-none.good b/test/users/engin/context/basicBarrier.comm-none.good new file mode 100644 index 000000000000..0bb40f359f39 --- /dev/null +++ b/test/users/engin/context/basicBarrier.comm-none.good @@ -0,0 +1 @@ +basicBarrier.chpl:19: error: could not find the 4-th outer context in the iterator for the enclosing loop diff --git a/test/users/engin/context/basicBarrier.lm-gpu.good b/test/users/engin/context/basicBarrier.comm-none.lm-gpu.good similarity index 74% rename from test/users/engin/context/basicBarrier.lm-gpu.good rename to test/users/engin/context/basicBarrier.comm-none.lm-gpu.good index 23fc652db638..a26736dd99d9 100644 --- a/test/users/engin/context/basicBarrier.lm-gpu.good +++ b/test/users/engin/context/basicBarrier.comm-none.lm-gpu.good @@ -1,4 +1,3 @@ -warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly 10: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11: 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12: 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/test/users/engin/context/basicBarrier.prediff b/test/users/engin/context/basicBarrier.prediff index d9001b7b8ece..6d4832019358 100755 --- a/test/users/engin/context/basicBarrier.prediff +++ b/test/users/engin/context/basicBarrier.prediff @@ -1,4 +1,4 @@ #!/bin/sh -sort -n $2 > $2.prediff -mv $2.prediff $2 +sort -n $2 > $2.prediffed +mv $2.prediffed $2 diff --git a/test/users/engin/context/basicTaskIds.comm-none.good b/test/users/engin/context/basicTaskIds.comm-none.good new file mode 100644 index 000000000000..d05ef2414aee --- /dev/null +++ b/test/users/engin/context/basicTaskIds.comm-none.good @@ -0,0 +1 @@ +basicTaskIds.chpl:16: error: could not find the 4-th outer context in the iterator for the enclosing loop diff --git a/test/users/engin/context/basicTaskIds.lm-gpu.good b/test/users/engin/context/basicTaskIds.comm-none.lm-gpu.good similarity index 96% rename from test/users/engin/context/basicTaskIds.lm-gpu.good rename to test/users/engin/context/basicTaskIds.comm-none.lm-gpu.good index f99c9c1281a7..050d5abaad92 100644 --- a/test/users/engin/context/basicTaskIds.lm-gpu.good +++ b/test/users/engin/context/basicTaskIds.comm-none.lm-gpu.good @@ -1,5 +1,4 @@ LOCALE0 0 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) -warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly LOCALE0 1 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) LOCALE0 2 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) LOCALE0 3 (taskId = 0, numTasks = 4) (taskId = 0, numTasks = 1) (taskId = 0, numTasks = 1) diff --git a/test/users/engin/context/gpuSharedMem.prediff b/test/users/engin/context/gpuSharedMem.prediff index 45d901ee0025..adb2c7ca108e 100755 --- a/test/users/engin/context/gpuSharedMem.prediff +++ b/test/users/engin/context/gpuSharedMem.prediff @@ -3,6 +3,6 @@ testname=$1 outfile=$2 -tmpfile=$outfile.prediff.tmp +tmpfile=$outfile.prediffed grep 'A (stable)' $outfile > $tmpfile mv $tmpfile $outfile diff --git a/test/users/engin/context/gpuSharedMem.skipif b/test/users/engin/context/gpuSharedMem.skipif index 14aa578afbd0..0ba44004c598 100644 --- a/test/users/engin/context/gpuSharedMem.skipif +++ b/test/users/engin/context/gpuSharedMem.skipif @@ -1 +1,4 @@ +# This test is specific to device shared memory, +# otherwise the behavior is undefined. CHPL_LOCALE_MODEL!=gpu +CHPL_GPU=cpu diff --git a/test/users/engin/context/transpose.chpl b/test/users/engin/context/transpose.chpl index eddadd1dc2ef..eee6b3b86524 100644 --- a/test/users/engin/context/transpose.chpl +++ b/test/users/engin/context/transpose.chpl @@ -17,7 +17,7 @@ config const verboseComm = false; config const verify = false; const Space = {0.. Date: Fri, 23 Feb 2024 20:00:12 -0800 Subject: [PATCH 4/4] Follow up on review feedback Signed-off-by: Vassily Litvinov --- compiler/passes/normalize.cpp | 41 +++++++++----------- modules/standard/Collectives.chpl | 1 + modules/standard/GPU.chpl | 1 + test/users/engin/context/basicBarrier.chpl | 2 +- test/users/engin/context/gpuSharedMem.skipif | 2 +- 5 files changed, 23 insertions(+), 24 deletions(-) diff --git a/compiler/passes/normalize.cpp b/compiler/passes/normalize.cpp index 72ef7cd9ffab..16c83c6ffebb 100644 --- a/compiler/passes/normalize.cpp +++ b/compiler/passes/normalize.cpp @@ -130,9 +130,9 @@ static TypeSymbol* expandTypeAlias(SymExpr* se); * * ************************************** | *************************************/ -static void earlyGpuTransforms() { - forv_expanding_Vec(CallExpr, call, gCallExprs) { - if (!call->isPrimitive(PRIM_HOIST_TO_CONTEXT)) continue; +static void handleSharedCArrays() { + forv_expanding_Vec(CallExpr, call, gCallExprs) + if (call->isPrimitive(PRIM_HOIST_TO_CONTEXT)) // The particular definition we expect is a default-init c_array, which is: // @@ -141,30 +141,26 @@ static void earlyGpuTransforms() { // call_tmp = c_array(t, k); // __primitive("default init var", myArray, call_tmp); - auto hoistDefExpr = toSymExpr(call->get(2))->symbol()->defPoint; - if (!isDefExpr(hoistDefExpr->next)) continue; - auto typeDefExpr = toDefExpr(hoistDefExpr->next); - if (!isCallExpr(typeDefExpr->next)) continue; - auto typeAssign = toCallExpr(typeDefExpr->next); - if (!typeAssign->isPrimitive(PRIM_MOVE) || - !isCallExpr(typeAssign->get(2))) continue; - auto typeCall = toCallExpr(typeAssign->get(2)); - if (!isCallExpr(typeAssign->next)) continue; - auto initCall = toCallExpr(typeAssign->next); - if (!initCall->isPrimitive(PRIM_DEFAULT_INIT_VAR)) continue; - - auto typeConstructor = toSymExpr(typeCall->baseExpr); - if (!typeConstructor) continue; - if (typeConstructor->symbol()->name != astr("c_array")) continue; - + if (DefExpr* hoistDefExpr = toSymExpr(call->get(2))->symbol()->defPoint) + if (DefExpr* typeDefExpr = toDefExpr(hoistDefExpr->next)) + if (CallExpr* typeAssign = toCallExpr(typeDefExpr->next)) + if (typeAssign->isPrimitive(PRIM_MOVE)) + if (CallExpr* typeCall = toCallExpr(typeAssign->get(2))) + if (CallExpr* initCall = toCallExpr(typeAssign->next)) + if (initCall->isPrimitive(PRIM_DEFAULT_INIT_VAR)) + if (SymExpr* typeConstructor = toSymExpr(typeCall->baseExpr)) + if (typeConstructor->symbol()->hasFlag(FLAG_C_ARRAY)) + // if all the above conditions succeeded, add a shared variant + { SET_LINENO(hoistDefExpr); auto newBlock = new BlockStmt(); auto newArr = new VarSymbol(astr("shared_", hoistDefExpr->sym->name)); newArr->qual = Qualifier::QUAL_REF; newBlock->insertAtTail(new DefExpr(newArr)); - newBlock->insertAtTail(new CallExpr(PRIM_MOVE, new SymExpr(newArr), new CallExpr("createSharedCArray", new SymExpr(typeDefExpr->sym)))); + newBlock->insertAtTail(new CallExpr(PRIM_MOVE, newArr, + new CallExpr("createSharedCArray", typeDefExpr->sym))); initCall->insertAfter(newBlock); - } + } } @@ -302,7 +298,8 @@ void normalize() { } } - earlyGpuTransforms(); + if (fIteratorContexts) + handleSharedCArrays(); find_printModuleInit_stuff(); } diff --git a/modules/standard/Collectives.chpl b/modules/standard/Collectives.chpl index 799f709ddbec..b22193f4aedb 100644 --- a/modules/standard/Collectives.chpl +++ b/modules/standard/Collectives.chpl @@ -138,6 +138,7 @@ module Collectives { bar.reset(nTasks); } + // This method is used in the WIP implementation of iterator contexts. @chpldoc.nodoc proc multiply(n: int) { try! reset((bar:(unmanaged aBarrier)).n*n); diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 37ab5ad67e25..db03b6879260 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -244,6 +244,7 @@ module GPU } } + @chpldoc.nodoc inline proc createSharedCArray(type theType : c_array(?t, ?k)) ref { var voidPtr = __primitive("gpu allocShared", numBytes(t) * k); var arrayPtr = voidPtr : c_ptr(theType); diff --git a/test/users/engin/context/basicBarrier.chpl b/test/users/engin/context/basicBarrier.chpl index 28ed170e6db2..7d1b1fdb857f 100644 --- a/test/users/engin/context/basicBarrier.chpl +++ b/test/users/engin/context/basicBarrier.chpl @@ -24,7 +24,7 @@ forall i in simpleOneDim(n) { // context should be coming from a new syntax var b; { // AST is too complicated without this block b = new barrier(1); - b.multiply(1); + b.multiply(1); // this no-op ensures multiply() is available to compiler } __primitive("hoist to context", localeContext, b); diff --git a/test/users/engin/context/gpuSharedMem.skipif b/test/users/engin/context/gpuSharedMem.skipif index 0ba44004c598..71fbdc324efa 100644 --- a/test/users/engin/context/gpuSharedMem.skipif +++ b/test/users/engin/context/gpuSharedMem.skipif @@ -1,4 +1,4 @@ # This test is specific to device shared memory, # otherwise the behavior is undefined. CHPL_LOCALE_MODEL!=gpu -CHPL_GPU=cpu +CHPL_GPU==cpu