From 17a90fb78977e6ffbcc9dff12df7abdf2df7c2a1 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Sun, 22 Sep 2019 22:52:56 -0400 Subject: [PATCH 01/88] SSSP basic compiling --- include/graphit/backend/backend.h | 2 + .../codegen_gpu/assign_function_context.h | 27 + .../graphit/backend/codegen_gpu/codegen_gpu.h | 121 ++++ include/graphit/frontend/fir.h | 1 + include/graphit/frontend/gpu_schedule.h | 192 +++++ .../graphit/frontend/high_level_schedule.h | 38 + include/graphit/frontend/schedule.h | 6 +- include/graphit/midend/mir.h | 22 + include/graphit/midend/mir_context.h | 7 + src/backend/backend.cpp | 4 + .../codegen_gpu/assign_function_context.cpp | 25 + src/backend/codegen_gpu/codegen_gpu.cpp | 662 ++++++++++++++++++ src/graphitc.py | 1 + src/main.cpp | 17 +- src/midend/apply_expr_lower.cpp | 22 + src/runtime_lib/gpu_intrinsics.h | 47 ++ src/runtime_lib/infra_gpu/graph.h | 39 ++ src/runtime_lib/infra_gpu/load_balance.h | 15 + src/runtime_lib/infra_gpu/vertex_frontier.h | 25 + src/runtime_lib/intrinsics.h | 20 +- src/runtime_lib/timer.h | 19 + test/c++/high_level_schedule_test.cpp | 33 + 22 files changed, 1317 insertions(+), 28 deletions(-) create mode 100644 include/graphit/backend/codegen_gpu/assign_function_context.h create mode 100644 include/graphit/backend/codegen_gpu/codegen_gpu.h create mode 100644 include/graphit/frontend/gpu_schedule.h create mode 100644 src/backend/codegen_gpu/assign_function_context.cpp create mode 100644 src/backend/codegen_gpu/codegen_gpu.cpp create mode 100644 src/runtime_lib/gpu_intrinsics.h create mode 100644 src/runtime_lib/infra_gpu/graph.h create mode 100644 src/runtime_lib/infra_gpu/load_balance.h create mode 100644 src/runtime_lib/infra_gpu/vertex_frontier.h create mode 100644 src/runtime_lib/timer.h diff --git a/include/graphit/backend/backend.h b/include/graphit/backend/backend.h index 2bc2f80c..7e24a76f 100644 --- a/include/graphit/backend/backend.h +++ b/include/graphit/backend/backend.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace graphit { class Backend { @@ -18,6 +19,7 @@ namespace graphit { int emitCPP(std::ostream &oss = std::cout, std::string module_name=""); int emitPython(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); + int emitGPU(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); private: MIRContext* mir_context_; diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h new file mode 100644 index 00000000..6bb42c85 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/assign_function_context.h @@ -0,0 +1,27 @@ +#ifndef ASSIGN_FUNCTION_CONTEXT_H +#define ASSIGN_FUNCTION CONTEXT_H + +#include +#include +#include + +#include +#include + + +namespace graphit { +class AssignFunctionContext : mir::MIRVisitor { + public: + AssignFunctionContext(MIRContext *mir_context) : mir_context_(mir_context) { + } + int assign_function_context(void); + protected: + void visit(mir::PushEdgeSetApplyExpr::Ptr); + void visit(mir::VertexSetApplyExpr::Ptr); + private: + MIRContext *mir_context_; +}; +} + + +#endif diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h new file mode 100644 index 00000000..932548d2 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -0,0 +1,121 @@ + +#ifndef GRAPHIT_CODEGEN_GPU_H +#define GRAPHIT_CODEGEN_GPU_H + +#include +#include +#include +#include +#include 
+#include + +namespace graphit { +class CodeGenGPUKernelEmitter: public mir::MIRVisitor { +public: + CodeGenGPUKernelEmitter(std::ostream &input_oss, MIRContext *mir_context): + oss(input_oss), mir_context_(mir_context), indentLevel(0) { + } + void indent() { ++indentLevel; } + void dedent() { --indentLevel; } + void printIndent() { oss << std::string(indentLevel, '\t'); } + + std::ostream &oss; + unsigned indentLevel; + + MIRContext * mir_context_; + + void visit(mir::PushEdgeSetApplyExpr::Ptr); + //void visit(mir::VertexSetApplyExpr::Ptr); + + +}; +class CodeGenGPU : public mir::MIRVisitor{ +public: + CodeGenGPU(std::ostream &input_oss, MIRContext *mir_context, std::string module_name_, std::string module_path): + oss(input_oss), mir_context_(mir_context), module_name(module_name_) { + indentLevel = 0; + edgeset_apply_func_gen_ = new EdgesetApplyFunctionDeclGenerator(mir_context_, oss); + } + + int genGPU(); + +protected: + + void indent() { ++indentLevel; } + void dedent() { --indentLevel; } + void printIndent() { oss << std::string(indentLevel, '\t'); } + + std::ostream &oss; + std::string module_name; + unsigned indentLevel; + MIRContext * mir_context_; + +private: + void genIncludeStmts(void); + void genEdgeSets(void); + + + void genPropertyArrayImplementationWithInitialization(mir::VarDecl::Ptr shared_ptr); + + + void genPropertyArrayDecl(mir::VarDecl::Ptr); + void genPropertyArrayAlloca(mir::VarDecl::Ptr); + + EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_; + + virtual std::string getBackendFunctionLabel(void) { + return "__device__"; + } + + void generateBinaryExpr(mir::BinaryExpr::Ptr, std::string); + + virtual void visit(mir::EdgeSetType::Ptr) override; + virtual void visit(mir::VertexSetType::Ptr) override; + virtual void visit(mir::ScalarType::Ptr) override; + virtual void visit(mir::FuncDecl::Ptr) override; + virtual void visit(mir::ElementType::Ptr) override; + virtual void visit(mir::ExprStmt::Ptr) override; + virtual void visit(mir::VarExpr::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + + virtual void visit(mir::AddExpr::Ptr) override; + virtual void visit(mir::MulExpr::Ptr) override; + virtual void visit(mir::DivExpr::Ptr) override; + virtual void visit(mir::SubExpr::Ptr) override; + virtual void visit(mir::EqExpr::Ptr) override; + + virtual void visit(mir::TensorArrayReadExpr::Ptr) override; + virtual void visit(mir::IntLiteral::Ptr) override; + virtual void visit(mir::BoolLiteral::Ptr) override; + virtual void visit(mir::StringLiteral::Ptr) override; + + + + virtual void visit(mir::ReduceStmt::Ptr) override; + virtual void visit(mir::VarDecl::Ptr) override; + + virtual void visit(mir::ForStmt::Ptr) override; + virtual void visit(mir::WhileStmt::Ptr) override; + virtual void visit(mir::IfStmt::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; + virtual void visit(mir::Call::Ptr) override; + + virtual void visit(mir::BreakStmt::Ptr) override; + + virtual void visit(mir::VertexSetApplyExpr::Ptr) override; + virtual void visit(mir::VertexSetAllocExpr::Ptr) override; + + +}; +class CodeGenGPUHost: public CodeGenGPU { +public: + using CodeGenGPU::CodeGenGPU; +private: + virtual std::string getBackendFunctionLabel(void) { + return "__host__"; + } + virtual void visit(mir::TensorArrayReadExpr::Ptr); +}; + +} +#endif diff --git a/include/graphit/frontend/fir.h b/include/graphit/frontend/fir.h index abb8879c..6c53094b 100644 --- a/include/graphit/frontend/fir.h +++ b/include/graphit/frontend/fir.h @@ -2,6 +2,7 @@ // Created by 
Yunming Zhang on 1/24/17. // + #ifndef GRAPHIT_FIR_H #define GRAPHIT_FIR_H diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h new file mode 100644 index 00000000..1f2d5092 --- /dev/null +++ b/include/graphit/frontend/gpu_schedule.h @@ -0,0 +1,192 @@ +// +// Created by Ajay Brahmakshatriya +// + +#ifndef GRAPHIT_GPU_SCHEDULE +#define GRAPHIT_GPU_SCHEDULE + +#include + + +namespace graphit { +namespace fir { +namespace gpu_schedule { + + +enum gpu_schedule_options { + PUSH, + PULL, + FUSED, + UNFUSED_BITMAP, + UNFUSED_BOOLMAP, + ENABLED, + DISABLED, + TWC, + TWCE, + WM, + CM, + STRICT, + EDGE_ONLY +}; + +class GPUSchedule { + // Abstract class has no functions for now +}; + +class SimpleGPUSchedule: public GPUSchedule { + +public: + enum class direction_type { + DIR_PUSH, + DIR_PULL + }; + + enum class frontier_creation_type { + FRONTIER_FUSED, + UNFUSED_BITMAP, + UNFUSED_BOOLMAP + }; + + enum class deduplication_type { + DEDUP_ENABLED, + DEDUP_DISABLED + }; + + enum class load_balancing_type { + TWC, + TWCE, + WM, + CM, + STRICT, + EDGE_ONLY + }; + + enum class kernel_fusion_type { + FUSION_ENABLED, + FUSION_DISABLED + }; + +private: + direction_type direction; + frontier_creation_type frontier_creation; + deduplication_type deduplication; + load_balancing_type load_balancing; + kernel_fusion_type kernel_fusion; + +public: + void configDirection(enum gpu_schedule_options o) { + switch(o) { + case PUSH: + direction = direction_type::DIR_PUSH; + break; + case PULL: + direction = direction_type::DIR_PULL; + break; + default: + assert(false && "Invalid option for configDirection"); + break; + } + } + + void configFrontierCreation(enum gpu_schedule_options o) { + switch(o) { + case FUSED: + frontier_creation = frontier_creation_type::FRONTIER_FUSED; + break; + case UNFUSED_BITMAP: + frontier_creation = frontier_creation_type::UNFUSED_BITMAP; + break; + case UNFUSED_BOOLMAP: + frontier_creation = frontier_creation_type::UNFUSED_BOOLMAP; + break; + default: + assert(false && "Invalid option for configFrontierCreation"); + break; + } + } + + void configDeduplication(enum gpu_schedule_options o) { + switch(o) { + case ENABLED: + deduplication = deduplication_type::DEDUP_ENABLED; + break; + case DISABLED: + deduplication = deduplication_type::DEDUP_DISABLED; + break; + default: + assert(false && "Invalid option for configDeduplication"); + break; + } + } + + void configLoadBalance(enum gpu_schedule_options o) { + switch(o) { + case TWC: + load_balancing = load_balancing_type::TWC; + break; + case TWCE: + load_balancing = load_balancing_type::TWCE; + break; + case WM: + load_balancing = load_balancing_type::WM; + break; + case CM: + load_balancing = load_balancing_type::CM; + break; + case STRICT: + load_balancing = load_balancing_type::STRICT; + break; + case EDGE_ONLY: + load_balancing = load_balancing_type::EDGE_ONLY; + break; + default: + assert(false && "Invalid option for configLoadBalance"); + break; + } + } + + void configKernelFusion(enum gpu_schedule_options o) { + switch(o) { + case ENABLED: + kernel_fusion = kernel_fusion_type::FUSION_ENABLED; + break; + case DISABLED: + kernel_fusion = kernel_fusion_type::FUSION_DISABLED; + break; + default: + assert(false && "Invalid option for configKernelFusion"); + break; + } + + } +}; + +class HybridGPUSchedule: public GPUSchedule { +private: + SimpleGPUSchedule s1; + SimpleGPUSchedule s2; + + float threshold; + // TODO: have separate alpha beta +public: + enum hybrid_criteria { + INPUT_VERTEXSET_SIZE + }; 
+private: + hybrid_criteria _hybrid_criteria; +public: + HybridGPUSchedule (hybrid_criteria h, float t, SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { + _hybrid_criteria = h; + threshold = t; + s1 = _s1; + s2 = _s2; + } +}; + + +} +} +} + +#endif + diff --git a/include/graphit/frontend/high_level_schedule.h b/include/graphit/frontend/high_level_schedule.h index 938ebe9e..efc1189a 100644 --- a/include/graphit/frontend/high_level_schedule.h +++ b/include/graphit/frontend/high_level_schedule.h @@ -16,6 +16,8 @@ #include #include +#include + namespace graphit { namespace fir { @@ -52,6 +54,14 @@ namespace graphit { if (schedule_ != nullptr) delete(schedule_); } + enum class backend_selection_type { + CODEGEN_CPU, + CODEGEN_GPU, + + CODEGEN_INVALID + }; + + backend_selection_type backend_selection = backend_selection_type::CODEGEN_CPU; typedef std::shared_ptr Ptr; @@ -198,6 +208,34 @@ namespace graphit { return schedule_; } + + // New GPU Scheduling API + // We currently need two different functions to apply simple and hybrid schedules + // TODO: Abstract the simple and hybrid schedules into a single class + void applyGPUSchedule(std::string label_name, gpu_schedule::SimpleGPUSchedule &s1) { + backend_selection = backend_selection_type::CODEGEN_GPU; + + if (schedule_ == nullptr) + schedule_ = new Schedule(); + + gpu_schedule::SimpleGPUSchedule *s1_copy = new gpu_schedule::SimpleGPUSchedule(s1); + + schedule_->apply_gpu_schedules[label_name] = s1_copy; + + } + void applyGPUSchedule(std::string label_name, gpu_schedule::HybridGPUSchedule &s2) { + backend_selection = backend_selection_type::CODEGEN_GPU; + + if (schedule_ == nullptr) + schedule_ = new Schedule(); + + gpu_schedule::HybridGPUSchedule *s2_copy = new gpu_schedule::HybridGPUSchedule(s2); + *s2_copy = s2; + + schedule_->apply_gpu_schedules[label_name] = s2_copy; + } + + private: graphit::FIRContext * fir_context_; Schedule * schedule_; diff --git a/include/graphit/frontend/schedule.h b/include/graphit/frontend/schedule.h index dcead15b..0d1da973 100644 --- a/include/graphit/frontend/schedule.h +++ b/include/graphit/frontend/schedule.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace graphit { @@ -218,6 +219,9 @@ namespace graphit { ~Schedule() { delete physical_data_layouts; delete apply_schedules; + for (auto s = apply_gpu_schedules.begin(); s != apply_gpu_schedules.end(); s++) { + delete s->second; + } } //TODO: what does it mean?? 
@@ -230,8 +234,8 @@ namespace graphit { std::map vertexset_data_layout; + std::map apply_gpu_schedules; }; } - #endif //GRAPHIT_SCHEDULE_H diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index ac5d1ace..b632acfc 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -592,6 +592,14 @@ namespace graphit { enum class Type { INTERNAL, EXPORTED, EXTERNAL }; + enum function_context_type { + CONTEXT_NONE = 0x0, + CONTEXT_HOST = 0x1, + CONTEXT_DEVICE = 0x2, + CONTEXT_BOTH = 0x3, + }; + + enum function_context_type function_context = function_context_type::CONTEXT_HOST; std::string name; std::vector args; @@ -615,6 +623,17 @@ namespace graphit { virtual MIRNode::Ptr cloneNode(); }; + static inline FuncDecl::function_context_type operator | (FuncDecl::function_context_type a, FuncDecl::function_context_type b) { + return static_cast((int)a | (int)b); + } + static inline FuncDecl::function_context_type operator & (FuncDecl::function_context_type a, FuncDecl::function_context_type b) { + return static_cast((int)a & (int)b); + } + static inline FuncDecl::function_context_type& operator |= (FuncDecl::function_context_type &a, FuncDecl::function_context_type b) { + a = a | b; + return a; + } + struct TensorReadExpr : public Expr { Expr::Ptr index; @@ -823,6 +842,9 @@ namespace graphit { std::string tracking_field = ""; typedef std::shared_ptr Ptr; + std::string device_function; + std::string kernel_function; + protected: virtual void copy(MIRNode::Ptr); diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index f5cdbabb..5971193e 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -168,6 +168,13 @@ namespace graphit { } return false; } + bool isLoweredConstTensor(std::string var_name) { + for (auto tensor: lowered_constants_) { + if (tensor->name == var_name) + return true; + } + return false; + } void addConstVertexSet(mir::VarDecl::Ptr vertexset) { const_vertex_sets_.push_back(vertexset); diff --git a/src/backend/backend.cpp b/src/backend/backend.cpp index 27b7b94b..1dff2ef5 100644 --- a/src/backend/backend.cpp +++ b/src/backend/backend.cpp @@ -17,4 +17,8 @@ namespace graphit{ delete codegen_python; return flag; } + int Backend::emitGPU(std::ostream &oss, std::string module_name, std::string module_path) { + CodeGenGPU code_gen_gpu(oss, mir_context_, module_name, module_path); + return code_gen_gpu.genGPU(); + } } diff --git a/src/backend/codegen_gpu/assign_function_context.cpp b/src/backend/codegen_gpu/assign_function_context.cpp new file mode 100644 index 00000000..ca11d0ae --- /dev/null +++ b/src/backend/codegen_gpu/assign_function_context.cpp @@ -0,0 +1,25 @@ +#include "graphit/backend/codegen_gpu/assign_function_context.h" + + +namespace graphit { +int AssignFunctionContext::assign_function_context(void) { + const std::vector &functions = mir_context_->getFunctionList(); + for (auto it = functions.begin(); it != functions.end(); it++) + it->get()->accept(this); + for (auto stmt: mir_context_->field_vector_init_stmts) + stmt->accept(this); + +} +void AssignFunctionContext::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + if (mir_context_->isFunction(pesae->input_function_name)) + mir_context_->getFunction(pesae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (mir_context_->isFunction(pesae->from_func)) + mir_context_->getFunction(pesae->from_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; 
+ if (mir_context_->isFunction(pesae->to_func)) + mir_context_->getFunction(pesae->to_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +void AssignFunctionContext::visit(mir::VertexSetApplyExpr::Ptr vsae) { + if (mir_context_->isFunction(vsae->input_function_name)) + mir_context_->getFunction(vsae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} +} diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp new file mode 100644 index 00000000..6c806d15 --- /dev/null +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -0,0 +1,662 @@ +// +// Created by Ajay Brahmakshatriya on 9/7/2019 +// + +#include +#include +#include +#include + +namespace graphit { +int CodeGenGPU::genGPU() { + AssignFunctionContext assign_function_context(mir_context_); + assign_function_context.assign_function_context(); + + + CodeGenGPUHost code_gen_gpu_host(oss, mir_context_, module_name, ""); + + genIncludeStmts(); + + // This generates all the declarations of type GraphT<...> + genEdgeSets(); + + // Declare all the vertex properties + // We are only declaring the device versions now. If required we can generate the host versions later + for (auto constant: mir_context_->getLoweredConstants()) { + if ((mir::isa(constant->type))) { + // This is some vertex data + genPropertyArrayDecl(constant); + } else { + assert(false && "Constant type not handled yet in GPU backend\n"); + } + } + + std::vector functions = mir_context_->getFunctionList(); + + // Every operator requires a kernel to be generated + // Create that first because all the actual functions will be calling these kernels + CodeGenGPUKernelEmitter kernel_emitter(oss, mir_context_); + for (auto function: functions) + function->accept(&kernel_emitter); + + for (auto function: functions) { + if (function->function_context & mir::FuncDecl::function_context_type::CONTEXT_DEVICE) + function->accept(this); + if (function->function_context & mir::FuncDecl::function_context_type::CONTEXT_HOST) + function->accept(&code_gen_gpu_host); + } + + oss << std::endl; + return 0; +} +void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { + mir::VectorType::Ptr vector_type = mir::to(constant->type); + vector_type->vector_element_type->accept(this); + oss << " __device__ *" << constant->name << ";" << std::endl; + + // Also generate the host versions of these arrays + vector_type->vector_element_type->accept(this); + oss << " " << "*__host_" << constant->name << ";" << std::endl; +} + +void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { + auto vector_type = mir::to(var_decl->type); + assert(vector_type != nullptr); + + auto size_expr = mir_context_->getElementCount(vector_type->element_type); + assert(size_expr != nullptr); + + printIndent(); + oss << "{" << std::endl; + indent(); + printIndent(); + vector_type->vector_element_type->accept(this); + oss << " __tmp;" << std::endl; + + printIndent(); + oss << "cudaMalloc((void**)&__tmp, "; + size_expr->accept(this); + oss << " * sizeof("; + vector_type->vector_element_type->accept(this); + oss << "));" << std::endl; + + printIndent(); + oss << "cudaMemcpyToSymbol(\""; + oss << var_decl->name; + oss << "\", &__tmp, sizeof(void*));" << std::endl; + + printIndent(); + oss << "__host_" << var_decl->name << " = new "; + vector_type->vector_element_type->accept(this); + oss << "["; + size_expr->accept(this); + oss << "];" << std::endl; + + dedent(); + printIndent(); + oss << "}" << std::endl; + 
+} +// Disabling this for now because we are handling all vertex operations in library +/* +void CodeGenGPUKernelEmitter::visit(mir::VertexSetApplyExpr::Ptr vsae) { + // First we generate the __device__ function. This is separate from the __global__ for kernel fusion + std::string vertex_apply_func = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + auto mir_var = mir::to (vsae->target); + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "void __device__ " << vertex_apply_func << " (int32_t num_vertices) {" << std::endl; + indent(); + printIndent(); + oss << "for (int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; vid < "; + oss << "num_vertices"; + oss << "; vid += gridDim.x * blockDim.x) {" << std::endl; + indent(); + printIndent(); + oss << vsae->input_function_name << "(vid);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + oss << "void __device__ " << vertex_apply_func << " (VertexFrontier frontier) {" << std::endl; + indent(); + printIndent(); + oss << "for (int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < "; + oss << "frontier.d_num_elems_input[0]"; + oss << "; vidx += gridDim.x * blockDim.x) {" << std::endl; + indent(); + printIndent(); + oss << vsae->input_function_name << "(frontier.d_sparse_queue_input[vidx]);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } + + // Now generate the __global__ kernels to actually call the function + std::string vertex_apply_func_kernel = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "void __global__ " << vertex_apply_func_kernel << " (int32_t num_vertices) {" << std::endl; + indent(); + printIndent(); + oss << vertex_apply_func << "(num_vertices);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + oss << "void __global__ " << vertex_apply_func_kernel << " (VertexFrontier frontier) {" << std::endl; + indent(); + printIndent(); + oss << vertex_apply_func << "(frontier);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } + vsae->device_function = vertex_apply_func; + vsae->kernel_function = vertex_apply_func_kernel; +} +*/ + +void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { + + // First we generate the function that is passed to the load balancing function + + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator code" << std::endl; + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; + indent(); + printIndent(); + oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + + std::string kernel_function_name = "gpu_operator_kernel_" + 
mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __global__ " << kernel_function_name << " (gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + oss << load_balance_function << "> ("; + oss << "graph, input_frontier, output_frontier);" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + + apply_expr->kernel_function = kernel_function_name; + apply_expr->device_function = load_balancing_arg; + +} +void CodeGenGPU::genIncludeStmts(void) { + oss << "#include \"gpu_intrinsics.h\"" << std::endl; + +} + +void CodeGenGPU::genEdgeSets(void) { + for (auto edgeset: mir_context_->getEdgeSets()) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " " << edgeset->name << ";" << std::endl; + } +} + +void CodeGenGPU::visit(mir::EdgeSetType::Ptr edgeset_type) { + if (edgeset_type->weight_type != nullptr) { + oss << "gpu_runtime::GraphT<"; + edgeset_type->weight_type->accept(this); + oss << ">"; + } else { + oss << "gpu_runtime::GraphT"; + } +} + +void CodeGenGPU::visit(mir::VertexSetType::Ptr vertexset_type) { + oss << "gpu_runtime::VertexFrontier"; +} +void CodeGenGPU::visit(mir::ScalarType::Ptr scalar_type) { + switch(scalar_type->type) { + case mir::ScalarType::Type::INT: + oss << "int32_t"; + break; + case mir::ScalarType::Type::UINT: + oss << "uint32_t"; + break; + case mir::ScalarType::Type::FLOAT: + oss << "float"; + break; + case mir::ScalarType::Type::DOUBLE: + oss << "double"; + break; + case mir::ScalarType::Type::BOOL: + oss << "bool"; + break; + case mir::ScalarType::Type::COMPLEX: + assert(false && "Complex type not yet supported with the GPU backend\n"); + break; + case mir::ScalarType::Type::STRING: + assert(false && "String type not yet supported with the GPU backend\n"); + break; + default: + assert(false && "Invalid type enum for scalar type\n"); + break; + } +} + +void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { + if (func_decl->type == mir::FuncDecl::Type::EXTERNAL) { + assert(false && "GPU backend currently doesn't support external functions\n"); + } else { + // First generate the signature of the function + if (func_decl->name == "main") { + oss << "int " << getBackendFunctionLabel() << " main(int argc, char* argv[])"; + } else { + if (func_decl->result.isInitialized()) { + func_decl->result.getType()->accept(this); + } else { + oss << "void"; + } + oss << " " << getBackendFunctionLabel() << " " << func_decl->name << "("; + bool printDelimeter = false; + for (auto arg: func_decl->args) { + if (printDelimeter) + oss << ", "; + arg.getType()->accept(this); + oss << " " << arg.getName(); + printDelimeter = true; + } + oss << ")"; + } + oss << " {" << std::endl; + indent(); + + if (func_decl->name == "main") { + for (auto stmt: mir_context_->edgeset_alloc_stmts) { + mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); + mir::EdgeSetLoadExpr::Ptr edge_set_load_expr = mir::to(assign_stmt->expr); + mir::VarExpr::Ptr lhs_var = mir::to(assign_stmt->lhs); + std::string var_name = lhs_var->var.getName(); + + printIndent(); + oss << "gpu_runtime::load_graph("; + oss << var_name << ", "; + edge_set_load_expr->file_name->accept(this); + oss << ", false);" << std::endl; + + } + for (auto constant: mir_context_->getLoweredConstants()) { + if (mir::isa(constant->type)) { + if (constant->needs_allocation) + 
genPropertyArrayAlloca(constant); + } else { + if (constant->initVal != nullptr) { + printIndent(); + oss << constant->name << " = "; + constant->initVal->accept(this); + oss << ";" << std::endl; + } + } + } + for (auto stmt: mir_context_->field_vector_init_stmts) { + stmt->accept(this); + } + } + if (func_decl->body && func_decl->body->stmts) { + if (func_decl->result.isInitialized()) { + printIndent(); + func_decl->result.getType()->accept(this); + oss << " " << func_decl->result.getName() << ";" << std::endl; + } + func_decl->body->accept(this); + if (func_decl->result.isInitialized()) { + printIndent(); + oss << "return " << func_decl->result.getName() << ";" << std::endl; + } + } + + dedent(); + printIndent(); + oss << "}" << std::endl; + } +} +void CodeGenGPU::visit(mir::ElementType::Ptr element_type) { + oss << "int32_t"; +} +void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { + printIndent(); + expr_stmt->expr->accept(this); + oss << ";" << std::endl; +} +void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { + oss << var_expr->var.getName(); +} +void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; +} + +void CodeGenGPU::generateBinaryExpr(mir::BinaryExpr::Ptr expr, std::string token) { + oss << "("; + expr->lhs->accept(this); + oss << " " << token << " "; + expr->rhs->accept(this); + oss << ")"; +} +void CodeGenGPU::visit(mir::AddExpr::Ptr expr) { + generateBinaryExpr(expr, "+"); +} +void CodeGenGPU::visit(mir::MulExpr::Ptr expr) { + generateBinaryExpr(expr, "*"); +} +void CodeGenGPU::visit(mir::DivExpr::Ptr expr) { + generateBinaryExpr(expr, "/"); +} +void CodeGenGPU::visit(mir::SubExpr::Ptr expr) { + generateBinaryExpr(expr, "-"); +} + + +void CodeGenGPU::visit(mir::TensorArrayReadExpr::Ptr expr) { + expr->target->accept(this); + oss << "["; + expr->index->accept(this); + oss << "]"; +} +void CodeGenGPUHost::visit(mir::TensorArrayReadExpr::Ptr expr) { + mir::VarExpr::Ptr var_expr = mir::to(expr->target); + if (mir_context_->isLoweredConstTensor(var_expr->var.getName())) + oss << "__host_"; + expr->target->accept(this); + oss << "["; + expr->index->accept(this); + oss << "]"; +} + +void CodeGenGPU::visit(mir::IntLiteral::Ptr expr) { + oss << expr->val; +} +void CodeGenGPU::visit(mir::StringLiteral::Ptr expr) { + oss << "\""; + for (auto ch : expr->val) + if (iscntrl(ch) || ch == '\\' || ch == '\"' || ch == '\'') + oss << "\\0" << std::oct << (int)(ch); + else + oss << ch; + oss << "\""; +} +void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { + switch (reduce_stmt->reduce_op_) { + case mir::ReduceStmt::ReductionOp::SUM: + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " += "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + break; + case mir::ReduceStmt::ReductionOp::MIN: + printIndent(); + oss << "if (("; + reduce_stmt->lhs->accept(this); + oss << ") > ("; + reduce_stmt->expr->accept(this); + oss << ")) {" << std::endl; + indent(); + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " = "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + break; + case 
mir::ReduceStmt::ReductionOp::MAX: + printIndent(); + oss << "if (("; + reduce_stmt->lhs->accept(this); + oss << ") < ("; + reduce_stmt->expr->accept(this); + oss << ")) {" << std::endl; + indent(); + printIndent(); + reduce_stmt->lhs->accept(this); + oss << " = "; + reduce_stmt->expr->accept(this); + oss << ";" << std::endl; + + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + break; + case mir::ReduceStmt::ReductionOp::ATOMIC_MIN: + printIndent(); + if (reduce_stmt->tracking_var_name_ != "") + oss << reduce_stmt->tracking_var_name_ << " = "; + oss << "gpu_runtime::writeMin(&"; + reduce_stmt->lhs->accept(this); + oss << ", "; + reduce_stmt->expr->accept(this); + oss << ");" << std::endl; + break; + case mir::ReduceStmt::ReductionOp::ATOMIC_SUM: + if (reduce_stmt->tracking_var_name_ != "") { + printIndent(); + oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; + } + printIndent(); + oss << "writeAdd(&"; + reduce_stmt->lhs->accept(this); + oss << ", "; + reduce_stmt->expr->accept(this); + oss << ");" << std::endl; + break; + } +} +void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { + + printIndent(); + var_decl->type->accept(this); + + oss << " " << var_decl->name; + + if (var_decl->initVal != nullptr) { + // Special case if RHS is a EdgeSetApplyExpr + if (mir::isa(var_decl->initVal)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(var_decl->initVal); + if (esae->from_func == "") { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + // We will assume that the output frontier can reuse the input frontier. + // TOOD: Add liveness analysis for this + oss << " = " << esae->from_func; + oss << ";" << std::endl; + printIndent(); + + oss << "{" << std::endl; + indent(); + printIndent(); + oss << "int32_t num_cta, cta_size;" << std::endl; + printIndent(); + oss << "gpu_runtime::vertex_based_load_balance_info("; + oss << esae->from_func; + oss << ", num_cta, cta_size);" << std::endl; + printIndent(); + oss << esae->kernel_function << "<<>>" << "("; + esae->target->accept(this); + oss << ", " << esae->from_func << ", " << var_decl->name << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + oss << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + } + } else + oss << ";" << std::endl; + + +} +void CodeGenGPU::visit(mir::BoolLiteral::Ptr bool_literal) { + oss << bool_literal->val?"true":"false"; +} +void CodeGenGPU::visit(mir::ForStmt::Ptr for_stmt) { + printIndent(); + oss << "for (int32_t " << for_stmt->loopVar << " = "; + for_stmt->domain->lower->accept(this); + oss << "; " << for_stmt->loopVar << " < "; + for_stmt->domain->upper->accept(this); + oss << "; " << for_stmt->loopVar << "++) {" << std::endl; + indent(); + for_stmt->body->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; +} +void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { + printIndent(); + oss << "while ("; + while_stmt->cond->accept(this); + oss << ") {" << std::endl; + indent(); + while_stmt->body->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; +} +void CodeGenGPU::visit(mir::IfStmt::Ptr if_stmt) { + printIndent(); + oss << "if ("; + if_stmt->cond->accept(this); + oss << ") {" << std::endl; + indent(); + if_stmt->ifBody->accept(this); + dedent(); + printIndent(); + oss << "}"; + if (if_stmt->elseBody != 
nullptr) { + oss << " else {" << std::endl; + indent(); + if_stmt->elseBody->accept(this); + dedent(); + printIndent(); + oss << "}"; + } + oss << std::endl; +} +void CodeGenGPU::visit(mir::PrintStmt::Ptr print_stmt) { + printIndent(); + oss << "std::cout << "; + print_stmt->expr->accept(this); + oss << ";" << std::endl; +} +void CodeGenGPU::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") + oss << "gpu_runtime::" << call_expr->name << "("; + else + oss << call_expr->name << "("; + + bool printDelimeter = false; + for (auto arg: call_expr->args) { + if (printDelimeter) + oss << ", "; + arg->accept(this); + printDelimeter = true; + } + oss << ")"; +} +void CodeGenGPU::visit(mir::EqExpr::Ptr eq_expr) { + oss << "("; + eq_expr->operands[0]->accept(this); + oss << ")"; + + for (unsigned i = 0; i < eq_expr->ops.size(); ++i) { + switch(eq_expr->ops[i]) { + case mir::EqExpr::Op::LT: + oss << " < "; + break; + case mir::EqExpr::Op::LE: + oss << " <= "; + break; + case mir::EqExpr::Op::GT: + oss << " > "; + break; + case mir::EqExpr::Op::GE: + oss << " >= "; + break; + case mir::EqExpr::Op::EQ: + oss << " == "; + break; + case mir::EqExpr::Op::NE: + oss << " != "; + break; + default: + assert(false && "Invalid operator for EqExpr\n"); + + } + oss << "("; + eq_expr->operands[i+1]->accept(this); + oss << ")"; + } +} +void CodeGenGPU::visit(mir::BreakStmt::Ptr break_stmt) { + printIndent(); + oss << "break;" << std::endl; +} +void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { + oss << "gpu_runtime::vertex_set_apply_kernel"; + oss << "<" << vsae->input_function_name << ">"; + oss << "<<>>"; + auto mir_var = mir::to (vsae->target); + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(mir_var->var.getName()); + assert(associated_element_type != nullptr); + auto associated_element_type_size = mir_context_->getElementCount(associated_element_type); + assert(associated_element_type_size != nullptr); + oss << "("; + associated_element_type_size->accept(this); + oss << ")"; + } else { + oss << "("; + oss << mir_var->var.getName(); + oss << ")"; + } +} +void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { + mir::Expr::Ptr size_expr = mir_context_->getElementCount(vsae->element_type); + oss << "gpu_runtime::create_new_vertex_set("; + size_expr->accept(this); + oss << ")" << std::endl; +} +} diff --git a/src/graphitc.py b/src/graphitc.py index d2530427..468f277c 100644 --- a/src/graphitc.py +++ b/src/graphitc.py @@ -75,6 +75,7 @@ def parseArgs(): compile_file.write("#include \n") compile_file.write("namespace graphit {\n") + compile_file.write("using namespace graphit::fir::gpu_schedule;\n"); compile_file.write("void user_defined_schedule (graphit::fir::high_level_schedule::ProgramScheduleNode::Ptr program) {\n") for schedule_cmd in schedule_cmd_list: compile_file.write(schedule_cmd) diff --git a/src/main.cpp b/src/main.cpp index 5b9f7fc5..47938b72 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -59,19 +59,12 @@ int main(int argc, char* argv[]) { std::string python_module_path = cli.python_module_path(); - be->emitCPP(output_file, python_module_name); + if (program->backend_selection == fir::high_level_schedule::ProgramScheduleNode::backend_selection_type::CODEGEN_GPU) + be->emitGPU(output_file, python_module_name); + else + be->emitCPP(output_file, python_module_name); output_file.close(); -/* - if (python_module_name 
!= "") { - if (python_module_path == "") - python_module_path = "/tmp"; - std::ofstream python_output_file; - python_output_file.open(python_module_path + "/" + python_module_name + ".py"); - be->emitPython(python_output_file, python_module_name, python_module_path) ; - python_output_file.close(); - - } -*/ + delete be; return 0; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index f31d708c..e84c810c 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -50,6 +50,28 @@ namespace graphit { edgeset_apply->is_weighted = true; } + + + // First check if the program has a GPU Schedule, if yes, the defaults are different + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + // Always parallelize all operators for GPU schedules + edgeset_apply->is_parallel = true; + // Check if there is a GPU schedule attached to this statement - + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + + + } else { + // No schedule is attached, lower using default schedule + + node = std::make_shared(edgeset_apply); + + + } + } + // check if the schedule contains entry for the current edgeset apply expressions if (schedule_ != nullptr && schedule_->apply_schedules != nullptr) { diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h new file mode 100644 index 00000000..b22e4103 --- /dev/null +++ b/src/runtime_lib/gpu_intrinsics.h @@ -0,0 +1,47 @@ +#ifndef GPU_INTRINSICS_H +#define GPU_INTRINSICS_H + +#include +#include + +#include "infra_gpu/graph.h" +#include "infra_gpu/vertex_frontier.h" +#include "infra_gpu/load_balance.h" +#include "timer.h" + +namespace gpu_runtime { +template +T __device__ writeMin(T *dst, T src) { + return atomicMin(dst, src); +} +template +static int32_t builtin_getVertices(GraphT &graph) { + return graph.num_vertices; +} + +static VertexFrontier create_new_vertex_set(int32_t num_vertices) { + return VertexFrontier(); +} + +static void builtin_addVertex(VertexFrontier &frontier, int32_t vid) { + +} + +template +static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { + +} + +static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { + return 0; +} + +template +void deleteObject(T &t) { + +} + +void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id) { +} +} +#endif diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h new file mode 100644 index 00000000..86c2495f --- /dev/null +++ b/src/runtime_lib/infra_gpu/graph.h @@ -0,0 +1,39 @@ +#ifndef GPU_GRAPH_H +#define GPU_GRAPH_H + +// GraphT data structure + +namespace gpu_runtime { + +template +struct GraphT { // Field names are according to CSR, reuse for CSC + int32_t num_vertices; + int32_t num_edges; + + // Host pointers + int32_t *h_src_offsets; // num_vertices + 1; + int32_t *h_edge_src; // num_edges; + int32_t *h_edge_dst; // num_edges; + EdgeWeightType *h_edge_weight; // num_edges; + + // Device pointers + int32_t *d_src_offsets; // num_vertices + 1; + int32_t *d_edge_src; // num_edges; + int32_t *d_edge_dst; // num_edges; + EdgeWeightType *d_edge_weight; // num_edges; + + int32_t h_get_degree(int32_t vertex_id) { + return h_src_offsets[vertex_id + 1] - h_src_offsets[vertex_id]; + } + int32_t d_get_degree(int32_t 
vertex_id) { + return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; + } +}; + + +template +void load_graph(GraphT &graph, std::string filename, bool to_sort = false); + + +} +#endif diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h new file mode 100644 index 00000000..bc34668e --- /dev/null +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -0,0 +1,15 @@ +#ifndef GRAPHIT_GPU_LOAD_BALANCE_H +#define GRAPHIT_GPU_LOAD_BALANCE_H + +namespace gpu_runtime { +template , int32_t, int32_t, int32_t, VertexFrontier)> +void __device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { +} + +void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { +} + + +} + +#endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h new file mode 100644 index 00000000..b76b4940 --- /dev/null +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -0,0 +1,25 @@ +#ifndef GPU_VERTEX_FRONTIER_H +#define GPU_VERTEX_FRONTIER_H + +namespace gpu_runtime { +struct VertexFrontier { + int32_t max_num_elems; + + int32_t *d_num_elems_input; + int32_t *d_num_elems_output; + + int32_t * d_sparse_queue_input; + int32_t * d_sparse_queue_output; + + unsigned char* d_byte_map_input; + unsigned char* d_byte_map_output; + + unsigned char* d_bit_map_input; + unsigned char* d_bit_map_output; + + // Extend this to check the current representation +}; +} + +#endif + diff --git a/src/runtime_lib/intrinsics.h b/src/runtime_lib/intrinsics.h index a71c3cd5..d4b1d7ae 100644 --- a/src/runtime_lib/intrinsics.h +++ b/src/runtime_lib/intrinsics.h @@ -321,21 +321,7 @@ template T static builtin_pop (std::vector* vec){ // return (float)(usec.time_since_epoch().count())/1000; //} -static struct timeval start_time_; -static struct timeval elapsed_time_; - -static void startTimer(){ - gettimeofday(&start_time_, NULL); -} - -static float stopTimer(){ - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; - -} - +#include "timer.h" static char* argv_safe(int index, char** argv, int argc ){ // if index is less than or equal to argc than return argv[index] @@ -506,4 +492,8 @@ void updateBucketWithGraphItVertexSubset(VertexSubset* vset, julienne::P } +#include "infra_gpu/graph.h" + + + #endif //GRAPHIT_INTRINSICS_H_H diff --git a/src/runtime_lib/timer.h b/src/runtime_lib/timer.h new file mode 100644 index 00000000..1eed0c79 --- /dev/null +++ b/src/runtime_lib/timer.h @@ -0,0 +1,19 @@ +#ifndef GRAPHIT_TIMER_H +#define GRAPHIT_TIMER_H +#include + +static struct timeval start_time_; +static struct timeval elapsed_time_; + +static void startTimer(){ + gettimeofday(&start_time_, NULL); +} + +static float stopTimer(){ + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; + +} +#endif diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index 8dfb05c0..1ebe3b38 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -2276,4 +2276,37 @@ fir::high_level_schedule::ProgramScheduleNode::Ptr program EXPECT_EQ (0, basicTestWithSchedule(program)); } +TEST_F(HighLevelScheduleTest, GPUScheduleBasicSimpleGPUScheduleTest) { + 
istringstream is (bfs_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + + // Now apply the GPU Schedule + fir::gpu_schedule::SimpleGPUSchedule s1; + s1.configDeduplication(fir::gpu_schedule::DISABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + + program->applyGPUSchedule("s1", s1); +} + +TEST_F(HighLevelScheduleTest, GPUScheduleBasicHybridGPUScheduleTest) { + istringstream is (bfs_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + // Now apply the GPU Schedule + fir::gpu_schedule::SimpleGPUSchedule s1; + fir::gpu_schedule::SimpleGPUSchedule s2; + s1.configDeduplication(fir::gpu_schedule::DISABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + + s2 = s1; + s2.configDirection(fir::gpu_schedule::PULL); + + fir::gpu_schedule::HybridGPUSchedule h1 (fir::gpu_schedule::HybridGPUSchedule::INPUT_VERTEXSET_SIZE, 0.2, s1, s2); + + + program->applyGPUSchedule("s1", h1); +} From d46022ba15182754203d997c8b63adcc255ba813 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 23 Sep 2019 14:15:59 -0400 Subject: [PATCH 02/88] SSSP compiled and running correctly. Correctness verified --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 12 ++- .../codegen_gpu/extract_read_write_set.h | 30 +++++++ src/backend/codegen_gpu/codegen_gpu.cpp | 67 +++++++++++--- .../codegen_gpu/extract_read_write_set.cpp | 36 ++++++++ src/runtime_lib/gpu_intrinsics.h | 29 ++----- src/runtime_lib/{timer.h => graphit_timer.h} | 0 src/runtime_lib/infra_gapbs/graph.h | 2 + src/runtime_lib/infra_gpu/graph.h | 87 ++++++++++++++++++- src/runtime_lib/infra_gpu/load_balance.h | 26 ++++++ src/runtime_lib/infra_gpu/support.h | 13 +++ src/runtime_lib/infra_gpu/vertex_frontier.h | 86 ++++++++++++++++++ src/runtime_lib/intrinsics.h | 3 +- 12 files changed, 346 insertions(+), 45 deletions(-) create mode 100644 include/graphit/backend/codegen_gpu/extract_read_write_set.h create mode 100644 src/backend/codegen_gpu/extract_read_write_set.cpp rename src/runtime_lib/{timer.h => graphit_timer.h} (100%) create mode 100644 src/runtime_lib/infra_gpu/support.h diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 932548d2..f650f626 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -68,7 +68,7 @@ class CodeGenGPU : public mir::MIRVisitor{ } void generateBinaryExpr(mir::BinaryExpr::Ptr, std::string); - +protected: virtual void visit(mir::EdgeSetType::Ptr) override; virtual void visit(mir::VertexSetType::Ptr) override; virtual void visit(mir::ScalarType::Ptr) override; @@ -99,22 +99,26 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::IfStmt::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; virtual void visit(mir::Call::Ptr) override; - + virtual void visit(mir::BreakStmt::Ptr) override; - + virtual void visit(mir::VertexSetApplyExpr::Ptr) override; virtual void visit(mir::VertexSetAllocExpr::Ptr) override; - + }; class CodeGenGPUHost: public CodeGenGPU { public: using CodeGenGPU::CodeGenGPU; + using CodeGenGPU::visit; private: virtual std::string getBackendFunctionLabel(void) { return "__host__"; } virtual void visit(mir::TensorArrayReadExpr::Ptr); + virtual void visit(mir::StmtBlock::Ptr); + void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); + void 
generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare); }; } diff --git a/include/graphit/backend/codegen_gpu/extract_read_write_set.h b/include/graphit/backend/codegen_gpu/extract_read_write_set.h new file mode 100644 index 00000000..13ba54a1 --- /dev/null +++ b/include/graphit/backend/codegen_gpu/extract_read_write_set.h @@ -0,0 +1,30 @@ +#ifndef EXTRACT_READ_WRITE_H +#define EXTRACT_READ_WRITE_H + +#include +#include +#include +namespace graphit { +class ExtractReadWriteSet: public mir::MIRVisitor { +public: + ExtractReadWriteSet(MIRContext *mir_context_): read_set(read_set_), write_set(write_set_), mir_context(mir_context_) { + } + const std::vector &read_set; + const std::vector &write_set; + +protected: + virtual void visit(mir::TensorArrayReadExpr::Ptr); + virtual void visit(mir::AssignStmt::Ptr); + virtual void visit(mir::StmtBlock::Ptr); + MIRContext *mir_context; + +private: + void add_read(mir::TensorArrayReadExpr::Ptr); + void add_write(mir::TensorArrayReadExpr::Ptr); + + std::vector read_set_; + std::vector write_set_; +}; +} + +#endif diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 6c806d15..53395de8 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -4,6 +4,7 @@ #include #include +#include "graphit/backend/codegen_gpu/extract_read_write_set.h" #include #include @@ -57,6 +58,9 @@ void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { // Also generate the host versions of these arrays vector_type->vector_element_type->accept(this); oss << " " << "*__host_" << constant->name << ";" << std::endl; + // Also generate the device pointer for easy copy + vector_type->vector_element_type->accept(this); + oss << " " << "*__device_" << constant->name << ";" << std::endl; } void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { @@ -66,24 +70,20 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { auto size_expr = mir_context_->getElementCount(vector_type->element_type); assert(size_expr != nullptr); - printIndent(); - oss << "{" << std::endl; - indent(); - printIndent(); - vector_type->vector_element_type->accept(this); - oss << " __tmp;" << std::endl; printIndent(); - oss << "cudaMalloc((void**)&__tmp, "; + oss << "cudaMalloc(&__device_" << var_decl->name << ", "; size_expr->accept(this); - oss << " * sizeof("; + oss << " * sizeof("; vector_type->vector_element_type->accept(this); oss << "));" << std::endl; printIndent(); - oss << "cudaMemcpyToSymbol(\""; + oss << "cudaMemcpyToSymbol("; oss << var_decl->name; - oss << "\", &__tmp, sizeof(void*));" << std::endl; + oss << ", &__device_" << var_decl->name << ", sizeof("; + vector_type->vector_element_type->accept(this); + oss << "*), 0);" << std::endl; printIndent(); oss << "__host_" << var_decl->name << " = new "; @@ -92,9 +92,6 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { size_expr->accept(this); oss << "];" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; } // Disabling this for now because we are handling all vertex operations in library @@ -515,6 +512,10 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << esae->kernel_function << "<<>>" << "("; esae->target->accept(this); oss << ", " << esae->from_func << ", " << var_decl->name << ");" << std::endl; + printIndent(); + oss << "cudaDeviceSynchronize();" << std::endl; + printIndent(); + oss << "gpu_runtime::swap_queues(" << var_decl->name << ");" << std::endl; dedent(); 
printIndent(); oss << "}" << std::endl; @@ -579,7 +580,7 @@ void CodeGenGPU::visit(mir::PrintStmt::Ptr print_stmt) { printIndent(); oss << "std::cout << "; print_stmt->expr->accept(this); - oss << ";" << std::endl; + oss << " << std::endl;" << std::endl; } void CodeGenGPU::visit(mir::Call::Ptr call_expr) { if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") @@ -659,4 +660,42 @@ void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { size_expr->accept(this); oss << ")" << std::endl; } +void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { + printIndent(); + mir::Var target = mir::to(tare->target)->var; + std::string var_name = target.getName(); + oss << "cudaMemcpy(__host_" << var_name << " + "; + tare->index->accept(this); + oss << ", __device_" << var_name << " + "; + tare->index->accept(this); + oss << ", sizeof("; + mir::to(target.getType())->element_type->accept(this); + oss << "), cudaMemcpyDeviceToHost);" << std::endl; + +} +void CodeGenGPUHost::generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare) { + printIndent(); + mir::Var target = mir::to(tare->target)->var; + std::string var_name = target.getName(); + oss << "cudaMemcpy(__device_" << var_name << " + "; + tare->index->accept(this); + oss << ", __host_" << var_name << " + "; + tare->index->accept(this); + oss << ", sizeof("; + mir::to(target.getType())->element_type->accept(this); + oss << "), cudaMemcpyHostToDevice);" << std::endl; +} +void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { + for (auto stmt: *(stmt_block->stmts)) { + ExtractReadWriteSet extractor(mir_context_); + stmt->accept(&extractor); + for (auto tare: extractor.read_set) { + generateDeviceToHostCopy(tare); + } + stmt->accept(this); + for (auto tare: extractor.write_set) { + generateHostToDeviceCopy(tare); + } + } +} } diff --git a/src/backend/codegen_gpu/extract_read_write_set.cpp b/src/backend/codegen_gpu/extract_read_write_set.cpp new file mode 100644 index 00000000..976e561e --- /dev/null +++ b/src/backend/codegen_gpu/extract_read_write_set.cpp @@ -0,0 +1,36 @@ +#include "graphit/backend/codegen_gpu/extract_read_write_set.h" + +namespace graphit { +void ExtractReadWriteSet::visit(mir::StmtBlock::Ptr stmt_block) { + return; +} +void ExtractReadWriteSet::visit(mir::TensorArrayReadExpr::Ptr tare) { + mir::Var target = mir::to(tare->target)->var; + if (mir_context->isLoweredConstTensor(target.getName())) { + add_read(tare); + } + tare->index->accept(this); +} +void ExtractReadWriteSet::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->lhs)) { + mir::TensorArrayReadExpr::Ptr tare = mir::to(assign_stmt->lhs); + mir::Var target = mir::to(tare->target)->var; + if (mir_context->isLoweredConstTensor(target.getName())) { + add_write(tare); + tare->index->accept(this); + assign_stmt->expr->accept(this); + } + tare->index->accept(this); + + } else { + assign_stmt->lhs->accept(this); + assign_stmt->expr->accept(this); + } +} +void ExtractReadWriteSet::add_read(mir::TensorArrayReadExpr::Ptr tare) { + read_set_.push_back(tare); +} +void ExtractReadWriteSet::add_write(mir::TensorArrayReadExpr::Ptr tare) { + write_set_.push_back(tare); +} +} diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index b22e4103..304dd2cb 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -7,41 +7,26 @@ #include "infra_gpu/graph.h" #include "infra_gpu/vertex_frontier.h" #include "infra_gpu/load_balance.h" 
-#include "timer.h" +#include "graphit_timer.h" namespace gpu_runtime { template -T __device__ writeMin(T *dst, T src) { - return atomicMin(dst, src); +static bool __device__ writeMin(T *dst, T src) { + T old_value = atomicMin(dst, src); + bool ret = (old_value > src); + return ret; } template static int32_t builtin_getVertices(GraphT &graph) { return graph.num_vertices; } -static VertexFrontier create_new_vertex_set(int32_t num_vertices) { - return VertexFrontier(); -} - -static void builtin_addVertex(VertexFrontier &frontier, int32_t vid) { - -} - -template -static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { - -} - -static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { - return 0; -} template -void deleteObject(T &t) { +static void deleteObject(T &t) { + // Currently deleteObject is empty } -void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id) { -} } #endif diff --git a/src/runtime_lib/timer.h b/src/runtime_lib/graphit_timer.h similarity index 100% rename from src/runtime_lib/timer.h rename to src/runtime_lib/graphit_timer.h diff --git a/src/runtime_lib/infra_gapbs/graph.h b/src/runtime_lib/infra_gapbs/graph.h index 29112b5f..4b0c71d4 100644 --- a/src/runtime_lib/infra_gapbs/graph.h +++ b/src/runtime_lib/infra_gapbs/graph.h @@ -132,8 +132,10 @@ class CSRGraph { public: +#ifndef IGNORE_JULIENNE_TYPES julienne::graph julienne_graph = __julienne_null_graph; //julienne::EdgeMap *em; +#endif CSRGraph() : directed_(false), num_nodes_(-1), num_edges_(-1), out_index_(nullptr), out_neighbors_(nullptr), in_index_(nullptr), in_neighbors_(nullptr), flags_(nullptr), is_transpose_(false) {} diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 86c2495f..5b2b7a8c 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -1,8 +1,12 @@ #ifndef GPU_GRAPH_H #define GPU_GRAPH_H -// GraphT data structure +#include +#include "infra_gpu/support.h" +// GraphT data structure +#define IGNORE_JULIENNE_TYPES +#include "infra_gapbs/benchmark.h" namespace gpu_runtime { template @@ -29,10 +33,87 @@ struct GraphT { // Field names are according to CSR, reuse for CSC return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; } }; +void consume(int32_t _) { +} +#define CONSUME consume +template +void static sort_with_degree(GraphT &graph) { + assert(false && "Sort with degree not yet implemented\n"); + return; +} +template +static void load_graph(GraphT &graph, std::string filename, bool to_sort = false) { + int flen = strlen(filename.c_str()); + const char* bin_extension = to_sort?".graphit.sbin":".graphit.bin"; + char bin_filename[100]; + strcpy(bin_filename, filename.c_str()); + strcat(bin_filename, bin_extension); + + FILE *bin_file = fopen(bin_filename, "rb"); + if (bin_file) { + CONSUME(fread(&graph.num_vertices, sizeof(int32_t), 1, bin_file)); + CONSUME(fread(&graph.num_edges, sizeof(int32_t), 1, bin_file)); + + graph.h_edge_src = new int32_t[graph.num_edges]; + graph.h_edge_dst = new int32_t[graph.num_edges]; + graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + graph.h_src_offsets = new int32_t[graph.num_vertices + 1]; + + CONSUME(fread(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_edge_weight, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, 
bin_file)); + fclose(bin_file); + } else { + CLBase cli (filename); + WeightedBuilder builder (cli); + WGraph g = builder.MakeGraph(); + graph.num_vertices = g.num_nodes(); + graph.num_edges = g.num_edges(); -template -void load_graph(GraphT &graph, std::string filename, bool to_sort = false); + graph.h_edge_src = new int32_t[graph.num_edges]; + graph.h_edge_dst = new int32_t[graph.num_edges]; + graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + graph.h_src_offsets = new int32_t[graph.num_vertices + 1]; + + int32_t tmp = 0; + graph.h_src_offsets[0] = tmp; + for (int32_t i = 0; i < g.num_nodes(); i++) { + for (auto j: g.out_neigh(i)) { + graph.h_edge_src[tmp] = i; + graph.h_edge_dst[tmp] = j.v; + graph.h_edge_weight[tmp] = j.w; + tmp++; + } + graph.h_src_offsets[i+1] = tmp; + } + if (to_sort) + sort_with_degree(graph); + FILE *bin_file = fopen(bin_filename, "wb"); + CONSUME(fwrite(&graph.num_vertices, sizeof(int32_t), 1, bin_file)); + CONSUME(fwrite(&graph.num_edges, sizeof(int32_t), 1, bin_file)); + CONSUME(fwrite(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_edge_weight, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); + fclose(bin_file); + } + cudaMalloc(&graph.d_edge_src, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&graph.d_edge_dst, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&graph.d_edge_weight, sizeof(EdgeWeightType) * graph.num_edges); + cudaMalloc(&graph.d_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1)); + + + cudaMemcpy(graph.d_edge_src, graph.h_edge_src, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_edge_dst, graph.h_edge_dst, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_edge_weight, graph.h_edge_weight, sizeof(EdgeWeightType) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(graph.d_src_offsets, graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); + std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; + +} } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index bc34668e..3a8831ba 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -1,12 +1,38 @@ #ifndef GRAPHIT_GPU_LOAD_BALANCE_H #define GRAPHIT_GPU_LOAD_BALANCE_H +#include "infra_gpu/graph.h" +#include "infra_gpu/vertex_frontier.h" + namespace gpu_runtime { + +template +static void __device__ vertex_set_apply(int32_t num_vertices) { + for(int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; vid < num_vertices; vid+= blockDim.x * gridDim.x) { + body(vid); + } +} +template +static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { + vertex_set_apply(num_vertices); +} + template , int32_t, int32_t, int32_t, VertexFrontier)> void __device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; + if (vid >= input_frontier.d_num_elems_input[0]) + return; + int32_t src = input_frontier.d_sparse_queue_input[vid]; + for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, output_frontier); + } } void __host__ 
vertex_based_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = builtin_getVertexSetSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h new file mode 100644 index 00000000..b757621e --- /dev/null +++ b/src/runtime_lib/infra_gpu/support.h @@ -0,0 +1,13 @@ +#ifndef GRAPHIT_GPU_SUPPORT_H +#define GRAPHIT_GPU_SUPPORT_H +namespace gpu_runtime { +void cudaCheckLastError(void) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("Error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} +} + +#endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index b76b4940..2d742515 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -1,6 +1,7 @@ #ifndef GPU_VERTEX_FRONTIER_H #define GPU_VERTEX_FRONTIER_H +#include "infra_gpu/support.h" namespace gpu_runtime { struct VertexFrontier { int32_t max_num_elems; @@ -17,8 +18,93 @@ struct VertexFrontier { unsigned char* d_bit_map_input; unsigned char* d_bit_map_output; + int32_t *d_dedup_counters; + int32_t curr_dedup_counter; + // Extend this to check the current representation }; +static VertexFrontier create_new_vertex_set(int32_t num_vertices) { + VertexFrontier frontier; + cudaMalloc(&frontier.d_num_elems_input, sizeof(int32_t)); + cudaMalloc(&frontier.d_num_elems_output, sizeof(int32_t)); + cudaMemset(frontier.d_num_elems_input, 0, sizeof(int32_t)); + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + + cudaMalloc(&frontier.d_sparse_queue_input, sizeof(int32_t) * num_vertices * 6); + cudaMalloc(&frontier.d_sparse_queue_output, sizeof(int32_t) * num_vertices * 6); + + cudaMalloc(&frontier.d_byte_map_input, sizeof(unsigned char) * num_vertices); + cudaMalloc(&frontier.d_byte_map_output, sizeof(unsigned char) * num_vertices); + + cudaMemset(frontier.d_byte_map_input, 0, sizeof(unsigned char) * num_vertices); + cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * num_vertices); + + int32_t num_byte_for_bitmap = (num_vertices + 7)/8; + cudaMalloc(&frontier.d_bit_map_input, sizeof(unsigned char) * num_byte_for_bitmap); + cudaMalloc(&frontier.d_bit_map_output, sizeof(unsigned char) * num_byte_for_bitmap); + + cudaMemset(frontier.d_bit_map_input, 0, sizeof(unsigned char) * num_byte_for_bitmap); + cudaMemset(frontier.d_bit_map_output, 0, sizeof(unsigned char) * num_byte_for_bitmap); + + frontier.max_num_elems = num_vertices; + + frontier.curr_dedup_counter = 0; + cudaMalloc(&frontier.d_dedup_counters, sizeof(int32_t) * num_vertices); + cudaMemset(frontier.d_dedup_counters, 0, sizeof(int32_t) * num_vertices); + + return frontier; +} + +static void builtin_addVertex(VertexFrontier &frontier, int32_t vid) { + int32_t curr_size; + cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(frontier.d_sparse_queue_input + curr_size, &vid, sizeof(int32_t), cudaMemcpyHostToDevice); + curr_size++; + + cudaMemcpy(frontier.d_num_elems_input, &curr_size, sizeof(int32_t), cudaMemcpyHostToDevice); +} +static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id) { + // Simple enqueuVertex implementation + // Each thread adds on it's own + // TODO: Optimize with warp reduce + + int32_t pos = atomicAdd(sparse_queue_size, 1); + 
sparse_queue[pos] = vertex_id; +} +static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { + int32_t curr_size = 0; + cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); + return curr_size; + +} +static void swap_queues(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); +} +static void __device__ dedup_frontier_device(VertexFrontier &frontier) { + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { + int32_t vid = frontier.d_sparse_queue_input[vidx]; + if (frontier.d_dedup_counters[vid] < frontier.curr_dedup_counter) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, vid); + frontier.d_dedup_counters[vid] = frontier.curr_dedup_counter; + } + } +} +static void __global__ dedup_frontier_kernel(VertexFrontier frontier) { + dedup_frontier_device(frontier); +} +static void dedup_frontier(VertexFrontier &frontier) { + frontier.curr_dedup_counter++; + dedup_frontier_kernel<<>>(frontier); + swap_queues(frontier); +} } #endif diff --git a/src/runtime_lib/intrinsics.h b/src/runtime_lib/intrinsics.h index d4b1d7ae..48584c69 100644 --- a/src/runtime_lib/intrinsics.h +++ b/src/runtime_lib/intrinsics.h @@ -321,7 +321,7 @@ template T static builtin_pop (std::vector* vec){ // return (float)(usec.time_since_epoch().count())/1000; //} -#include "timer.h" +#include "graphit_timer.h" static char* argv_safe(int index, char** argv, int argc ){ // if index is less than or equal to argc than return argv[index] @@ -492,7 +492,6 @@ void updateBucketWithGraphItVertexSubset(VertexSubset* vset, julienne::P } -#include "infra_gpu/graph.h" From 530b472538934d6c162bba800e764adce8299271 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 25 Sep 2019 00:54:00 -0400 Subject: [PATCH 03/88] Added support for deduplicaiton, unfused_boolmap frontier creation and TWCE --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 2 + include/graphit/frontend/gpu_schedule.h | 4 + include/graphit/midend/apply_expr_lower.h | 6 +- include/graphit/midend/mir.h | 19 ++++ include/graphit/midend/mir_rewriter.h | 3 + include/graphit/midend/mir_visitor.h | 30 +++-- src/backend/codegen_gpu/codegen_gpu.cpp | 51 ++++++++- src/midend/apply_expr_lower.cpp | 53 +++++++-- src/midend/mir.cpp | 10 ++ src/midend/mir_rewriter.cpp | 5 + src/midend/mir_visitor.cpp | 5 +- src/runtime_lib/gpu_intrinsics.h | 2 + src/runtime_lib/infra_gpu/graph.h | 2 +- src/runtime_lib/infra_gpu/load_balance.h | 103 ++++++++++++++++++ src/runtime_lib/infra_gpu/support.h | 15 +++ src/runtime_lib/infra_gpu/vertex_frontier.h | 60 +++++++++- 16 files changed, 339 insertions(+), 31 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index f650f626..63d3eec3 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -105,6 +105,8 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::VertexSetApplyExpr::Ptr) override; virtual void visit(mir::VertexSetAllocExpr::Ptr) override; + virtual void visit(mir::VertexSetDedupExpr::Ptr) 
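To make the frontier API introduced above concrete, a small host-side usage sketch follows; num_vertices, source_vertex, and the elided kernel launch are illustrative placeholders, and the runtime headers are assumed to be on the include path:

#include "gpu_intrinsics.h"

void example_frontier_driver(int32_t num_vertices, int32_t source_vertex) {
	gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(num_vertices);
	gpu_runtime::builtin_addVertex(frontier, source_vertex);   // seed the frontier with one vertex
	// ... launch an edgeset-apply kernel that enqueues discovered vertices
	//     into the frontier's output queue ...
	gpu_runtime::swap_queues(frontier);                        // output queue becomes the next input
	gpu_runtime::dedup_frontier(frontier);                     // drop duplicate vertex ids
	int32_t next_size = gpu_runtime::builtin_getVertexSetSize(frontier);
	(void) next_size;
}
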
override; + }; class CodeGenGPUHost: public CodeGenGPU { diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index 1f2d5092..1600cca5 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -31,6 +31,9 @@ enum gpu_schedule_options { class GPUSchedule { // Abstract class has no functions for now +public: + // Virtual destructor to make the class polymorphic + virtual ~GPUSchedule() = default; }; class SimpleGPUSchedule: public GPUSchedule { @@ -67,6 +70,7 @@ class SimpleGPUSchedule: public GPUSchedule { }; private: +public: direction_type direction; frontier_creation_type frontier_creation; deduplication_type deduplication; diff --git a/include/graphit/midend/apply_expr_lower.h b/include/graphit/midend/apply_expr_lower.h index 9c000421..56a77dc7 100644 --- a/include/graphit/midend/apply_expr_lower.h +++ b/include/graphit/midend/apply_expr_lower.h @@ -36,10 +36,14 @@ namespace graphit { //Lowers edgeset apply expressions virtual void visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply_expr); virtual void visit(mir::VertexSetApplyExpr::Ptr vertexset_apply_expr); - + + virtual void visit(mir::StmtBlock::Ptr stmt_block); + virtual void visit(mir::VarDecl::Ptr var_decl); + virtual void visit(mir::AssignStmt::Ptr assign_stmt); Schedule * schedule_; MIRContext* mir_context_; + mir::Stmt::Ptr insert_after_stmt = nullptr; }; private: diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index b632acfc..89be5e7f 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -15,6 +15,8 @@ #include #include #include +#include + namespace graphit { namespace mir { @@ -844,6 +846,8 @@ namespace graphit { std::string device_function; std::string kernel_function; + + fir::gpu_schedule::SimpleGPUSchedule applied_schedule; protected: virtual void copy(MIRNode::Ptr); @@ -934,6 +938,8 @@ namespace graphit { is_weighted = edgeset_apply->is_weighted; is_parallel = edgeset_apply->is_parallel; enable_deduplication = edgeset_apply->enable_deduplication; + + applied_schedule = edgeset_apply->applied_schedule; } virtual void accept(MIRVisitor *visitor) { @@ -960,6 +966,7 @@ namespace graphit { is_weighted = edgeset_apply->is_weighted; is_parallel = edgeset_apply->is_parallel; enable_deduplication = edgeset_apply->enable_deduplication; + applied_schedule = edgeset_apply->applied_schedule; } virtual void accept(MIRVisitor *visitor) { @@ -1515,6 +1522,18 @@ namespace graphit { }; + + // GPU Specific operators + struct VertexSetDedupExpr: Expr { + Expr::Ptr target; + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; } } diff --git a/include/graphit/midend/mir_rewriter.h b/include/graphit/midend/mir_rewriter.h index 58d23230..5f2c05c9 100644 --- a/include/graphit/midend/mir_rewriter.h +++ b/include/graphit/midend/mir_rewriter.h @@ -153,6 +153,9 @@ namespace graphit { virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); + + // GPU Additions + virtual void visit(std::shared_ptr); template std::shared_ptr rewrite(std::shared_ptr ptr) { diff --git a/include/graphit/midend/mir_visitor.h b/include/graphit/midend/mir_visitor.h index a06797f4..284e7fa2 100644 --- a/include/graphit/midend/mir_visitor.h +++ b/include/graphit/midend/mir_visitor.h @@ -103,15 +103,18 @@ namespace graphit { struct UpdatePriorityEdgeSetApplyExpr; struct 
UpdatePriorityExternVertexSetApplyExpr; struct UpdatePriorityUpdateBucketsCall; - struct UpdatePriorityExternCall; + struct UpdatePriorityExternCall; - struct OrderedProcessingOperator; - - struct PriorityUpdateOperator; - struct PriorityUpdateOperatorMin; - struct PriorityUpdateOperatorSum; - struct UpdatePriorityEdgeCountEdgeSetApplyExpr; + struct OrderedProcessingOperator; + struct PriorityUpdateOperator; + struct PriorityUpdateOperatorMin; + struct PriorityUpdateOperatorSum; + struct UpdatePriorityEdgeCountEdgeSetApplyExpr; + + // GPU Additions + struct VertexSetDedupExpr; + struct MIRVisitor { virtual void visit(Var*); @@ -248,17 +251,20 @@ namespace graphit { virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); - virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); - + + // GPU Additions + virtual void visit(std::shared_ptr); + + protected: - protected: std::shared_ptr node; LabelScope label_scope_; std::shared_ptr enclosing_func_decl_ = nullptr; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 53395de8..0ae65da5 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -163,6 +163,10 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } oss << "template " << std::endl; oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; @@ -175,7 +179,12 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; indent(); printIndent(); - oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; @@ -189,9 +198,9 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { oss << "void __global__ " << kernel_function_name << " 
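As a concrete instance of what the kernel emitter above produces for a push edgeset apply with a fused (sparse) output frontier, a sketch of the generated device operator body is given below. The user function updateEdge, the SP distance array, and the generated name gpu_operator_body_2 are illustrative assumptions; only the overall shape follows the emitter.

// Illustrative SSSP-style user function: returns true only when it actually
// lowered the destination's distance (SP is assumed to be a device array).
__device__ int32_t *SP;
bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) {
	return gpu_runtime::writeMin(&SP[dst], SP[src] + weight);
}

// Approximate shape of the emitted operator body for a push apply whose
// schedule requests a fused sparse output frontier.
template <typename EdgeWeightType>
void __device__ gpu_operator_body_2(gpu_runtime::GraphT<EdgeWeightType> graph,
		int32_t src, int32_t dst, int32_t edge_id,
		gpu_runtime::VertexFrontier output_frontier) {
	// Body of the actual operator code
	EdgeWeightType weight = graph.d_edge_weight[edge_id];
	if (updateEdge(src, dst, weight)) {
		gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output,
				output_frontier.d_num_elems_output, dst);
	}
}
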
(gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; indent(); printIndent(); - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; oss << load_balance_function << "> ("; oss << "graph, input_frontier, output_frontier);" << std::endl; + dedent(); printIndent(); oss << "}" << std::endl; @@ -502,10 +511,22 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << "{" << std::endl; indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } + + if (mir::isa(esae)) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << esae->from_func; + oss << ");" << std::endl; + } + printIndent(); oss << "int32_t num_cta, cta_size;" << std::endl; printIndent(); - oss << "gpu_runtime::vertex_based_load_balance_info("; + oss << load_balance_function << "_info("; oss << esae->from_func; oss << ", num_cta, cta_size);" << std::endl; printIndent(); @@ -514,8 +535,23 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << ", " << esae->from_func << ", " << var_decl->name << ");" << std::endl; printIndent(); oss << "cudaDeviceSynchronize();" << std::endl; - printIndent(); - oss << "gpu_runtime::swap_queues(" << var_decl->name << ");" << std::endl; + if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + printIndent(); + oss << "gpu_runtime::swap_queues(" << var_decl->name << ");" << std::endl; + printIndent(); + oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + printIndent(); + oss << "gpu_runtime::swap_bitmaps(" << var_decl->name << ");" << std::endl; + printIndent(); + oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + printIndent(); + oss << "gpu_runtime::swap_bytemaps(" << var_decl->name << ");" << std::endl; + printIndent(); + oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; + } dedent(); printIndent(); oss << "}" << std::endl; @@ -528,6 +564,11 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << ";" << std::endl; +} +void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { + oss << "gpu_runtime::dedup_frontier("; + vsde->target->accept(this); + oss << ")"; } void CodeGenGPU::visit(mir::BoolLiteral::Ptr bool_literal) { oss << bool_literal->val?"true":"false"; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index e84c810c..19706b0c 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -35,7 +35,42 @@ namespace graphit { node = vertexset_apply; } - + void ApplyExprLower::LowerApplyExpr::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + for (auto stmt: *(stmt_block->stmts)) { + new_stmts.push_back(rewrite(stmt)); + if (insert_after_stmt != nullptr) + new_stmts.push_back(insert_after_stmt); + insert_after_stmt = nullptr; + } + * (stmt_block->stmts) = new_stmts; + node = stmt_block; + } + void 
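For orientation, the host code that the VarDecl visitor above generates for one such apply, with the TWCE load balancer selected and deduplication enabled in the schedule, is expected to look roughly like the sketch below; the names edges, frontier, output, the kernel name, and the initialization of output are illustrative assumptions rather than output copied from the compiler:

	{
		gpu_runtime::vertex_set_prepare_sparse(frontier);
		int32_t num_cta, cta_size;
		gpu_runtime::TWCE_load_balance_info(frontier, num_cta, cta_size);
		gpu_operator_kernel_3<<<num_cta, cta_size>>>(edges, frontier, output);
		cudaDeviceSynchronize();
		gpu_runtime::swap_queues(output);
		output.format_ready = gpu_runtime::VertexFrontier::SPARSE;
	}
	// the lowering pass appends the deduplication step right after the declaration
	gpu_runtime::dedup_frontier(output);
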
ApplyExprLower::LowerApplyExpr::visit(mir::VarDecl::Ptr var_decl) { + MIRRewriter::visit(var_decl); + var_decl = mir::to(node); + if (mir::isa (var_decl->initVal)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(var_decl->initVal); + + if (edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); + mir::ExprStmt::Ptr expr_stmt = std::make_shared(); + mir::Var var(var_decl->name, var_decl->type); + mir::VarExpr::Ptr var_expr = std::make_shared(); + var_expr->var = var; + dedup_expr->target = var_expr; + + expr_stmt->expr = dedup_expr; + insert_after_stmt = expr_stmt; + } + } + node = var_decl; + } + void ApplyExprLower::LowerApplyExpr::visit(mir::AssignStmt::Ptr assign_stmt) { + MIRRewriter::visit(assign_stmt); + assign_stmt = mir::to(node); + node = assign_stmt; + } void ApplyExprLower::LowerApplyExpr::visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply) { // use the target var expressionto figure out the edgeset type @@ -61,15 +96,17 @@ namespace graphit { auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { auto apply_schedule = apply_schedule_iter->second; - - - } else { - // No schedule is attached, lower using default schedule - + if (dynamic_cast(apply_schedule) != nullptr) { + edgeset_apply->applied_schedule = *dynamic_cast(apply_schedule); + } + // First we create a stmt block to return node = std::make_shared(edgeset_apply); - - + + } else { + // No schedule is attached, lower using default schedule + node = std::make_shared(edgeset_apply); } + return; } // check if the schedule contains entry for the current edgeset apply expressions diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 42d943bd..80fc848a 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -947,5 +947,15 @@ namespace graphit { return node; } + void VertexSetDedupExpr::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + target = op->target; + } + MIRNode::Ptr VertexSetDedupExpr::cloneNode() { + const auto node = std::make_shared(); + node->copy(shared_from_this()); + return node; + } + } } diff --git a/src/midend/mir_rewriter.cpp b/src/midend/mir_rewriter.cpp index aeaa09cd..122516e9 100644 --- a/src/midend/mir_rewriter.cpp +++ b/src/midend/mir_rewriter.cpp @@ -410,6 +410,11 @@ namespace graphit { ptr->target = rewrite(ptr->target); node = ptr; } + + void MIRRewriter::visit(VertexSetDedupExpr::Ptr ptr) { + ptr->target = rewrite(ptr->target); + node = ptr; + } } } diff --git a/src/midend/mir_visitor.cpp b/src/midend/mir_visitor.cpp index bf949429..67850442 100644 --- a/src/midend/mir_visitor.cpp +++ b/src/midend/mir_visitor.cpp @@ -372,6 +372,9 @@ namespace graphit { void MIRVisitor::visit(std::shared_ptr op) { visit(std::static_pointer_cast(op)); } - + + void MIRVisitor::visit(std::shared_ptr op) { + op->target->accept(this); + } } } diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 304dd2cb..cd907d2b 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -12,6 +12,8 @@ namespace gpu_runtime { template static bool __device__ writeMin(T *dst, T src) { + if (*dst <= src) + return false; T old_value = atomicMin(dst, src); bool ret = (old_value > src); return ret; diff 
--git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 5b2b7a8c..e7886eae 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -29,7 +29,7 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t h_get_degree(int32_t vertex_id) { return h_src_offsets[vertex_id + 1] - h_src_offsets[vertex_id]; } - int32_t d_get_degree(int32_t vertex_id) { + int32_t __device__ d_get_degree(int32_t vertex_id) { return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; } }; diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 3a8831ba..85748666 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -34,7 +34,110 @@ void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t & num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } +#define STAGE_1_SIZE (8) +#define WARP_SIZE (32) +template , int32_t, int32_t, int32_t, VertexFrontier)> +static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; + + int32_t lane_id = thread_id % 32; + + __shared__ int32_t stage2_queue[CTA_SIZE]; + __shared__ int32_t stage3_queue[CTA_SIZE]; + __shared__ int32_t stage_queue_sizes[3]; + + if (threadIdx.x == 0) { + stage_queue_sizes[0] = 0; + stage_queue_sizes[1] = 0; + stage_queue_sizes[2] = 0; + } + __syncthreads(); + __shared__ int32_t stage2_offset[CTA_SIZE]; + __shared__ int32_t stage3_offset[CTA_SIZE]; + __shared__ int32_t stage2_size[CTA_SIZE]; + __shared__ int32_t stage3_size[CTA_SIZE]; + + int32_t total_vertices = input_frontier.d_num_elems_input[0]; + int32_t local_vertex_idx = thread_id / (STAGE_1_SIZE); + int32_t degree; + int32_t s1_offset; + int32_t local_vertex; + int32_t src_offset; + if (local_vertex_idx < total_vertices) { + local_vertex = input_frontier.d_sparse_queue_input[local_vertex_idx]; + // Step 1 seggrefate vertices into shared buffers + if (threadIdx.x % (STAGE_1_SIZE) == 0) { + degree = graph.d_get_degree(local_vertex); + src_offset = graph.d_src_offsets[local_vertex]; + int32_t s3_size = degree/CTA_SIZE; + degree = degree - s3_size * CTA_SIZE; + if (s3_size > 0) { + int32_t pos = atomicAggInc(&stage_queue_sizes[2]); + stage3_queue[pos] = local_vertex; + stage3_size[pos] = s3_size * CTA_SIZE; + stage3_offset[pos] = src_offset; + } + int32_t s2_size = degree/WARP_SIZE; + degree = degree - WARP_SIZE * s2_size; + if (s2_size > 0) { + int32_t pos = atomicAggInc(&stage_queue_sizes[1]); + stage2_queue[pos] = local_vertex; + stage2_offset[pos] = s3_size * CTA_SIZE + src_offset; + stage2_size[pos] = s2_size * WARP_SIZE; + } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + src_offset; + } + } else + local_vertex = -1; + __syncthreads(); + degree = __shfl_sync((uint32_t)-1, degree, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + local_vertex = __shfl_sync((uint32_t)-1, local_vertex, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + + if (local_vertex_idx < total_vertices) { + // STAGE 1 + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += STAGE_1_SIZE) { + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + } + + } + __syncwarp(); + // STAGE 2 -- stage 
2 is dynamically balanced + while(1) { + int32_t to_process; + if (lane_id == 0) { + to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; + } + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + if (to_process < 0) + break; + local_vertex = stage2_queue[to_process]; + degree = stage2_size[to_process]; + int32_t s2_offset = stage2_offset[to_process]; + for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < degree + s2_offset; neigh_id += WARP_SIZE) { + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + } + + } + // STAGE 3 -- all threads have to do all, no need for LB + for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid++) { + local_vertex = stage3_queue[wid]; + degree = stage3_size[wid]; + int32_t s3_offset = stage3_offset[wid]; + for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < degree + s3_offset; neigh_id += CTA_SIZE) { + int32_t dst = graph.d_edge_dst[neigh_id]; + load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + } + } +} +void __host__ TWCE_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = builtin_getVertexSetSize(frontier) * STAGE_1_SIZE; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index b757621e..5a286635 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -8,6 +8,21 @@ void cudaCheckLastError(void) { exit(-1); } } +__device__ inline int32_t warp_bcast(int32_t v, int32_t leader) { + return __shfl_sync((uint32_t)-1, v, leader); +} +__device__ inline int32_t atomicAggInc(int32_t *ctr) { + int32_t lane_id = threadIdx.x % 32; + + int mask = __activemask(); + int leader = __ffs(mask) - 1; + int res; + if(lane_id == leader) + res = atomicAdd(ctr, __popc(mask)); + res = warp_bcast(res, leader); + + return (res + __popc(mask & ((1 << lane_id) - 1))); +} } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 2d742515..c4cea536 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -22,6 +22,13 @@ struct VertexFrontier { int32_t curr_dedup_counter; // Extend this to check the current representation + enum format_ready_type { + SPARSE, + BITMAP, + BYTEMAP + }; + + format_ready_type format_ready; }; static VertexFrontier create_new_vertex_set(int32_t num_vertices) { VertexFrontier frontier; @@ -52,6 +59,10 @@ static VertexFrontier create_new_vertex_set(int32_t num_vertices) { cudaMalloc(&frontier.d_dedup_counters, sizeof(int32_t) * num_vertices); cudaMemset(frontier.d_dedup_counters, 0, sizeof(int32_t) * num_vertices); + frontier.format_ready = VertexFrontier::SPARSE; + + cudaCheckLastError(); + return frontier; } @@ -68,14 +79,20 @@ static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t * // Each thread adds on it's own // TODO: Optimize with warp reduce - int32_t pos = atomicAdd(sparse_queue_size, 1); + //int32_t pos = atomicAdd(sparse_queue_size, 1); + int32_t pos = atomicAggInc(sparse_queue_size); sparse_queue[pos] = vertex_id; + +} +static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *byte_map_size, int32_t vertex_id) { + // We are not using atomic operation here because races are benign here + byte_map[vertex_id] = 1; + atomicAggInc(byte_map_size); } static int32_t 
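To make the three-way split performed by TWCE_load_balance above concrete, here is a worked example, assuming CTA_SIZE is 512 (the actual value is defined elsewhere in the runtime) together with the WARP_SIZE and STAGE_1_SIZE constants defined above:

// degree = 1234 for some source vertex
//   stage 3: (1234 / 512) * 512 = 1024 edges  -> processed by the whole CTA
//   stage 2: ((1234 - 1024) / 32) * 32 = 192  -> processed one warp at a time
//   stage 1: 1234 - 1024 - 192 = 18 edges     -> processed by the vertex's own
//                                                8-thread (STAGE_1_SIZE) group
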
builtin_getVertexSetSize(VertexFrontier &frontier) { int32_t curr_size = 0; cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); - return curr_size; - + return curr_size; } static void swap_queues(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; @@ -88,6 +105,19 @@ static void swap_queues(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); } +static void swap_bytemaps(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); +} static void __device__ dedup_frontier_device(VertexFrontier &frontier) { for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { int32_t vid = frontier.d_sparse_queue_input[vidx]; @@ -105,6 +135,30 @@ static void dedup_frontier(VertexFrontier &frontier) { dedup_frontier_kernel<<>>(frontier); swap_queues(frontier); } + +static void __global__ prepare_sparse_from_bytemap(VertexFrontier frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if (frontier.d_byte_map_input[node_id] == 1) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } + } +} +static void __global__ prepare_sparse_from_bitmap(VertexFrontier &frontier) { +} + +static void vertex_set_prepare_sparse(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) + return; + else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + prepare_sparse_from_bytemap<<>>(frontier); + swap_queues(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + prepare_sparse_from_bitmap<<>>(frontier); + swap_queues(frontier); + return; + } +} } #endif From 71692adaa00cf344cc8a2b486575ed0c58842038 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 25 Sep 2019 18:54:27 -0400 Subject: [PATCH 04/88] Pull direction working with just VertexBased load balance --- .../codegen_gpu/assign_function_context.h | 1 + .../graphit/backend/codegen_gpu/codegen_gpu.h | 3 +- include/graphit/frontend/gpu_schedule.h | 23 ++- .../codegen_gpu/assign_function_context.cpp | 8 + src/backend/codegen_gpu/codegen_gpu.cpp | 150 +++++++++--------- src/midend/apply_expr_lower.cpp | 8 +- src/runtime_lib/gpu_intrinsics.h | 14 +- src/runtime_lib/infra_gpu/graph.h | 4 + src/runtime_lib/infra_gpu/load_balance.h | 12 +- src/runtime_lib/infra_gpu/support.h | 8 + src/runtime_lib/infra_gpu/vertex_frontier.h | 40 ++++- 11 files changed, 168 insertions(+), 103 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h index 6bb42c85..1a014df8 100644 --- a/include/graphit/backend/codegen_gpu/assign_function_context.h +++ b/include/graphit/backend/codegen_gpu/assign_function_context.h @@ -17,6 +17,7 @@ class AssignFunctionContext : mir::MIRVisitor { int assign_function_context(void); protected: void visit(mir::PushEdgeSetApplyExpr::Ptr); + void 
visit(mir::PullEdgeSetApplyExpr::Ptr); void visit(mir::VertexSetApplyExpr::Ptr); private: MIRContext *mir_context_; diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 63d3eec3..b7314a9c 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -25,8 +25,9 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { MIRContext * mir_context_; void visit(mir::PushEdgeSetApplyExpr::Ptr); - //void visit(mir::VertexSetApplyExpr::Ptr); + void visit(mir::PullEdgeSetApplyExpr::Ptr); + void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); }; class CodeGenGPU : public mir::MIRVisitor{ diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index 1600cca5..a736b241 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -26,7 +26,8 @@ enum gpu_schedule_options { WM, CM, STRICT, - EDGE_ONLY + EDGE_ONLY, + VERTEX_BASED }; class GPUSchedule { @@ -51,11 +52,12 @@ class SimpleGPUSchedule: public GPUSchedule { }; enum class deduplication_type { - DEDUP_ENABLED, - DEDUP_DISABLED + DEDUP_DISABLED, + DEDUP_ENABLED }; enum class load_balancing_type { + VERTEX_BASED, TWC, TWCE, WM, @@ -65,8 +67,8 @@ class SimpleGPUSchedule: public GPUSchedule { }; enum class kernel_fusion_type { - FUSION_ENABLED, - FUSION_DISABLED + FUSION_DISABLED, + FUSION_ENABLED }; private: @@ -76,6 +78,14 @@ class SimpleGPUSchedule: public GPUSchedule { deduplication_type deduplication; load_balancing_type load_balancing; kernel_fusion_type kernel_fusion; + + SimpleGPUSchedule () { + direction = direction_type::DIR_PUSH; + frontier_creation = frontier_creation_type::FRONTIER_FUSED; + deduplication = deduplication_type::DEDUP_DISABLED; + load_balancing = load_balancing_type::VERTEX_BASED; + kernel_fusion = kernel_fusion_type::FUSION_DISABLED; + } public: void configDirection(enum gpu_schedule_options o) { @@ -125,6 +135,9 @@ class SimpleGPUSchedule: public GPUSchedule { void configLoadBalance(enum gpu_schedule_options o) { switch(o) { + case VERTEX_BASED: + load_balancing = load_balancing_type::VERTEX_BASED; + break; case TWC: load_balancing = load_balancing_type::TWC; break; diff --git a/src/backend/codegen_gpu/assign_function_context.cpp b/src/backend/codegen_gpu/assign_function_context.cpp index ca11d0ae..5dc8e83e 100644 --- a/src/backend/codegen_gpu/assign_function_context.cpp +++ b/src/backend/codegen_gpu/assign_function_context.cpp @@ -18,6 +18,14 @@ void AssignFunctionContext::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { if (mir_context_->isFunction(pesae->to_func)) mir_context_->getFunction(pesae->to_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; } +void AssignFunctionContext::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + if (mir_context_->isFunction(pesae->input_function_name)) + mir_context_->getFunction(pesae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (mir_context_->isFunction(pesae->from_func)) + mir_context_->getFunction(pesae->from_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (mir_context_->isFunction(pesae->to_func)) + mir_context_->getFunction(pesae->to_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} void AssignFunctionContext::visit(mir::VertexSetApplyExpr::Ptr vsae) { if (mir_context_->isFunction(vsae->input_function_name)) 
mir_context_->getFunction(vsae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 0ae65da5..a3268a26 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -94,82 +94,34 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } -// Disabling this for now because we are handling all vertex operations in library -/* -void CodeGenGPUKernelEmitter::visit(mir::VertexSetApplyExpr::Ptr vsae) { - // First we generate the __device__ function. This is separate from the __global__ for kernel fusion - std::string vertex_apply_func = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); - auto mir_var = mir::to (vsae->target); - if (mir_context_->isConstVertexSet(mir_var->var.getName())) { - oss << "void __device__ " << vertex_apply_func << " (int32_t num_vertices) {" << std::endl; - indent(); - printIndent(); - oss << "for (int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; vid < "; - oss << "num_vertices"; - oss << "; vid += gridDim.x * blockDim.x) {" << std::endl; - indent(); - printIndent(); - oss << vsae->input_function_name << "(vid);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - } else { - oss << "void __device__ " << vertex_apply_func << " (VertexFrontier frontier) {" << std::endl; - indent(); - printIndent(); - oss << "for (int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < "; - oss << "frontier.d_num_elems_input[0]"; - oss << "; vidx += gridDim.x * blockDim.x) {" << std::endl; - indent(); - printIndent(); - oss << vsae->input_function_name << "(frontier.d_sparse_queue_input[vidx]);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; + +void CodeGenGPUKernelEmitter::genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr apply_expr) { + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; } + std::string kernel_function_name = "gpu_operator_kernel_" + mir_context_->getUniqueNameCounterString(); - // Now generate the __global__ kernels to actually call the function - std::string vertex_apply_func_kernel = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); - if (mir_context_->isConstVertexSet(mir_var->var.getName())) { - oss << "void __global__ " << vertex_apply_func_kernel << " (int32_t num_vertices) {" << std::endl; - indent(); - printIndent(); - oss << vertex_apply_func << "(num_vertices);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - } else { - oss << "void __global__ " << vertex_apply_func_kernel << " (VertexFrontier frontier) {" << std::endl; - indent(); - printIndent(); - oss << vertex_apply_func << "(frontier);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - } - vsae->device_function = vertex_apply_func; - vsae->kernel_function = vertex_apply_func_kernel; + oss << "template " << std::endl; + oss << "void __global__ " << kernel_function_name << " (gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << load_balance_function << 
"device_function << "> ("; + oss << "graph, input_frontier, output_frontier);" << std::endl; + + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->kernel_function = kernel_function_name; } -*/ - void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; - if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { - load_balance_function = "gpu_runtime::TWCE_load_balance"; - } oss << "template " << std::endl; - oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; indent(); printIndent(); oss << "// Body of the actual operator code" << std::endl; @@ -190,25 +142,57 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { oss << "}" << std::endl; dedent(); printIndent(); - oss << "}" << std::endl; - - std::string kernel_function_name = "gpu_operator_kernel_" + mir_context_->getUniqueNameCounterString(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + genEdgeSetGlobalKernel(apply_expr); + +} +void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } + oss << "template " << std::endl; - oss << "void __global__ " << kernel_function_name << " (gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; indent(); printIndent(); - oss << load_balance_function << "> ("; - oss << "graph, input_frontier, output_frontier);" << std::endl; + oss << "// Body of the actual operator" << std::endl; + // Before we generate the call to the UDF, we have to check if the dst is on the input frontier + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + // Order is reversed here because PULL direction + oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; + indent(); + printIndent(); + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << 
"gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, src);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, src);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, src);" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; - - apply_expr->kernel_function = kernel_function_name; + dedent(); + printIndent(); + oss << "}" << std::endl; apply_expr->device_function = load_balancing_arg; - + genEdgeSetGlobalKernel(apply_expr); } + void CodeGenGPU::genIncludeStmts(void) { oss << "#include \"gpu_intrinsics.h\"" << std::endl; @@ -521,7 +505,23 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << "gpu_runtime::vertex_set_prepare_sparse("; oss << esae->from_func; oss << ");" << std::endl; + } else if (mir::isa(esae)) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap("; + oss << esae->from_func; + oss << ");" << std::endl; + + std::string to_func = esae->to_func; + if (to_func == "") + to_func = "gpu_runtime::true_function"; + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; + oss << esae->from_func << ");" << std::endl; + } + printIndent(); + oss << var_decl->name << " = " << esae->from_func << ";" << std::endl; + printIndent(); oss << "int32_t num_cta, cta_size;" << std::endl; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 19706b0c..66387209 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -99,8 +99,12 @@ namespace graphit { if (dynamic_cast(apply_schedule) != nullptr) { edgeset_apply->applied_schedule = *dynamic_cast(apply_schedule); } - // First we create a stmt block to return - node = std::make_shared(edgeset_apply); + if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) + node = std::make_shared(edgeset_apply); + else if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) + node = std::make_shared(edgeset_apply); + else + assert(false && "Invalid option for direction\n"); } else { // No schedule is attached, lower using default schedule diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index cd907d2b..204026a8 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -8,21 +8,9 @@ #include "infra_gpu/vertex_frontier.h" #include "infra_gpu/load_balance.h" #include "graphit_timer.h" +#include "infra_gpu/support.h" namespace gpu_runtime { -template -static bool __device__ writeMin(T *dst, T src) { - if (*dst <= src) - return false; - T old_value = atomicMin(dst, src); - bool ret = (old_value > src); - return ret; -} -template -static int32_t builtin_getVertices(GraphT &graph) { - return graph.num_vertices; -} - template static void deleteObject(T &t) { diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index e7886eae..25b83e4b 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -114,6 +114,10 @@ 
static void load_graph(GraphT &graph, std::string filename, bool std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; } +template +static int32_t builtin_getVertices(GraphT &graph) { + return graph.num_vertices; +} } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 85748666..17b3c87d 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -17,7 +17,7 @@ static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { vertex_set_apply(num_vertices); } -template , int32_t, int32_t, int32_t, VertexFrontier)> +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier)> void __device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; if (vid >= input_frontier.d_num_elems_input[0]) @@ -25,7 +25,7 @@ void __device__ vertex_based_load_balance(GraphT graph, VertexFr int32_t src = input_frontier.d_sparse_queue_input[vid]; for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { int32_t dst = graph.d_edge_dst[eid]; - load_balance_payload(graph, src, dst, eid, output_frontier); + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); } } @@ -36,7 +36,7 @@ void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t & } #define STAGE_1_SIZE (8) #define WARP_SIZE (32) -template , int32_t, int32_t, int32_t, VertexFrontier)> +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier)> static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; @@ -99,7 +99,7 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro // STAGE 1 for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += STAGE_1_SIZE) { int32_t dst = graph.d_edge_dst[neigh_id]; - load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } } @@ -118,7 +118,7 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro int32_t s2_offset = stage2_offset[to_process]; for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < degree + s2_offset; neigh_id += WARP_SIZE) { int32_t dst = graph.d_edge_dst[neigh_id]; - load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } } @@ -129,7 +129,7 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro int32_t s3_offset = stage3_offset[wid]; for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < degree + s3_offset; neigh_id += CTA_SIZE) { int32_t dst = graph.d_edge_dst[neigh_id]; - load_balance_payload(graph, local_vertex, dst, neigh_id, output_frontier); + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } } } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index 5a286635..7fd07e9d 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -23,6 +23,14 @@ __device__ inline int32_t atomicAggInc(int32_t *ctr) { return (res + __popc(mask & ((1 << lane_id) - 1))); } +template +static bool __device__ writeMin(T *dst, T src) { + if 
(*dst <= src) + return false; + T old_value = atomicMin(dst, src); + bool ret = (old_value > src); + return ret; +} } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index c4cea536..23b97adf 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -143,9 +143,17 @@ static void __global__ prepare_sparse_from_bytemap(VertexFrontier frontier) { } } } -static void __global__ prepare_sparse_from_bitmap(VertexFrontier &frontier) { +static void __global__ prepare_sparse_from_bitmap(VertexFrontier frontier) { } +static void __global__ prepare_bytemap_from_sparse(VertexFrontier frontier) { + for (int32_t node_idx = blockDim.x * blockIdx.x + threadIdx.x; node_idx < frontier.d_num_elems_input[0]; node_idx += blockDim.x * gridDim.x) { + int32_t node_id = frontier.d_sparse_queue_input[node_idx]; + enqueueVertexBytemap(frontier.d_byte_map_output, frontier.d_num_elems_output, node_id); + } +} +static void __global__ prepare_bytemap_from_bitmap(VertexFrontier frontier) { +} static void vertex_set_prepare_sparse(VertexFrontier &frontier) { if (frontier.format_ready == VertexFrontier::SPARSE) return; @@ -159,6 +167,36 @@ static void vertex_set_prepare_sparse(VertexFrontier &frontier) { return; } } +static void vertex_set_prepare_boolmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + prepare_bytemap_from_sparse<<>>(frontier); + swap_bytemaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + prepare_bytemap_from_bitmap<<>>(frontier); + swap_bytemaps(frontier); + return; + } +} +bool __device__ true_function(int32_t _) { + return true; +} +template +static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if ((to_func(node_id))) + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } +} + +template +static void vertex_set_create_reverse_sparse_queue(VertexFrontier &frontier) { + vertex_set_create_reverse_sparse_queue_kernel<<>>(frontier); + swap_queues(frontier); +} + } #endif From 05fb2ab2a71ea511ef497340fbfe13aafd6e69a8 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 26 Sep 2019 14:53:12 -0400 Subject: [PATCH 05/88] Added a src filter. 
Doesn't affect performance for SSSP --- src/backend/codegen_gpu/codegen_gpu.cpp | 27 +++++++++++----- src/runtime_lib/infra_gpu/load_balance.h | 29 ++++++++++++------ src/runtime_lib/infra_gpu/vertex_frontier.h | 34 ++++++++++++++++++--- 3 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index a3268a26..1cffd535 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -100,13 +100,21 @@ void CodeGenGPUKernelEmitter::genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { load_balance_function = "gpu_runtime::TWCE_load_balance"; } + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && apply_expr->to_func == "") + accessor_type = "gpu_runtime::AccessorAll"; + + std::string src_filter = "gpu_runtime::true_function"; + if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && apply_expr->to_func != "") + src_filter = apply_expr->to_func; + std::string kernel_function_name = "gpu_operator_kernel_" + mir_context_->getUniqueNameCounterString(); oss << "template " << std::endl; oss << "void __global__ " << kernel_function_name << " (gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; indent(); printIndent(); - oss << load_balance_function << "device_function << "> ("; + oss << load_balance_function << "device_function << ", " << accessor_type << ", " << src_filter << "> ("; oss << "graph, input_frontier, output_frontier);" << std::endl; dedent(); @@ -512,11 +520,11 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << ");" << std::endl; std::string to_func = esae->to_func; - if (to_func == "") - to_func = "gpu_runtime::true_function"; - printIndent(); - oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; - oss << esae->from_func << ");" << std::endl; + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; + oss << esae->from_func << ");" << std::endl; + } } printIndent(); @@ -525,8 +533,13 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { printIndent(); oss << "int32_t num_cta, cta_size;" << std::endl; + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") + accessor_type = "gpu_runtime::AccessorAll"; + printIndent(); - oss << load_balance_function << "_info("; + oss << load_balance_function << "_info<" << accessor_type << ">("; oss << esae->from_func; oss << ", num_cta, cta_size);" << std::endl; printIndent(); diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 17b3c87d..8f1b79ad 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -17,26 +17,28 @@ static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { vertex_set_apply(num_vertices); } -template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier)> +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> void 
__device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; - if (vid >= input_frontier.d_num_elems_input[0]) + if (vid >= AccessorType::getSize(input_frontier)) return; - int32_t src = input_frontier.d_sparse_queue_input[vid]; + int32_t src = AccessorType::getElement(input_frontier, vid); for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { + if (src_filter(src) == false) + break; int32_t dst = graph.d_edge_dst[eid]; load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); } } - +template void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { - int32_t num_threads = builtin_getVertexSetSize(frontier); + int32_t num_threads = AccessorType::getSizeHost(frontier); num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } #define STAGE_1_SIZE (8) #define WARP_SIZE (32) -template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier)> +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; @@ -57,15 +59,15 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro __shared__ int32_t stage2_size[CTA_SIZE]; __shared__ int32_t stage3_size[CTA_SIZE]; - int32_t total_vertices = input_frontier.d_num_elems_input[0]; + int32_t total_vertices = AccessorType::getSize(input_frontier); int32_t local_vertex_idx = thread_id / (STAGE_1_SIZE); int32_t degree; int32_t s1_offset; int32_t local_vertex; int32_t src_offset; if (local_vertex_idx < total_vertices) { - local_vertex = input_frontier.d_sparse_queue_input[local_vertex_idx]; - // Step 1 seggrefate vertices into shared buffers + local_vertex = AccessorType::getElement(input_frontier, local_vertex_idx); + // Step 1 seggregate vertices into shared buffers if (threadIdx.x % (STAGE_1_SIZE) == 0) { degree = graph.d_get_degree(local_vertex); src_offset = graph.d_src_offsets[local_vertex]; @@ -98,6 +100,8 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro if (local_vertex_idx < total_vertices) { // STAGE 1 for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += STAGE_1_SIZE) { + if (src_filter(local_vertex) == false) + break; int32_t dst = graph.d_edge_dst[neigh_id]; load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } @@ -117,6 +121,8 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro degree = stage2_size[to_process]; int32_t s2_offset = stage2_offset[to_process]; for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < degree + s2_offset; neigh_id += WARP_SIZE) { + if (src_filter(local_vertex) == false) + break; int32_t dst = graph.d_edge_dst[neigh_id]; load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } @@ -128,13 +134,16 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro degree = stage3_size[wid]; int32_t s3_offset = stage3_offset[wid]; for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < degree + s3_offset; neigh_id += CTA_SIZE) { + if (src_filter(local_vertex) == false) + break; int32_t dst = graph.d_edge_dst[neigh_id]; load_balance_payload(graph, local_vertex, dst, neigh_id, 
input_frontier, output_frontier); } } } +template void __host__ TWCE_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { - int32_t num_threads = builtin_getVertexSetSize(frontier) * STAGE_1_SIZE; + int32_t num_threads = AccessorType::getSizeHost(frontier) * STAGE_1_SIZE; num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 23b97adf..d86c7960 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -30,6 +30,35 @@ struct VertexFrontier { format_ready_type format_ready; }; +static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { + int32_t curr_size = 0; + cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); + return curr_size; +} +class AccessorSparse { +public: + static int32_t __device__ getSize(VertexFrontier &frontier) { + return frontier.d_num_elems_input[0]; + } + static int32_t __device__ getElement(VertexFrontier &frontier, int32_t index) { + return frontier.d_sparse_queue_input[index]; + } + static int32_t getSizeHost(VertexFrontier &frontier) { + return builtin_getVertexSetSize(frontier); + } +}; +class AccessorAll { +public: + static int32_t __device__ getSize(VertexFrontier &frontier) { + return frontier.max_num_elems; + } + static int32_t __device__ getElement(VertexFrontier &frontier, int32_t index) { + return index; + } + static int32_t getSizeHost(VertexFrontier &frontier) { + return frontier.max_num_elems; + } +}; static VertexFrontier create_new_vertex_set(int32_t num_vertices) { VertexFrontier frontier; cudaMalloc(&frontier.d_num_elems_input, sizeof(int32_t)); @@ -89,11 +118,6 @@ static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *by byte_map[vertex_id] = 1; atomicAggInc(byte_map_size); } -static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { - int32_t curr_size = 0; - cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); - return curr_size; -} static void swap_queues(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; From 4d795638097d550cd94c4c0328b7877f92dfffbe Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Fri, 27 Sep 2019 10:51:15 -0400 Subject: [PATCH 06/88] reformating and adding new GPU schedule tests --- include/graphit/backend/backend.h | 2 +- test/c++/high_level_schedule_test.cpp | 93 ++++++++++++++++----------- test/c++/test.cpp | 2 +- 3 files changed, 59 insertions(+), 38 deletions(-) diff --git a/include/graphit/backend/backend.h b/include/graphit/backend/backend.h index 7e24a76f..b712a014 100644 --- a/include/graphit/backend/backend.h +++ b/include/graphit/backend/backend.h @@ -19,7 +19,7 @@ namespace graphit { int emitCPP(std::ostream &oss = std::cout, std::string module_name=""); int emitPython(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); - int emitGPU(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); + int emitGPU(std::ostream &oss = std::cout, std::string module_name="", std::string module_path=""); private: MIRContext* mir_context_; diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index 1ebe3b38..066ecee1 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -315,13 +315,13 
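With these definitions, every load-balance routine is instantiated with four pieces: the edge-weight type, the lowered device UDF, an accessor describing how sources are read from the input frontier, and a source filter. The kernel the code generator wraps around them looks roughly like the sketch below (reusing updateEdge_sketch from the earlier note; the kernel name is a placeholder, not emitted by the patch). For a pull-direction apply with no to_func the accessor becomes AccessorAll, and when a to_func is present it replaces true_function as the src_filter.

void __global__ gpu_operator_kernel_sketch(gpu_runtime::GraphT<int32_t> graph,
        gpu_runtime::VertexFrontier input_frontier,
        gpu_runtime::VertexFrontier output_frontier) {
    gpu_runtime::vertex_based_load_balance<int32_t,        // EdgeWeightType
            updateEdge_sketch,                             // lowered UDF (the payload)
            gpu_runtime::AccessorSparse,                   // sources come from the sparse queue
            gpu_runtime::true_function>                    // no source filter (push default)
        (graph, input_frontier, output_frontier);
}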
@@ class HighLevelScheduleTest : public ::testing::Test { " end\n" "\n" " end\n" - "end"); - + "end"); + const char* pr_cc_char = ("element Vertex end\n" "element Edge end\n" "const edges : edgeset{Edge}(Vertex,Vertex) = load (\"test.el\");\n" "const vertices : vertexset{Vertex} = edges.getVertices();\n" - "const IDs : vector{Vertex}(int) = 1;\n" + "const IDs : vector{Vertex}(int) = 1;\n" "const old_rank : vector{Vertex}(float) = 1.0;\n" "const new_rank : vector{Vertex}(float) = 0.0;\n" "const out_degrees : vector{Vertex}(int) = edges.getOutDegrees();\n" @@ -785,6 +785,20 @@ class HighLevelScheduleTest : public ::testing::Test { return be->emitCPP(); } + + int basicTestWithGPUSchedule( + fir::high_level_schedule::ProgramScheduleNode::Ptr program) { + + graphit::Midend *me = new graphit::Midend(context_, program->getSchedule()); + std::cout << "fir: " << std::endl; + std::cout << *(context_->getProgram()); + std::cout << std::endl; + + me->emitMIR(mir_context_); + graphit::Backend *be = new graphit::Backend(mir_context_); + return be->emitGPU(); + } + std::vector *errors_; graphit::FIRContext *context_; Frontend *fe_; @@ -2182,13 +2196,13 @@ TEST_F(HighLevelScheduleTest, UnorderedKCoreSparsePushParallel){ } TEST_F(HighLevelScheduleTest, UnorderedKCoreSparsePushDensePullParallel){ -istringstream is (unordered_kcore_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program + istringstream is (unordered_kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); -program->configApplyDirection("s1", "SparsePush-DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + program->configApplyDirection("s1", "SparsePush-DensePull"); + program->configApplyParallelization("s1", "dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreSumReduceBeforeUpdate){ @@ -2248,65 +2262,72 @@ EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreDensePullParallel){ -istringstream is (kcore_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); -program->configApplyDirection("s1", "DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + istringstream is (kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + program->configApplyDirection("s1", "DensePull"); + program->configApplyParallelization("s1", "dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, KCoreSparsePushDensePullParallel){ -istringstream is (kcore_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); -program->configApplyDirection("s1", "SparsePush-DensePull"); -program->configApplyParallelization("s1", "dynamic-vertex-parallel"); -EXPECT_EQ (0, basicTestWithSchedule(program)); + istringstream is (kcore_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + program->configApplyDirection("s1", "SparsePush-DensePull"); + program->configApplyParallelization("s1", 
"dynamic-vertex-parallel"); + EXPECT_EQ (0, basicTestWithSchedule(program)); } TEST_F(HighLevelScheduleTest, SetCoverUintDefaultSchedule){ -istringstream is (setcover_uint_str_); -fe_->parseStream(is, context_, errors_); -fir::high_level_schedule::ProgramScheduleNode::Ptr program + istringstream is (setcover_uint_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); -EXPECT_EQ (0, basicTestWithSchedule(program)); + EXPECT_EQ (0, basicTestWithSchedule(program)); } -TEST_F(HighLevelScheduleTest, GPUScheduleBasicSimpleGPUScheduleTest) { +TEST_F(HighLevelScheduleTest, BFSBasicSimpleGPUScheduleTest) { istringstream is (bfs_str_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); - // Now apply the GPU Schedule fir::gpu_schedule::SimpleGPUSchedule s1; s1.configDeduplication(fir::gpu_schedule::DISABLED); s1.configDirection(fir::gpu_schedule::PUSH); - program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); } -TEST_F(HighLevelScheduleTest, GPUScheduleBasicHybridGPUScheduleTest) { +TEST_F(HighLevelScheduleTest, BFSBasicHybridGPUScheduleTest) { istringstream is (bfs_str_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); - // Now apply the GPU Schedule fir::gpu_schedule::SimpleGPUSchedule s1; fir::gpu_schedule::SimpleGPUSchedule s2; s1.configDeduplication(fir::gpu_schedule::DISABLED); s1.configDirection(fir::gpu_schedule::PUSH); - s2 = s1; s2.configDirection(fir::gpu_schedule::PULL); - fir::gpu_schedule::HybridGPUSchedule h1 (fir::gpu_schedule::HybridGPUSchedule::INPUT_VERTEXSET_SIZE, 0.2, s1, s2); - - program->applyGPUSchedule("s1", h1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); } + +TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { + istringstream is (sssp_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + fir::gpu_schedule::SimpleGPUSchedule s1; + s1.configDeduplication(fir::gpu_schedule::ENABLED); + s1.configDirection(fir::gpu_schedule::PUSH); + program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} \ No newline at end of file diff --git a/test/c++/test.cpp b/test/c++/test.cpp index bfbeca84..ef86d510 100644 --- a/test/c++/test.cpp +++ b/test/c++/test.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) { -// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreDensePullParallel"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.SSSP_LabelProp_GPUScheduleTest"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSumReduceBeforeUpdate"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingWithDefaultSchedule"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingWithEagerPriorityUpdate"; From 505441dbe158081b0dabe7127ffb1119a27f2a84 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 30 Sep 2019 10:46:18 -0400 Subject: [PATCH 07/88] Working Hybrid operator --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 1 + include/graphit/frontend/gpu_schedule.h | 22 +- include/graphit/midend/mir.h | 16 ++ include/graphit/midend/mir_rewriter.h | 1 + include/graphit/midend/mir_visitor.h | 2 + src/backend/codegen_gpu/codegen_gpu.cpp | 204 ++++++++++-------- src/midend/apply_expr_lower.cpp | 119 ++++++++-- 
src/midend/mir.cpp | 12 ++ src/midend/mir_rewriter.cpp | 6 + src/midend/mir_visitor.cpp | 4 + src/runtime_lib/infra_gpu/support.h | 6 +- src/runtime_lib/infra_gpu/vertex_frontier.h | 2 + test/c++/high_level_schedule_test.cpp | 4 +- 13 files changed, 283 insertions(+), 116 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index b7314a9c..05f43069 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -107,6 +107,7 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::VertexSetAllocExpr::Ptr) override; virtual void visit(mir::VertexSetDedupExpr::Ptr) override; + virtual void visit(mir::HybridGPUStmt::Ptr) override; }; diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index a736b241..75c40606 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -27,7 +27,8 @@ enum gpu_schedule_options { CM, STRICT, EDGE_ONLY, - VERTEX_BASED + VERTEX_BASED, + INPUT_VERTEXSET_SIZE }; class GPUSchedule { @@ -180,20 +181,27 @@ class SimpleGPUSchedule: public GPUSchedule { class HybridGPUSchedule: public GPUSchedule { private: + // TODO: have separate alpha beta +public: SimpleGPUSchedule s1; SimpleGPUSchedule s2; float threshold; - // TODO: have separate alpha beta -public: - enum hybrid_criteria { + enum class hybrid_criteria { INPUT_VERTEXSET_SIZE }; -private: hybrid_criteria _hybrid_criteria; +private: public: - HybridGPUSchedule (hybrid_criteria h, float t, SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { - _hybrid_criteria = h; + HybridGPUSchedule (enum gpu_schedule_options o, float t, SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { + switch(o) { + case INPUT_VERTEXSET_SIZE: + _hybrid_criteria = hybrid_criteria::INPUT_VERTEXSET_SIZE; + break; + default: + assert(false && "Invalid option for HybridGPUScheduleCriteria\n"); + break; + } threshold = t; s1 = _s1; s2 = _s2; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 89be5e7f..a28a1be0 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -1534,6 +1534,22 @@ namespace graphit { virtual void copy(MIRNode::Ptr); virtual MIRNode::Ptr cloneNode(); }; + struct HybridGPUStmt: Stmt { + StmtBlock::Ptr stmt1; + StmtBlock::Ptr stmt2; + float threshold; + fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria criteria; + + std::string input_frontier_name; + + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; } } diff --git a/include/graphit/midend/mir_rewriter.h b/include/graphit/midend/mir_rewriter.h index 5f2c05c9..b915bfe5 100644 --- a/include/graphit/midend/mir_rewriter.h +++ b/include/graphit/midend/mir_rewriter.h @@ -156,6 +156,7 @@ namespace graphit { // GPU Additions virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); template std::shared_ptr rewrite(std::shared_ptr ptr) { diff --git a/include/graphit/midend/mir_visitor.h b/include/graphit/midend/mir_visitor.h index 284e7fa2..557fa1bc 100644 --- a/include/graphit/midend/mir_visitor.h +++ b/include/graphit/midend/mir_visitor.h @@ -114,6 +114,7 @@ namespace graphit { // GPU Additions struct VertexSetDedupExpr; + struct HybridGPUStmt; struct MIRVisitor { @@ -262,6 +263,7 @@ namespace graphit { // GPU Additions virtual void 
visit(std::shared_ptr); + virtual void visit(std::shared_ptr); protected: diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 1cffd535..8a5be3b2 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -344,11 +344,100 @@ void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { oss << var_expr->var.getName(); } void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { - printIndent(); - assign_stmt->lhs->accept(this); - oss << " = "; - assign_stmt->expr->accept(this); - oss << ";" << std::endl; + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + if (esae->from_func == "") { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + // We will assume that the output frontier can reuse the input frontier. + // TOOD: Add liveness analysis for this + printIndent(); + oss << "{" << std::endl; + indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } + + if (mir::isa(esae)) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << esae->from_func; + oss << ");" << std::endl; + } else if (mir::isa(esae)) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap("; + oss << esae->from_func; + oss << ");" << std::endl; + + std::string to_func = esae->to_func; + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; + oss << esae->from_func << ");" << std::endl; + } + + } + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = " << esae->from_func << ";" << std::endl; + + + printIndent(); + oss << "int32_t num_cta, cta_size;" << std::endl; + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") + accessor_type = "gpu_runtime::AccessorAll"; + + printIndent(); + oss << load_balance_function << "_info<" << accessor_type << ">("; + oss << esae->from_func; + oss << ", num_cta, cta_size);" << std::endl; + printIndent(); + oss << esae->kernel_function << "<<>>" << "("; + esae->target->accept(this); + oss << ", " << esae->from_func << ", "; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + printIndent(); + oss << "cudaDeviceSynchronize();" << std::endl; + if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + printIndent(); + oss << "gpu_runtime::swap_queues("; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + printIndent(); + assign_stmt->lhs->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + printIndent(); + oss << "gpu_runtime::swap_bitmaps("; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + printIndent(); + assign_stmt->lhs->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + 
printIndent(); + oss << "gpu_runtime::swap_bytemaps("; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + printIndent(); + assign_stmt->lhs->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + } } void CodeGenGPU::generateBinaryExpr(mir::BinaryExpr::Ptr expr, std::string token) { @@ -490,89 +579,10 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { if (var_decl->initVal != nullptr) { // Special case if RHS is a EdgeSetApplyExpr - if (mir::isa(var_decl->initVal)) { - mir::EdgeSetApplyExpr::Ptr esae = mir::to(var_decl->initVal); - if (esae->from_func == "") { - assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); - } - // We will assume that the output frontier can reuse the input frontier. - // TOOD: Add liveness analysis for this - oss << " = " << esae->from_func; - oss << ";" << std::endl; - printIndent(); - - oss << "{" << std::endl; - indent(); - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; - if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { - load_balance_function = "gpu_runtime::TWCE_load_balance"; - } - - if (mir::isa(esae)) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_sparse("; - oss << esae->from_func; - oss << ");" << std::endl; - } else if (mir::isa(esae)) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_boolmap("; - oss << esae->from_func; - oss << ");" << std::endl; - - std::string to_func = esae->to_func; - if (to_func != "") { - printIndent(); - oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; - oss << esae->from_func << ");" << std::endl; - } - - } - printIndent(); - oss << var_decl->name << " = " << esae->from_func << ";" << std::endl; - - - printIndent(); - oss << "int32_t num_cta, cta_size;" << std::endl; - - std::string accessor_type = "gpu_runtime::AccessorSparse"; - if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") - accessor_type = "gpu_runtime::AccessorAll"; - - printIndent(); - oss << load_balance_function << "_info<" << accessor_type << ">("; - oss << esae->from_func; - oss << ", num_cta, cta_size);" << std::endl; - printIndent(); - oss << esae->kernel_function << "<<>>" << "("; - esae->target->accept(this); - oss << ", " << esae->from_func << ", " << var_decl->name << ");" << std::endl; - printIndent(); - oss << "cudaDeviceSynchronize();" << std::endl; - if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { - printIndent(); - oss << "gpu_runtime::swap_queues(" << var_decl->name << ");" << std::endl; - printIndent(); - oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; - - } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { - printIndent(); - oss << "gpu_runtime::swap_bitmaps(" << var_decl->name << ");" << std::endl; - printIndent(); - oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; - } else if (esae->applied_schedule.frontier_creation == 
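Rendered out, the host-side block this visitor emits for a push-direction edgeset apply assigned to a frontier has the following shape (identifiers are illustrative, the kernel name comes from the kernel emitter, and the FRONTIER_FUSED case is shown; note that the output frontier deliberately aliases the input frontier, as the comment in the visitor says):

{
    gpu_runtime::vertex_set_prepare_sparse(frontier);
    output = frontier;
    int32_t num_cta, cta_size;
    gpu_runtime::vertex_based_load_balance_info<gpu_runtime::AccessorSparse>(frontier, num_cta, cta_size);
    gpu_operator_kernel_0<<<num_cta, cta_size>>>(edges, frontier, output);
    cudaDeviceSynchronize();
    gpu_runtime::swap_queues(output);
    output.format_ready = gpu_runtime::VertexFrontier::SPARSE;
}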
fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { - printIndent(); - oss << "gpu_runtime::swap_bytemaps(" << var_decl->name << ");" << std::endl; - printIndent(); - oss << var_decl->name << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; - } - dedent(); - printIndent(); - oss << "}" << std::endl; - } else { - oss << " = "; - var_decl->initVal->accept(this); - oss << ";" << std::endl; - } + oss << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + } else oss << ";" << std::endl; @@ -752,4 +762,22 @@ void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { } } } +void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { + if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { + printIndent(); + oss << "if (builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; + indent(); + stmt->stmt1->accept(this); + dedent(); + printIndent(); + oss << "} else {" << std::endl; + indent(); + stmt->stmt2->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + assert(false && "Invalid criteria for Hybrid Statement\n"); + } +} } diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 66387209..8fbf9e84 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -39,36 +39,121 @@ namespace graphit { std::vector new_stmts; for (auto stmt: *(stmt_block->stmts)) { new_stmts.push_back(rewrite(stmt)); - if (insert_after_stmt != nullptr) - new_stmts.push_back(insert_after_stmt); - insert_after_stmt = nullptr; + while (insert_after_stmt != nullptr) { + auto temp = insert_after_stmt; + insert_after_stmt = nullptr; + temp = rewrite(temp); + new_stmts.push_back(temp); + } } * (stmt_block->stmts) = new_stmts; node = stmt_block; } void ApplyExprLower::LowerApplyExpr::visit(mir::VarDecl::Ptr var_decl) { + if (mir::isa (var_decl->initVal)) { + auto init_val = var_decl->initVal; + var_decl->initVal = nullptr; + mir::AssignStmt::Ptr assign_stmt = std::make_shared(); + assign_stmt->expr = init_val; + mir::VarExpr::Ptr var_expr = std::make_shared(); + mir::Var var (var_decl->name, var_decl->type); + var_expr->var = var; + assign_stmt->lhs = var_expr; + assign_stmt->stmt_label = var_decl->stmt_label; + insert_after_stmt = assign_stmt; + node = var_decl; + return; + } MIRRewriter::visit(var_decl); var_decl = mir::to(node); - if (mir::isa (var_decl->initVal)) { - mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(var_decl->initVal); + node = var_decl; + } + void ApplyExprLower::LowerApplyExpr::visit(mir::AssignStmt::Ptr assign_stmt) { + + if (assign_stmt->stmt_label != "") { + label_scope_.scope(assign_stmt->stmt_label); + } + + // Check for Hybrid stmt + if (mir::isa (assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(assign_stmt->expr); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + fir::gpu_schedule::HybridGPUSchedule *hybrid_schedule = dynamic_cast(apply_schedule); + // This EdgeSetApply has a Hybrid Schedule attached to it + // Create the first Stmt block + 
mir::StmtBlock::Ptr stmt_block_1 = std::make_shared(); + mir::AssignStmt::Ptr stmt1 = std::make_shared(); + stmt1->lhs = assign_stmt->lhs; + stmt1->expr = assign_stmt->expr; + stmt1->stmt_label = "hybrid1"; + stmt_block_1->insertStmtEnd(stmt1); + fir::gpu_schedule::SimpleGPUSchedule * schedule1 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule1 = hybrid_schedule->s1; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid1"] = schedule1; + stmt_block_1 = rewrite(stmt_block_1); + + // Now create the second Stmt block + auto func_decl = mir_context_->getFunction(edgeset_apply->input_function_name); + mir::FuncDecl::Ptr func_decl_v2 = func_decl->clone(); + func_decl_v2->name = func_decl->name + "_v2"; + mir_context_->addFunctionFront(func_decl_v2); + mir::StmtBlock::Ptr stmt_block_2 = std::make_shared(); + mir::AssignStmt::Ptr stmt2 = std::make_shared(); + stmt2->lhs = assign_stmt->lhs; + stmt2->expr = assign_stmt->expr; + mir::to(stmt2->expr)->input_function_name = func_decl_v2->name; + stmt2->stmt_label = "hybrid2"; + stmt_block_2->insertStmtEnd(stmt2); + fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule2 = hybrid_schedule->s2; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid2"] = schedule2; + stmt_block_2 = rewrite(stmt_block_2); + + // Finally create a hybrid statement and replace - + mir::HybridGPUStmt::Ptr hybrid_node = std::make_shared(); + hybrid_node->stmt1 = stmt_block_1; + hybrid_node->stmt2 = stmt_block_2; + hybrid_node->threshold = hybrid_schedule->threshold; + hybrid_node->criteria = hybrid_schedule->_hybrid_criteria; + if (hybrid_node->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE && edgeset_apply->from_func != "") { + hybrid_node->input_frontier_name = edgeset_apply->from_func; + } else { + assert(false && "Invalid criteria for Hybrid Node\n"); + } + + node = hybrid_node; + if (assign_stmt->stmt_label != "") { + label_scope_.unscope(); + } + return; + + } + } + } + } + if (assign_stmt->stmt_label != "") { + label_scope_.unscope(); + } + + + MIRRewriter::visit(assign_stmt); + assign_stmt = mir::to(node); + if (mir::isa (assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(assign_stmt->expr); - if (edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); mir::ExprStmt::Ptr expr_stmt = std::make_shared(); - mir::Var var(var_decl->name, var_decl->type); - mir::VarExpr::Ptr var_expr = std::make_shared(); - var_expr->var = var; - dedup_expr->target = var_expr; - + dedup_expr->target = assign_stmt->lhs; expr_stmt->expr = dedup_expr; insert_after_stmt = expr_stmt; } } - node = var_decl; - } - void ApplyExprLower::LowerApplyExpr::visit(mir::AssignStmt::Ptr assign_stmt) { - MIRRewriter::visit(assign_stmt); - assign_stmt = mir::to(node); node = assign_stmt; } void ApplyExprLower::LowerApplyExpr::visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply) { @@ -98,6 +183,8 
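The net effect of this lowering is easiest to see from the generated code: the single labeled apply becomes two specialized statement blocks (labels hybrid1 and hybrid2, the second calling the _v2 clone of the user function so each branch can be scheduled independently), guarded by a runtime check on the input frontier size. A sketch, assuming a threshold of 0.2, a frontier named frontier, and a UDF named updateEdge:

if (builtin_getVertexSetSize(frontier) < frontier.max_num_elems * 0.2) {
    // code generated from stmt1: the apply under SimpleGPUSchedule s1
    // (label "hybrid1"), invoking the original updateEdge
} else {
    // code generated from stmt2: the same apply under SimpleGPUSchedule s2
    // (label "hybrid2"), invoking the cloned updateEdge_v2
}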
@@ namespace graphit { auto apply_schedule = apply_schedule_iter->second; if (dynamic_cast(apply_schedule) != nullptr) { edgeset_apply->applied_schedule = *dynamic_cast(apply_schedule); + } else { + assert(false && "Schedule applied to EdgeSetApply must be a Simple Schedule"); } if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) node = std::make_shared(edgeset_apply); diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 80fc848a..820199b9 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -957,5 +957,17 @@ namespace graphit { return node; } + + void HybridGPUStmt::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + stmt1 = op->stmt1; + stmt2 = op->stmt2; + } + MIRNode::Ptr HybridGPUStmt::cloneNode() { + const auto node = std::make_shared(); + node->copy(shared_from_this()); + return node; + } + } } diff --git a/src/midend/mir_rewriter.cpp b/src/midend/mir_rewriter.cpp index 122516e9..94714486 100644 --- a/src/midend/mir_rewriter.cpp +++ b/src/midend/mir_rewriter.cpp @@ -416,5 +416,11 @@ namespace graphit { node = ptr; } + void MIRRewriter::visit(HybridGPUStmt::Ptr stmt) { + stmt->stmt1 = rewrite(stmt->stmt1); + stmt->stmt2 = rewrite(stmt->stmt2); + node = stmt; + } + } } diff --git a/src/midend/mir_visitor.cpp b/src/midend/mir_visitor.cpp index 67850442..ccf62752 100644 --- a/src/midend/mir_visitor.cpp +++ b/src/midend/mir_visitor.cpp @@ -376,5 +376,9 @@ namespace graphit { void MIRVisitor::visit(std::shared_ptr op) { op->target->accept(this); } + void MIRVisitor::visit(std::shared_ptr op) { + op->stmt1->accept(this); + op->stmt2->accept(this); + } } } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index 7fd07e9d..29538554 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -8,8 +8,8 @@ void cudaCheckLastError(void) { exit(-1); } } -__device__ inline int32_t warp_bcast(int32_t v, int32_t leader) { - return __shfl_sync((uint32_t)-1, v, leader); +__device__ inline int32_t warp_bcast(int32_t mask, int32_t v, int32_t leader) { + return __shfl_sync((uint32_t)mask, v, leader); } __device__ inline int32_t atomicAggInc(int32_t *ctr) { int32_t lane_id = threadIdx.x % 32; @@ -19,7 +19,7 @@ __device__ inline int32_t atomicAggInc(int32_t *ctr) { int res; if(lane_id == leader) res = atomicAdd(ctr, __popc(mask)); - res = warp_bcast(res, leader); + res = warp_bcast(mask, res, leader); return (res + __popc(mask & ((1 << lane_id) - 1))); } diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index d86c7960..68131d1b 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -115,6 +115,8 @@ static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t * } static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *byte_map_size, int32_t vertex_id) { // We are not using atomic operation here because races are benign here + if (byte_map[vertex_id] == 1) + return; byte_map[vertex_id] = 1; atomicAggInc(byte_map_size); } diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index 066ecee1..fee8c295 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -2315,7 +2315,7 @@ TEST_F(HighLevelScheduleTest, BFSBasicHybridGPUScheduleTest) { s1.configDirection(fir::gpu_schedule::PUSH); s2 = s1; s2.configDirection(fir::gpu_schedule::PULL); - 
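On the support.h change above: __shfl_sync needs a member mask that covers exactly the lanes taking part, and the hard-coded full-warp mask used before can name lanes that are not active at the call, so atomicAggInc now threads its active-lane mask through warp_bcast. The slot arithmetic itself is unchanged; a small host-side illustration of it (an assumed 8-lane example, plain C++):

#include <stdio.h>
int main(void) {
    unsigned mask = 0xB2; // lanes 1, 4, 5 and 7 were active
    int base = 100;       // value the leader lane got back from atomicAdd(ctr, __popc(mask))
    for (int lane = 0; lane < 8; lane++)
        if (mask & (1u << lane))
            printf("lane %d -> slot %d\n", lane,
                   base + __builtin_popcount(mask & ((1u << lane) - 1)));
    return 0;
}

Each active lane lands on a distinct consecutive slot (100 through 103 here), which is what lets a whole warp reserve queue space with a single atomic.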
fir::gpu_schedule::HybridGPUSchedule h1 (fir::gpu_schedule::HybridGPUSchedule::INPUT_VERTEXSET_SIZE, 0.2, s1, s2); + fir::gpu_schedule::HybridGPUSchedule h1 (fir::gpu_schedule::INPUT_VERTEXSET_SIZE, 0.2, s1, s2); program->applyGPUSchedule("s1", h1); EXPECT_EQ (0, basicTestWithGPUSchedule(program)); } @@ -2330,4 +2330,4 @@ TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { s1.configDirection(fir::gpu_schedule::PUSH); program->applyGPUSchedule("s1", s1); EXPECT_EQ (0, basicTestWithGPUSchedule(program)); -} \ No newline at end of file +} From c4bc83460fc994b581a47e9b89ee8ad47bb925d4 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 30 Sep 2019 11:44:32 -0400 Subject: [PATCH 08/88] Fixed the apply expr lower vardecl->assign conversion only if GPU schedule is attached --- src/midend/apply_expr_lower.cpp | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 8fbf9e84..6ca6d315 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -50,19 +50,21 @@ namespace graphit { node = stmt_block; } void ApplyExprLower::LowerApplyExpr::visit(mir::VarDecl::Ptr var_decl) { - if (mir::isa (var_decl->initVal)) { - auto init_val = var_decl->initVal; - var_decl->initVal = nullptr; - mir::AssignStmt::Ptr assign_stmt = std::make_shared(); - assign_stmt->expr = init_val; - mir::VarExpr::Ptr var_expr = std::make_shared(); - mir::Var var (var_decl->name, var_decl->type); - var_expr->var = var; - assign_stmt->lhs = var_expr; - assign_stmt->stmt_label = var_decl->stmt_label; - insert_after_stmt = assign_stmt; - node = var_decl; - return; + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + if (mir::isa (var_decl->initVal)) { + auto init_val = var_decl->initVal; + var_decl->initVal = nullptr; + mir::AssignStmt::Ptr assign_stmt = std::make_shared(); + assign_stmt->expr = init_val; + mir::VarExpr::Ptr var_expr = std::make_shared(); + mir::Var var (var_decl->name, var_decl->type); + var_expr->var = var; + assign_stmt->lhs = var_expr; + assign_stmt->stmt_label = var_decl->stmt_label; + insert_after_stmt = assign_stmt; + node = var_decl; + return; + } } MIRRewriter::visit(var_decl); var_decl = mir::to(node); From cb0c070a2749425cbfa70a2a07bad902b19e448e Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 1 Oct 2019 16:14:45 -0400 Subject: [PATCH 09/88] Added bitmap support for PULL direction --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 2 + include/graphit/frontend/gpu_schedule.h | 23 ++++- src/backend/codegen_gpu/codegen_gpu.cpp | 53 +++++++++--- src/runtime_lib/infra_gpu/support.h | 4 + src/runtime_lib/infra_gpu/vertex_frontier.h | 83 +++++++++++++++++-- 5 files changed, 145 insertions(+), 20 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 05f43069..64fe7225 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -84,6 +84,7 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::DivExpr::Ptr) override; virtual void visit(mir::SubExpr::Ptr) override; virtual void visit(mir::EqExpr::Ptr) override; + virtual void visit(mir::NegExpr::Ptr) override; virtual void visit(mir::TensorArrayReadExpr::Ptr) override; virtual void visit(mir::IntLiteral::Ptr) override; @@ -93,6 +94,7 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::ReduceStmt::Ptr) 
override; + virtual void visit(mir::CompareAndSwapStmt::Ptr) override; virtual void visit(mir::VarDecl::Ptr) override; virtual void visit(mir::ForStmt::Ptr) override; diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index 75c40606..793c246e 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -28,7 +28,9 @@ enum gpu_schedule_options { STRICT, EDGE_ONLY, VERTEX_BASED, - INPUT_VERTEXSET_SIZE + INPUT_VERTEXSET_SIZE, + BITMAP, + BOOLMAP }; class GPUSchedule { @@ -41,6 +43,10 @@ class GPUSchedule { class SimpleGPUSchedule: public GPUSchedule { public: + enum class pull_frontier_rep_type { + BITMAP, + BOOLMAP + }; enum class direction_type { DIR_PUSH, DIR_PULL @@ -75,6 +81,7 @@ class SimpleGPUSchedule: public GPUSchedule { private: public: direction_type direction; + pull_frontier_rep_type pull_frontier_rep; frontier_creation_type frontier_creation; deduplication_type deduplication; load_balancing_type load_balancing; @@ -82,6 +89,7 @@ class SimpleGPUSchedule: public GPUSchedule { SimpleGPUSchedule () { direction = direction_type::DIR_PUSH; + pull_frontier_rep = pull_frontier_rep_type::BOOLMAP; frontier_creation = frontier_creation_type::FRONTIER_FUSED; deduplication = deduplication_type::DEDUP_DISABLED; load_balancing = load_balancing_type::VERTEX_BASED; @@ -89,13 +97,24 @@ class SimpleGPUSchedule: public GPUSchedule { } public: - void configDirection(enum gpu_schedule_options o) { + void configDirection(enum gpu_schedule_options o, enum gpu_schedule_options r = BOOLMAP) { switch(o) { case PUSH: direction = direction_type::DIR_PUSH; break; case PULL: direction = direction_type::DIR_PULL; + switch (r) { + case BITMAP: + pull_frontier_rep = pull_frontier_rep_type::BITMAP; + break; + case BOOLMAP: + pull_frontier_rep = pull_frontier_rep_type::BOOLMAP; + break; + default: + assert(false && "Invalid option for Pull Frontier representation\n"); + break; + } break; default: assert(false && "Invalid option for configDirection"); diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 8a5be3b2..de3c0943 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -171,12 +171,21 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { oss << "// Body of the actual operator" << std::endl; // Before we generate the call to the UDF, we have to check if the dst is on the input frontier - printIndent(); - oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; - indent(); - printIndent(); - oss << "return;" << std::endl; - dedent(); + if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } printIndent(); oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; @@ -365,10 +374,17 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { oss << esae->from_func; oss << ");" << std::endl; } else if (mir::isa(esae)) { - printIndent(); - 
oss << "gpu_runtime::vertex_set_prepare_boolmap("; - oss << esae->from_func; - oss << ");" << std::endl; + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap("; + oss << esae->from_func; + oss << ");" << std::endl; + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap("; + oss << esae->from_func; + oss << ");" << std::endl; + } std::string to_func = esae->to_func; if (to_func != "") { @@ -459,6 +475,13 @@ void CodeGenGPU::visit(mir::DivExpr::Ptr expr) { void CodeGenGPU::visit(mir::SubExpr::Ptr expr) { generateBinaryExpr(expr, "-"); } +void CodeGenGPU::visit(mir::NegExpr::Ptr expr) { + if (expr->negate) + oss << "-"; + oss << "("; + expr->operand->accept(this); + oss << ")"; +} void CodeGenGPU::visit(mir::TensorArrayReadExpr::Ptr expr) { @@ -570,6 +593,16 @@ void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { break; } } +void CodeGenGPU::visit(mir::CompareAndSwapStmt::Ptr cas_stmt) { + printIndent(); + oss << cas_stmt->tracking_var_ << " = gpu_runtime::CAS(&"; + cas_stmt->lhs->accept(this); + oss << ", "; + cas_stmt->compare_val_expr->accept(this); + oss << ", "; + cas_stmt->expr->accept(this); + oss << ");" << std::endl; +} void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { printIndent(); diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index 29538554..be3e2770 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -31,6 +31,10 @@ static bool __device__ writeMin(T *dst, T src) { bool ret = (old_value > src); return ret; } +template +static bool __device__ CAS(T *dst, T old_val, const T &new_val) { + return old_val == atomicCAS(dst, old_val, new_val); +} } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 68131d1b..6d1889fe 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -15,8 +15,8 @@ struct VertexFrontier { unsigned char* d_byte_map_input; unsigned char* d_byte_map_output; - unsigned char* d_bit_map_input; - unsigned char* d_bit_map_output; + uint32_t* d_bit_map_input; + uint32_t* d_bit_map_output; int32_t *d_dedup_counters; int32_t curr_dedup_counter; @@ -75,12 +75,13 @@ static VertexFrontier create_new_vertex_set(int32_t num_vertices) { cudaMemset(frontier.d_byte_map_input, 0, sizeof(unsigned char) * num_vertices); cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * num_vertices); - int32_t num_byte_for_bitmap = (num_vertices + 7)/8; - cudaMalloc(&frontier.d_bit_map_input, sizeof(unsigned char) * num_byte_for_bitmap); - cudaMalloc(&frontier.d_bit_map_output, sizeof(unsigned char) * num_byte_for_bitmap); + int32_t num_byte_for_bitmap = (num_vertices + sizeof(uint32_t) - 1)/sizeof(uint32_t); + cudaMalloc(&frontier.d_bit_map_input, sizeof(uint32_t) * num_byte_for_bitmap); + cudaMalloc(&frontier.d_bit_map_output, sizeof(uint32_t) * num_byte_for_bitmap); - cudaMemset(frontier.d_bit_map_input, 0, sizeof(unsigned char) * num_byte_for_bitmap); - cudaMemset(frontier.d_bit_map_output, 0, sizeof(unsigned char) * num_byte_for_bitmap); + cudaMemset(frontier.d_bit_map_input, 0, sizeof(uint32_t) * num_byte_for_bitmap); + cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + 
cudaCheckLastError(); frontier.max_num_elems = num_vertices; @@ -120,6 +121,21 @@ static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *by byte_map[vertex_id] = 1; atomicAggInc(byte_map_size); } +static bool __device__ checkBit(uint32_t* array, int32_t index) { + uint32_t * address = array + index / sizeof(uint32_t); + return (*address & (1 << (index % sizeof(uint32_t)))); +} +static bool __device__ setBit(uint32_t* array, int32_t index) { + uint32_t * address = array + index / sizeof(uint32_t); + return atomicOr(address, (1 << (index % sizeof(uint32_t)))) & (1 << (index % sizeof(uint32_t))); +} +static void __device__ enqueueVertexBitmap(uint32_t* bit_map, int32_t * bit_map_size, int32_t vertex_id) { + // We need atomics here because of bit manipulations + if (checkBit(bit_map, vertex_id)) + return; + if (!setBit(bit_map, vertex_id)) + atomicAggInc(bit_map_size); +} static void swap_queues(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; @@ -144,6 +160,21 @@ static void swap_bytemaps(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); } +static void swap_bitmaps(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + int32_t num_byte_for_bitmap = (frontier.max_num_elems + sizeof(uint32_t) - 1)/sizeof(uint32_t); + cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + cudaCheckLastError(); +} static void __device__ dedup_frontier_device(VertexFrontier &frontier) { for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { int32_t vid = frontier.d_sparse_queue_input[vidx]; @@ -161,7 +192,6 @@ static void dedup_frontier(VertexFrontier &frontier) { dedup_frontier_kernel<<>>(frontier); swap_queues(frontier); } - static void __global__ prepare_sparse_from_bytemap(VertexFrontier frontier) { for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { if (frontier.d_byte_map_input[node_id] == 1) { @@ -170,6 +200,11 @@ static void __global__ prepare_sparse_from_bytemap(VertexFrontier frontier) { } } static void __global__ prepare_sparse_from_bitmap(VertexFrontier frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if (checkBit(frontier.d_bit_map_input, node_id)) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } + } } static void __global__ prepare_bytemap_from_sparse(VertexFrontier frontier) { @@ -179,6 +214,25 @@ static void __global__ prepare_bytemap_from_sparse(VertexFrontier frontier) { } } static void __global__ prepare_bytemap_from_bitmap(VertexFrontier frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if (checkBit(frontier.d_bit_map_input, node_id)) { + enqueueVertexBytemap(frontier.d_byte_map_output, 
frontier.d_num_elems_output, node_id); + } + } +} + +static void __global__ prepare_bitmap_from_sparse(VertexFrontier frontier) { + for (int32_t node_idx = blockDim.x * blockIdx.x + threadIdx.x; node_idx < frontier.d_num_elems_input[0]; node_idx += blockDim.x * gridDim.x) { + int32_t node_id = frontier.d_sparse_queue_input[node_idx]; + enqueueVertexBitmap(frontier.d_bit_map_output, frontier.d_num_elems_output, node_id); + } +} +static void __global__ prepare_bitmap_from_bytemap(VertexFrontier frontier) { + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { + if (frontier.d_byte_map_input[node_id] == 1) { + enqueueVertexBitmap(frontier.d_bit_map_output, frontier.d_num_elems_output, node_id); + } + } } static void vertex_set_prepare_sparse(VertexFrontier &frontier) { if (frontier.format_ready == VertexFrontier::SPARSE) @@ -206,6 +260,19 @@ static void vertex_set_prepare_boolmap(VertexFrontier &frontier) { return; } } +static void vertex_set_prepare_bitmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + prepare_bitmap_from_sparse<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + prepare_bitmap_from_bytemap<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + return; + } +} bool __device__ true_function(int32_t _) { return true; } From 9d109fbeafe9fe51133764aa6a64fa0b02127ad0 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 1 Oct 2019 19:02:59 -0400 Subject: [PATCH 10/88] Added GPU test framework --- CMakeLists.txt | 10 +++ test/c++/high_level_schedule_test.cpp | 23 +++++ test/gpu_tests/runtime_lib_tests.py | 84 +++++++++++++++++++ test/gpu_tests/test_input/basic_compile.cu | 5 ++ test/gpu_tests/test_input/basic_load_graph.cu | 9 ++ test/gpu_tests/test_input/obtain_gpu_cc.cu | 31 +++++++ 6 files changed, 162 insertions(+) create mode 100644 test/gpu_tests/runtime_lib_tests.py create mode 100644 test/gpu_tests/test_input/basic_compile.cu create mode 100644 test/gpu_tests/test_input/basic_load_graph.cu create mode 100644 test/gpu_tests/test_input/obtain_gpu_cc.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index e8642491..c926448d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,9 +83,19 @@ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/graphit.py VERBATIM ) +find_package(CUDA QUIET) + +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/gpu_tests + COMMAND sed -e s?\$\{NVCC_COMPILER\}?${CUDA_NVCC_EXECUTABLE}?g -e s?\$\{GRAPHIT_SOURCE_DIRECTORY\}?${CMAKE_SOURCE_DIR}?g -e s?\$\{CXX_COMPILER\}?${CMAKE_CXX_COMPILER}?g -e s?\$\{GRAPHIT_BUILD_DIRECTORY\}?${CMAKE_BINARY_DIR}?g ${CMAKE_SOURCE_DIR}/test/gpu_tests/runtime_lib_tests.py > ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py + DEPENDS ${CMAKE_SOURCE_DIR}/test/gpu_tests/runtime_lib_tests.py + VERBATIM +) + add_custom_target(copy_graphitc_py ALL DEPENDS ${GRAPHITC_PY}) add_custom_target(copy_python_tests ALL DEPENDS ${CMAKE_BINARY_DIR}/python_tests/test_with_schedules.py ${CMAKE_BINARY_DIR}/python_tests/test.py ${CMAKE_BINARY_DIR}/python_tests/pybind_test.py) add_custom_target(copy_graphit_py ALL DEPENDS ${CMAKE_BINARY_DIR}/graphit.py) +add_custom_target(copy_gpu_runtime_lib_tests_py ALL DEPENDS ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py) configure_file(src/main.cpp 
${CMAKE_BINARY_DIR}/bin/main.cpp COPYONLY) configure_file(test/library_test_drivers/library_test_driver_cpp.txt ${CMAKE_BINARY_DIR}/bin/library_test_driver.cpp COPYONLY) diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index fee8c295..ad696779 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -2331,3 +2331,26 @@ TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { program->applyGPUSchedule("s1", s1); EXPECT_EQ (0, basicTestWithGPUSchedule(program)); } + +TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (bfs_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + // Now apply the GPU Schedule + SimpleGPUSchedule s1; + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, 0.12, s1, s2); + program->applyGPUSchedule("s1", h1); + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); +} diff --git a/test/gpu_tests/runtime_lib_tests.py b/test/gpu_tests/runtime_lib_tests.py new file mode 100644 index 00000000..706976b4 --- /dev/null +++ b/test/gpu_tests/runtime_lib_tests.py @@ -0,0 +1,84 @@ +import unittest +import subprocess +import os +import shutil +import sys + +GRAPHIT_BUILD_DIRECTORY="${GRAPHIT_BUILD_DIRECTORY}".strip().rstrip("/") +GRAPHIT_SOURCE_DIRECTORY="${GRAPHIT_SOURCE_DIRECTORY}".strip().rstrip("/") +CXX_COMPILER="${CXX_COMPILER}" + +NVCC_COMPILER="${NVCC_COMPILER}" + +class TestGPURuntimeLibrary(unittest.TestCase): + @classmethod + def get_command_output_class(self, command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE) + else: + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + exitcode = proc.wait() + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + + proc.stdout.close() + return exitcode, output + + def get_command_output(self, command): + (exitcode, output) = self.get_command_output_class(command) + self.assertEqual(exitcode, 0) + return output + + @classmethod + def setUpClass(cls): + if NVCC_COMPILER == "": + print ("Cannot find CUDA compiler") + exit(-1) + + cls.build_directory = GRAPHIT_BUILD_DIRECTORY + cls.scratch_directory = GRAPHIT_BUILD_DIRECTORY + "/scratch" + if os.path.isdir(cls.scratch_directory): + shutil.rmtree(cls.scratch_directory) + os.mkdir(cls.scratch_directory) + + cls.nvcc_command = NVCC_COMPILER + " -ccbin " + CXX_COMPILER + " " + cls.test_input_directory = GRAPHIT_SOURCE_DIRECTORY + "/test/gpu_tests/test_input" + + cls.get_command_output_class(cls.nvcc_command + cls.test_input_directory + "/obtain_gpu_cc.cu -o " + cls.scratch_directory + "/obtain_gpu_cc") + output = cls.get_command_output_class(cls.scratch_directory + "/obtain_gpu_cc")[1].split() + + if len(output) != 2: + print ("Cannot obtain GPU information") + exit(-1) + compute_capability = output[0] + num_of_sm = output[1] + + cls.nvcc_command += " -DNUM_CTA=" + num_of_sm + " -DCTA_SIZE=1024 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + cls.nvcc_command += " -std=c++11 -O3 -I " + GRAPHIT_SOURCE_DIRECTORY + "/src/runtime_lib/ 
-Xcompiler \"-w\" -Wno-deprecated-gpu-targets " + + shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") + cls.graph_directory = cls.scratch_directory + "/graphs" + + cls.executable_name = cls.scratch_directory + "/test_exectuable" + + def cpp_compile_test(self, input_file_name, extra_cpp_args=[]): + compile_command = self.nvcc_command + self.test_input_directory + "/" + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) + self.get_command_output(compile_command) + + def cpp_exec_test(self, input_file_name, extra_cpp_args=[], extra_exec_args=[]): + self.cpp_compile_test(input_file_name, extra_cpp_args) + return self.get_command_output(self.executable_name + " " + " ".join(extra_exec_args)) + + def test_basic_compile(self): + self.cpp_compile_test("basic_compile.cu") + def test_basic_load_graph(self): + output = self.cpp_exec_test("basic_load_graph.cu", [], [self.graph_directory + "/4.mtx"]) + output = output.split("\n") + self.assertEqual(len(output), 3) + self.assertEqual(output[1], "14, 106") + +if __name__ == '__main__': + unittest.main() diff --git a/test/gpu_tests/test_input/basic_compile.cu b/test/gpu_tests/test_input/basic_compile.cu new file mode 100644 index 00000000..f1b3dd0c --- /dev/null +++ b/test/gpu_tests/test_input/basic_compile.cu @@ -0,0 +1,5 @@ +#include "gpu_intrinsics.h" + +int __host__ main(int argc, char* argv[]) { + return 0; +} diff --git a/test/gpu_tests/test_input/basic_load_graph.cu b/test/gpu_tests/test_input/basic_load_graph.cu new file mode 100644 index 00000000..4cd0a330 --- /dev/null +++ b/test/gpu_tests/test_input/basic_load_graph.cu @@ -0,0 +1,9 @@ +#include "gpu_intrinsics.h" + +gpu_runtime::GraphT edges; + +int __host__ main(int argc, char* argv[]) { + gpu_runtime::load_graph(edges, argv[1], false); + std::cout << edges.num_vertices << ", " << edges.num_edges << std::endl; + return 0; +} diff --git a/test/gpu_tests/test_input/obtain_gpu_cc.cu b/test/gpu_tests/test_input/obtain_gpu_cc.cu new file mode 100644 index 00000000..bdec4266 --- /dev/null +++ b/test/gpu_tests/test_input/obtain_gpu_cc.cu @@ -0,0 +1,31 @@ +#include +#include +#include + +int main(int argc, char *argv[]) { + cudaDeviceProp prop; + cudaError_t status; + int device_count; + int device_index = 0; + if (argc > 1) { + device_index = atoi(argv[1]); + } + + status = cudaGetDeviceCount(&device_count); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status)); + return -1; + } + if (device_index >= device_count) { + fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", device_index, device_count); + return -1; + } + status = cudaGetDeviceProperties(&prop, device_index); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceProperties() for device device_index failed: %s\n", cudaGetErrorString(status)); + return -1; + } + int v = prop.major * 10 + prop.minor; + printf("%d\n", v); + printf("%d\n", prop.multiProcessorCount); +} From 85fb15698370b977fa7eaec9745d10cf9d635687 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 2 Oct 2019 13:40:52 -0400 Subject: [PATCH 11/88] Added runtime library tests to GPU test suite --- CMakeLists.txt | 8 ++--- ...{runtime_lib_tests.py => all_gpu_tests.py} | 4 ++- .../gpu_tests/test_input/runtime_lib_tests.cu | 29 +++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) rename test/gpu_tests/{runtime_lib_tests.py => all_gpu_tests.py} (92%) create mode 
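To make the harness concrete: obtain_gpu_cc prints the device's compute capability followed by its multiprocessor count, and those two values are spliced into the nvcc command line. On a card reporting 70 and 80 (a V100-class GPU, used here only as an example) the tests would be compiled roughly as

nvcc -ccbin <CXX> -DNUM_CTA=80 -DCTA_SIZE=1024 -gencode arch=compute_70,code=sm_70 -std=c++11 -O3 -I <GRAPHIT_SOURCE>/src/runtime_lib/ -Xcompiler "-w" -Wno-deprecated-gpu-targets <test>.cu -o <output>

so the runtime library's <<<NUM_CTA, CTA_SIZE>>> launches aim to place roughly one 1024-thread block on each SM.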
100644 test/gpu_tests/test_input/runtime_lib_tests.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index c926448d..8a67a3cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,17 +85,17 @@ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/graphit.py find_package(CUDA QUIET) -add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/gpu_tests - COMMAND sed -e s?\$\{NVCC_COMPILER\}?${CUDA_NVCC_EXECUTABLE}?g -e s?\$\{GRAPHIT_SOURCE_DIRECTORY\}?${CMAKE_SOURCE_DIR}?g -e s?\$\{CXX_COMPILER\}?${CMAKE_CXX_COMPILER}?g -e s?\$\{GRAPHIT_BUILD_DIRECTORY\}?${CMAKE_BINARY_DIR}?g ${CMAKE_SOURCE_DIR}/test/gpu_tests/runtime_lib_tests.py > ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py - DEPENDS ${CMAKE_SOURCE_DIR}/test/gpu_tests/runtime_lib_tests.py + COMMAND sed -e s?\$\{NVCC_COMPILER\}?${CUDA_NVCC_EXECUTABLE}?g -e s?\$\{GRAPHIT_SOURCE_DIRECTORY\}?${CMAKE_SOURCE_DIR}?g -e s?\$\{CXX_COMPILER\}?${CMAKE_CXX_COMPILER}?g -e s?\$\{GRAPHIT_BUILD_DIRECTORY\}?${CMAKE_BINARY_DIR}?g ${CMAKE_SOURCE_DIR}/test/gpu_tests/all_gpu_tests.py > ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py + DEPENDS ${CMAKE_SOURCE_DIR}/test/gpu_tests/all_gpu_tests.py VERBATIM ) add_custom_target(copy_graphitc_py ALL DEPENDS ${GRAPHITC_PY}) add_custom_target(copy_python_tests ALL DEPENDS ${CMAKE_BINARY_DIR}/python_tests/test_with_schedules.py ${CMAKE_BINARY_DIR}/python_tests/test.py ${CMAKE_BINARY_DIR}/python_tests/pybind_test.py) add_custom_target(copy_graphit_py ALL DEPENDS ${CMAKE_BINARY_DIR}/graphit.py) -add_custom_target(copy_gpu_runtime_lib_tests_py ALL DEPENDS ${CMAKE_BINARY_DIR}/gpu_tests/runtime_lib_tests.py) +add_custom_target(copy_all_gpu_tests_py ALL DEPENDS ${CMAKE_BINARY_DIR}/gpu_tests/all_gpu_tests.py) configure_file(src/main.cpp ${CMAKE_BINARY_DIR}/bin/main.cpp COPYONLY) configure_file(test/library_test_drivers/library_test_driver_cpp.txt ${CMAKE_BINARY_DIR}/bin/library_test_driver.cpp COPYONLY) diff --git a/test/gpu_tests/runtime_lib_tests.py b/test/gpu_tests/all_gpu_tests.py similarity index 92% rename from test/gpu_tests/runtime_lib_tests.py rename to test/gpu_tests/all_gpu_tests.py index 706976b4..959c4699 100644 --- a/test/gpu_tests/runtime_lib_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -34,7 +34,7 @@ def get_command_output(self, command): @classmethod def setUpClass(cls): - if NVCC_COMPILER == "": + if NVCC_COMPILER == "CUDA_NVCC_EXECUTABLE-NOTFOUND": print ("Cannot find CUDA compiler") exit(-1) @@ -79,6 +79,8 @@ def test_basic_load_graph(self): output = output.split("\n") self.assertEqual(len(output), 3) self.assertEqual(output[1], "14, 106") + def test_runtime_library(self): + print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu new file mode 100644 index 00000000..d326c804 --- /dev/null +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -0,0 +1,29 @@ +#include +#include "gpu_intrinsics.h" + + +std::string graph_directory; + +class GPURuntimeLibTest: public ::testing::Test { +protected: + virtual void SetUp() { + } + virtual void TearDown() { + } + +}; +TEST_F(GPURuntimeLibTest, SimpleLoadGraphFromFileTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, 
graph_directory + "/4.mtx", false); + EXPECT_EQ (14, edges.num_vertices); +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + std::cout << "Test needs path to graph directory as first argument" << std::endl; + exit(-1); + } + graph_directory = argv[1]; + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From cde571c6c59b23d0fed4af025afa860963e44c3e Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 2 Oct 2019 16:11:57 -0400 Subject: [PATCH 12/88] Wrapped load balance functions in host wrappers --- src/backend/codegen_gpu/codegen_gpu.cpp | 62 +++++++++--------------- src/runtime_lib/infra_gpu/graph.h | 1 + src/runtime_lib/infra_gpu/load_balance.h | 32 +++++++++++- 3 files changed, 56 insertions(+), 39 deletions(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index de3c0943..b74e8a6a 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -95,33 +95,6 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } -void CodeGenGPUKernelEmitter::genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr apply_expr) { - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; - if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { - load_balance_function = "gpu_runtime::TWCE_load_balance"; - } - std::string accessor_type = "gpu_runtime::AccessorSparse"; - if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && apply_expr->to_func == "") - accessor_type = "gpu_runtime::AccessorAll"; - - std::string src_filter = "gpu_runtime::true_function"; - if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && apply_expr->to_func != "") - src_filter = apply_expr->to_func; - - std::string kernel_function_name = "gpu_operator_kernel_" + mir_context_->getUniqueNameCounterString(); - - oss << "template " << std::endl; - oss << "void __global__ " << kernel_function_name << " (gpu_runtime::GraphT graph, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; - indent(); - printIndent(); - oss << load_balance_function << "device_function << ", " << accessor_type << ", " << src_filter << "> ("; - oss << "graph, input_frontier, output_frontier);" << std::endl; - - dedent(); - printIndent(); - oss << "}" << std::endl; - apply_expr->kernel_function = kernel_function_name; -} void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { // First we generate the function that is passed to the load balancing function @@ -152,7 +125,10 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { printIndent(); oss << "}" << std::endl; apply_expr->device_function = load_balancing_arg; + // We are not generating the kernel now because we are directly using the host wrappers from the library +/* genEdgeSetGlobalKernel(apply_expr); +*/ } @@ -207,7 +183,11 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { printIndent(); oss << "}" << std::endl; apply_expr->device_function = load_balancing_arg; + + // We are not generating the kernel now because we are directly using the host wrappers from the library +/* genEdgeSetGlobalKernel(apply_expr); +*/ } void CodeGenGPU::genIncludeStmts(void) { @@ -229,7 +209,7 @@ void CodeGenGPU::visit(mir::EdgeSetType::Ptr edgeset_type) { 
edgeset_type->weight_type->accept(this); oss << ">"; } else { - oss << "gpu_runtime::GraphT"; + oss << "gpu_runtime::GraphT"; } } @@ -397,25 +377,31 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { printIndent(); assign_stmt->lhs->accept(this); oss << " = " << esae->from_func << ";" << std::endl; - printIndent(); - oss << "int32_t num_cta, cta_size;" << std::endl; - + oss << load_balance_function << "_host<"; + + mir::Var target_var = mir::to(esae->target)->var; + mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); + if (target_type->weight_type == nullptr) + oss << "int32_t"; + else + target_type->weight_type->accept(this); + std::string accessor_type = "gpu_runtime::AccessorSparse"; if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") accessor_type = "gpu_runtime::AccessorAll"; - - printIndent(); - oss << load_balance_function << "_info<" << accessor_type << ">("; - oss << esae->from_func; - oss << ", num_cta, cta_size);" << std::endl; - printIndent(); - oss << esae->kernel_function << "<<>>" << "("; + std::string src_filter = "gpu_runtime::true_function"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func != "") + src_filter = esae->to_func; + + oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; esae->target->accept(this); oss << ", " << esae->from_func << ", "; assign_stmt->lhs->accept(this); oss << ");" << std::endl; + + printIndent(); oss << "cudaDeviceSynchronize();" << std::endl; if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 25b83e4b..04574e16 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -11,6 +11,7 @@ namespace gpu_runtime { template struct GraphT { // Field names are according to CSR, reuse for CSC + typedef EdgeWeightType EdgeWeightT; int32_t num_vertices; int32_t num_edges; diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 8f1b79ad..ca28cf26 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -6,6 +6,11 @@ namespace gpu_runtime { +template +using load_balance_payload_type = void (GraphT, int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier); + + +// VERTEX SET APPLY FUNCTIONS template static void __device__ vertex_set_apply(int32_t num_vertices) { for(int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; vid < num_vertices; vid+= blockDim.x * gridDim.x) { @@ -17,7 +22,8 @@ static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { vertex_set_apply(num_vertices); } -template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> +// VERTEX BASED LOAD BALANCE FUNCTIONS +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; if (vid >= AccessorType::getSize(input_frontier)) @@ -36,6 +42,19 @@ void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t & num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } +template load_balance_payload, typename 
AccessorType, bool src_filter(int32_t)> +void __global__ vertex_based_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + vertex_based_load_balance(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ vertex_based_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + vertex_based_load_balance_info(input_frontier, num_cta, cta_size); + vertex_based_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +// TWCE LOAD BALANCE FUNCTIONS #define STAGE_1_SIZE (8) #define WARP_SIZE (32) template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> @@ -147,6 +166,17 @@ void __host__ TWCE_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWCE_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWCE_load_balance(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ TWCE_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + TWCE_load_balance_info(input_frontier, num_cta, cta_size); + TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} } From fe272bf2baa80a6fdee0aa1cb58ed566dbc83ff7 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 3 Oct 2019 10:46:39 -0400 Subject: [PATCH 13/88] setting up tests for gpu-based priority queue --- src/runtime_lib/gpu_intrinsics.h | 1 + test/gpu_tests/all_gpu_tests.py | 1 + test/gpu_tests/test_input/runtime_lib_tests.cu | 12 ++++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 204026a8..f72e6539 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -9,6 +9,7 @@ #include "infra_gpu/load_balance.h" #include "graphit_timer.h" #include "infra_gpu/support.h" +#include "infra_gpu/gpu_priority_queue.h" namespace gpu_runtime { diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 959c4699..830bd5c7 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -17,6 +17,7 @@ def get_command_output_class(self, command): if isinstance(command, list): proc = subprocess.Popen(command, stdout=subprocess.PIPE) else: + print(command) proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) exitcode = proc.wait() for line in proc.stdout.readlines(): diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu index d326c804..da3f1467 100644 --- a/test/gpu_tests/test_input/runtime_lib_tests.cu +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -1,7 +1,6 @@ #include #include "gpu_intrinsics.h" - std::string graph_directory; class GPURuntimeLibTest: public ::testing::Test { @@ -16,7 +15,16 @@ TEST_F(GPURuntimeLibTest, SimpleLoadGraphFromFileTest) { gpu_runtime::GraphT edges; gpu_runtime::load_graph(edges, graph_directory + "/4.mtx", false); EXPECT_EQ (14, edges.num_vertices); -} +} + +TEST_F(GPURuntimeLibTest, SimplePriorityQueueTest){ + 
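	// Exercises construction of the GPUPriorityQueue added in the following patch:
	// it wraps a per-vertex priority array plus a delta (default 1), but at this point
	// update_current_priority() and finished() are still stubs, so the test only checks
	// that the graph loads with the expected number of vertices.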
gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/4.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + int* priorities = new int[num_vertices]; + gpu_runtime::GPUPriorityQueue pq = gpu_runtime::GPUPriorityQueue(priorities); + EXPECT_EQ (14, num_vertices); +} int main(int argc, char* argv[]) { if (argc < 2) { From 6532105778342f45940dfcb71bc8076dd5184419 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 3 Oct 2019 11:25:25 -0400 Subject: [PATCH 14/88] adding the file for gpu priority queue --- .../infra_gpu/gpu_priority_queue.h | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/runtime_lib/infra_gpu/gpu_priority_queue.h diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h new file mode 100644 index 00000000..1ed2aa52 --- /dev/null +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -0,0 +1,42 @@ +#ifndef GPU_PRIORITY_QUEUE_H +#define GPU_PRIORITY_QUEUE_H + +#include +#include +#include "vertex_frontier.h" + +namespace gpu_runtime { + + template + class GPUPriorityQueue { + + public: + explicit GPUPriorityQueue(PriorityT_* priorities, PriorityT_ delta=1) + : priorities_(priorities), delta_(delta){ + } + + size_t get_current_priority(){ + return current_priority_; + } + + void update_current_priority(PriorityT_ priority_change_){ + + } + + bool finished() { + //TODO + return true; + } + + bool finishedNode(NodeID v){ + return priorities_[v]/delta_ < get_current_priority();; + } + + PriorityT_* priorities_; + PriorityT_ delta_; + PriorityT_ current_priority_; + }; +} + + +#endif // GPU_PRIORITY_QUEUE_H From 4811023387b9c7c2c6262a1be26fd3ec77b8a7af Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 3 Oct 2019 17:53:16 -0400 Subject: [PATCH 15/88] start working on a sssp_delta_stepping gpu runtime library test --- test/gpu_tests/all_gpu_tests.py | 7 + .../test_input/sssp_delta_stepping.cu | 342 ++++++++++++++++++ 2 files changed, 349 insertions(+) create mode 100644 test/gpu_tests/test_input/sssp_delta_stepping.cu diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 830bd5c7..8d116356 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -83,5 +83,12 @@ def test_basic_load_graph(self): def test_runtime_library(self): print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) + def test_sssp_delta_stepping(self): + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "2"]) + + if __name__ == '__main__': unittest.main() + #suite = unittest.TestSuite() + #suite.addTest(TestGraphitCompiler('test_sssp_delta_stepping')) + #unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu new file mode 100644 index 00000000..9374888c --- /dev/null +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -0,0 +1,342 @@ +#include "gpu_intrinsics.h" +#include + +#define ITER_COUNT (1) +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include +#include +#include + + +typedef struct { + int32_t *SP; + + int32_t *frontier1; + int32_t *frontier2; + + int32_t *frontier1_size; + int32_t *frontier2_size; + int32_t *iters; + + int32_t *output_size; + + int32_t num_blocks; + + int32_t *node_borders; + int32_t *edge_borders; + + int32_t *worklist; + int32_t 
*old_indices; +}algo_state; + +//struct timeval start_time_; +//struct timeval elapsed_time_; + +// void startTimer(){ +// gettimeofday(&start_time_, NULL); +// } + +// float stopTimer(){ +// gettimeofday(&elapsed_time_, NULL); +// elapsed_time_.tv_sec -= start_time_.tv_sec; +// elapsed_time_.tv_usec -= start_time_.tv_usec; +// return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; +// } + +void cudaCheckLastError(void) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(err)); +} + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int id = num_threads * i + thread_id; + if (id < total_work) { + device_state.SP[id] = INT_MAX; + device_state.iters[id] = 0; + } + } + if (thread_id == 0) { + device_state.SP[0] = 0; + device_state.frontier1[0] = 0; + *device_state.frontier1_size = 1; + *device_state.frontier2_size = 0; + } +} +__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(__activemask(), v, leader); } +__device__ inline int atomicAggInc(int *ctr) { + int32_t lane_id = threadIdx.x % 32; + + int mask = __activemask(); + int leader = __ffs(mask) - 1; + int res; + if(lane_id == leader) + res = atomicAdd(ctr, __popc(mask)); + res = warp_bcast(res, leader); + + return (res + __popc(mask & ((1 << lane_id) - 1))); +} +__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t curr_iter) { + if (device_state.iters[v] == curr_iter) + return; + device_state.iters[v] = curr_iter; + int32_t pos = atomicAggInc(device_state.frontier2_size); + device_state.frontier2[pos] = v; +} + +void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + + int lane_id = thread_id % 32; + + __shared__ int32_t stage2_queue[CTA_SIZE]; + __shared__ int32_t stage3_queue[CTA_SIZE]; + __shared__ int32_t stage_queue_sizes[3]; + if (threadIdx.x == 0) { + stage_queue_sizes[0] = 0; + stage_queue_sizes[1] = 0; + stage_queue_sizes[2] = 0; + } + __syncthreads(); + + + __shared__ int32_t stage2_offset[CTA_SIZE]; + __shared__ int32_t stage3_offset[CTA_SIZE]; + + __shared__ int32_t stage2_size[CTA_SIZE]; + __shared__ int32_t stage3_size[CTA_SIZE]; + + + int32_t total_vertices = device_state.frontier1_size[0]; + + int32_t my_vertex_idx = thread_id / (STAGE_1_SIZE); + int32_t d; + int32_t s1_offset; + int32_t my_vertex; + int32_t row_offset; + if (my_vertex_idx < total_vertices) { + my_vertex = device_state.frontier1[my_vertex_idx]; + // Step 1 segreggate vertices into shared buffers + if (thread_id % (STAGE_1_SIZE) == 0 ) { + d = graph.d_get_degree(my_vertex); + row_offset = graph.d_src_offsets[my_vertex]; + int32_t s3_size = d/CTA_SIZE; + d = d - s3_size * CTA_SIZE; + if (s3_size) { + int32_t pos = atomicAggInc(&stage_queue_sizes[2]); + stage3_queue[pos] = my_vertex; + stage3_size[pos] = s3_size * CTA_SIZE; + // stage3_offset[pos] = 0; // Not required because always 0 + stage3_offset[pos] = row_offset; + } + + int32_t s2_size = d/WARP_SIZE; + d = d - s2_size * WARP_SIZE; + + if (s2_size) { + 
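	// Stage-2 binning: after whole CTA-sized chunks of this vertex's edges have been
	// carved off for stage 3, any remaining edges that fill at least one warp are
	// queued here. atomicAggInc() is a warp-aggregated atomicAdd, so the active lanes
	// reserve their shared-queue slots with a single atomic issued by the leader lane;
	// the stored offset starts just past the stage-3 portion of the adjacency list.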
int32_t pos = atomicAggInc(&stage_queue_sizes[1]); + stage2_queue[pos] = my_vertex; + stage2_offset[pos] = s3_size * CTA_SIZE + row_offset; + stage2_size[pos] = s2_size * WARP_SIZE; + } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + row_offset; + } + }else + my_vertex = -1; + + __syncthreads(); + + d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + int32_t src_distance; + if (my_vertex_idx < total_vertices) { + // STAGE 1 + my_vertex = device_state.frontier1[my_vertex_idx]; + src_distance = device_state.SP[my_vertex]; + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { + // DO ACTUAL SSSP + int32_t dst = graph.d_edge_dst[neigh_id]; + int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + // STAGE 2 -- stage 2 is dynamically balanced + while (1) { + int32_t to_process; + if (lane_id == 0) { + to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; + } + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + if (to_process < 0) + break; + my_vertex = stage2_queue[to_process]; + d = stage2_size[to_process]; + int32_t s2_offset = stage2_offset[to_process]; + src_distance = device_state.SP[my_vertex]; + + for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + + // STAGE 3 -- all threads have to do all, no need for LB + for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid ++) { + my_vertex = stage3_queue[wid]; + d = stage3_size[wid]; + int32_t s3_offset = stage3_offset[wid]; + src_distance = device_state.SP[my_vertex]; + + for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } +} +void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < graph.num_vertices) { + if (device_state.frontier2[node_id]) { + device_state.frontier2[node_id] = 0; + int pos = atomicAdd(device_state.frontier1_size, 1); + device_state.frontier1[pos] = node_id; + } + } + } + +} +void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { + host_state.SP = new int[graph.num_vertices]; + host_state.output_size = new int32_t[1]; + + host_state.frontier1_size = new int32_t[1]; + host_state.frontier1 = new int32_t[graph.num_vertices]; + + + cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); + + cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 
6); + cudaMalloc(&device_state.frontier2, sizeof(int32_t)*graph.num_vertices * 6); + cudaMalloc(&device_state.iters, sizeof(int32_t)*graph.num_vertices); + + cudaMalloc(&device_state.frontier1_size, sizeof(int32_t)); + cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); + + cudaMalloc(&device_state.output_size, sizeof(int32_t)); + + + cudaMalloc(&device_state.worklist, sizeof(int32_t)); +} + +void swap_pointers(int32_t **a, int32_t **b) { + int32_t* t = *a; + *a = *b; + *b = t; +} +void swap_queues(algo_state &device_state) { + swap_pointers(&device_state.frontier1, &device_state.frontier2); + swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); +} +int main(int argc, char *argv[]) { + cudaSetDevice(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + gpu_runtime::GraphT graph; + gpu_runtime::load_graph(graph, argv[1], false); + + algo_state host_state, device_state; + + allocate_state(host_state, device_state, graph); + + cudaDeviceSynchronize(); + + float total_time = 0; + for (int outer = 0; outer < ITER_COUNT; outer++) { + float iter_total = 0; + startTimer(); + + startTimer(); + init_kernel<<>>(graph, device_state); + int iters = 0; + cudaDeviceSynchronize(); + float t = stopTimer(); + printf("Init time = %f\n", t); + iter_total+=t; + + host_state.frontier1_size[0] = 1; + while(*host_state.frontier1_size) { + startTimer(); + iters++; + int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + update_edges<<>>(graph, device_state, iters); + + host_state.frontier1_size[0] = 0; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + + swap_queues(device_state); + + cudaCheckLastError(); + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t), cudaMemcpyDeviceToHost); + + t = stopTimer(); + printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + iter_total += t; + } + + printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); + total_time += iter_total; + + } + printf("Total time = %f\n", total_time); + if (argc > 2) + if (argv[2][0] == 'o'){ + FILE *output = fopen("output.txt", "w"); + cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + for (int i = 0; i < graph.num_vertices; i++) + fprintf(output, "%d, %d\n", i, host_state.SP[i]); + }else if (argv[2][0] == 'c'){ + /* + for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) + printf("%d: %d\n", i, counters[i]); + */ + } + + return 0; + +} From 8b9f8c73fa1b023b742966d033cc841748cb0a9e Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 3 Oct 2019 19:47:19 -0400 Subject: [PATCH 16/88] Basic Kernel fusion implemented. Compiling for SSSP. 
Need to get operator lowering done --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 50 +++++- include/graphit/midend/mir.h | 3 + include/graphit/midend/mir_context.h | 4 + include/graphit/midend/while_loop_fusion.h | 24 +++ src/backend/codegen_gpu/codegen_gpu.cpp | 154 ++++++++++++++++-- src/midend/mir_lower.cpp | 4 + src/midend/while_loop_fusion.cpp | 34 ++++ src/runtime_lib/gpu_intrinsics.h | 5 + src/runtime_lib/infra_gpu/vertex_frontier.h | 3 + 9 files changed, 267 insertions(+), 14 deletions(-) create mode 100644 include/graphit/midend/while_loop_fusion.h create mode 100644 src/midend/while_loop_fusion.cpp diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 64fe7225..cc6bec59 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace graphit { class CodeGenGPUKernelEmitter: public mir::MIRVisitor { @@ -30,6 +31,7 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); }; + class CodeGenGPU : public mir::MIRVisitor{ public: CodeGenGPU(std::ostream &input_oss, MIRContext *mir_context, std::string module_name_, std::string module_path): @@ -61,6 +63,8 @@ class CodeGenGPU : public mir::MIRVisitor{ void genPropertyArrayDecl(mir::VarDecl::Ptr); void genPropertyArrayAlloca(mir::VarDecl::Ptr); + + void genFusedWhileLoop(mir::WhileStmt::Ptr); EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_; @@ -68,6 +72,16 @@ class CodeGenGPU : public mir::MIRVisitor{ return "__device__"; } + std::vector kernel_hoisted_vars; + std::string current_kernel_name; + bool is_hoisted_var (mir::Var var) { + for (auto h_var: kernel_hoisted_vars) + if (h_var.getName() == var.getName()) + return true; + return false; + } + + void generateBinaryExpr(mir::BinaryExpr::Ptr, std::string); protected: virtual void visit(mir::EdgeSetType::Ptr) override; @@ -121,11 +135,43 @@ class CodeGenGPUHost: public CodeGenGPU { virtual std::string getBackendFunctionLabel(void) { return "__host__"; } - virtual void visit(mir::TensorArrayReadExpr::Ptr); - virtual void visit(mir::StmtBlock::Ptr); + virtual void visit(mir::TensorArrayReadExpr::Ptr) override; + virtual void visit(mir::StmtBlock::Ptr) override; + + virtual void visit(mir::Call::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; + + + void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); void generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare); }; + +class CodeGenGPUFusedKernel: public CodeGenGPU { +public: + using CodeGenGPU::CodeGenGPU; + using CodeGenGPU::visit; + virtual void visit(mir::StmtBlock::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + virtual void visit(mir::VarDecl::Ptr) override; + virtual void visit(mir::PrintStmt::Ptr) override; +}; + +class KernelVariableExtractor: public mir::MIRVisitor { +public: + using mir::MIRVisitor::visit; + std::vector hoisted_vars; + + void insertVar(mir::Var var_to_insert) { + for (auto var: hoisted_vars) + if (var.getName() == var_to_insert.getName()) + return; + hoisted_vars.push_back(var_to_insert); + } + + virtual void visit(mir::VarExpr::Ptr); +}; + } #endif diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index a28a1be0..7789a492 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -414,6 +414,9 @@ namespace graphit { typedef std::shared_ptr Ptr; + bool 
is_fused; + std::string fused_kernel_name; + virtual void accept(MIRVisitor *visitor) { visitor->visit(self()); } diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 5971193e..48293cbe 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -422,6 +422,10 @@ namespace graphit { std::vector types_requiring_typedef; + + // Used by kernel fusion optimization + std::vector fused_while_loops; + }; } diff --git a/include/graphit/midend/while_loop_fusion.h b/include/graphit/midend/while_loop_fusion.h new file mode 100644 index 00000000..27a46427 --- /dev/null +++ b/include/graphit/midend/while_loop_fusion.h @@ -0,0 +1,24 @@ +#ifndef WHILE_LOOP_FUSION_H +#define WHILE_LOOP_FUSION_H + +#include +#include +#include + +namespace graphit { + +struct WhileLoopFusion: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + WhileLoopFusion(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower(void); +protected: + virtual void visit(mir::WhileStmt::Ptr); +private: + Schedule *schedule_ = nullptr; + MIRContext *mir_context_ = nullptr; +}; + +} + +#endif diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index b74e8a6a..93125013 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -39,6 +39,10 @@ int CodeGenGPU::genGPU() { CodeGenGPUKernelEmitter kernel_emitter(oss, mir_context_); for (auto function: functions) function->accept(&kernel_emitter); + + // All the fused kernels need to generated before we can acutally generate the functions + for (auto while_loop: mir_context_->fused_while_loops) + genFusedWhileLoop(while_loop); for (auto function: functions) { if (function->function_context & mir::FuncDecl::function_context_type::CONTEXT_DEVICE) @@ -94,7 +98,63 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } +void KernelVariableExtractor::visit(mir::VarExpr::Ptr var_expr) { + insertVar(var_expr->var); +} +void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { + // First we generate a unique function name for this fused kernel + std::string fused_kernel_name = "fused_kernel_body_" + mir_context_->getUniqueNameCounterString(); + while_stmt->fused_kernel_name = fused_kernel_name; + // Now we extract the list of variables that are used in the kernel that are not const + // So we can hoist them + KernelVariableExtractor extractor; + while_stmt->accept(&extractor); + + CodeGenGPUFusedKernel codegen (oss, mir_context_, module_name, ""); + + oss << "// "; + for (auto var: extractor.hoisted_vars) + oss << var.getName() << " "; + oss << std::endl; + + for (auto var: extractor.hoisted_vars) { + var.getType()->accept(this); + oss << " __device__ " << fused_kernel_name << "_" << var.getName() << ";" << std::endl; + } + codegen.kernel_hoisted_vars = extractor.hoisted_vars; + codegen.current_kernel_name = fused_kernel_name; + + oss << "void __global__ " << fused_kernel_name << "(void) {" << std::endl; + codegen.indent(); + codegen.printIndent(); + oss << "grid_group _grid = this_grid();" << std::endl; + codegen.printIndent(); + oss << "int32_t _thread_id = threadIdx.x + blockIdx.x * blockDim.x;" << std::endl; + codegen.printIndent(); + oss << "while ("; + while_stmt->cond->accept(&codegen); + oss << ") {" << std::endl; + codegen.indent(); + while_stmt->body->accept(&codegen); + codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; + 
codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; + + codegen.kernel_hoisted_vars.clear(); +} +void CodeGenGPUFusedKernel::visit(mir::StmtBlock::Ptr stmt_block) { + for (auto stmt : *(stmt_block->stmts)) { + stmt->accept(this); + if (!mir::isa(stmt)) { + printIndent(); + oss << "_grid.sync();" << std::endl; + } + } +} void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { // First we generate the function that is passed to the load balancing function @@ -106,10 +166,16 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { indent(); printIndent(); oss << "// Body of the actual operator code" << std::endl; - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); + if (input_function->args.size() == 3) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; + } else { + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(src, dst)) {" << std::endl; + } indent(); printIndent(); if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) @@ -163,11 +229,17 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { dedent(); } - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - // Order is reversed here because PULL direction - oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); + if (input_function->args.size() == 3) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; + } else { + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(dst, src)) {" << std::endl; + } + indent(); printIndent(); if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) @@ -192,6 +264,8 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { void CodeGenGPU::genIncludeStmts(void) { oss << "#include \"gpu_intrinsics.h\"" << std::endl; + oss << "#include " << std::endl; + oss << "using namespace cooperative_groups;" << std::endl; } @@ -330,6 +404,10 @@ void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { oss << ";" << std::endl; } void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { + if (is_hoisted_var(var_expr->var)) { + oss << current_kernel_name << "_" << var_expr->var.getName(); + return; + } oss << var_expr->var.getName(); } void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { @@ -442,6 +520,22 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { } } +void CodeGenGPUFusedKernel::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->expr)) { + // Will be handled later + } else { + printIndent(); + oss << "if (_thread_id == 0) " << std::endl; + indent(); + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + 
assign_stmt->expr->accept(this); + oss << ";" << std::endl; + dedent(); + } +} + void CodeGenGPU::generateBinaryExpr(mir::BinaryExpr::Ptr expr, std::string token) { oss << "("; expr->lhs->accept(this); @@ -606,6 +700,9 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { oss << ";" << std::endl; +} +void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { + // Do nothing for variable declarations on kernel } void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { oss << "gpu_runtime::dedup_frontier("; @@ -629,6 +726,11 @@ void CodeGenGPU::visit(mir::ForStmt::Ptr for_stmt) { oss << "}" << std::endl; } void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { + if (while_stmt->is_fused == true) { + printIndent(); + oss << "cudaLaunchCooperativeKernel((void*)" << while_stmt->fused_kernel_name << ", NUM_CTA, CTA_SIZE, gpu_runtime::no_args);" << std::endl; + return; + } printIndent(); oss << "while ("; while_stmt->cond->accept(this); @@ -659,13 +761,24 @@ void CodeGenGPU::visit(mir::IfStmt::Ptr if_stmt) { } oss << std::endl; } -void CodeGenGPU::visit(mir::PrintStmt::Ptr print_stmt) { +void CodeGenGPUHost::visit(mir::PrintStmt::Ptr print_stmt) { printIndent(); oss << "std::cout << "; print_stmt->expr->accept(this); oss << " << std::endl;" << std::endl; } -void CodeGenGPU::visit(mir::Call::Ptr call_expr) { +void CodeGenGPU::visit(mir::PrintStmt::Ptr print_stmt) { + assert(false && "Cannot print from device function\n"); +} +void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + oss << "printf(\"There is supposed to be a print here\\n\");" << std::endl; + dedent(); +} +void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") oss << "gpu_runtime::" << call_expr->name << "("; else @@ -680,6 +793,23 @@ void CodeGenGPU::visit(mir::Call::Ptr call_expr) { } oss << ")"; } + +void CodeGenGPU::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") + oss << "gpu_runtime::device_" << call_expr->name << "("; + else + oss << call_expr->name << "("; + + bool printDelimeter = false; + for (auto arg: call_expr->args) { + if (printDelimeter) + oss << ", "; + arg->accept(this); + printDelimeter = true; + } + oss << ")"; +} + void CodeGenGPU::visit(mir::EqExpr::Ptr eq_expr) { oss << "("; eq_expr->operands[0]->accept(this); @@ -741,7 +871,7 @@ void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { mir::Expr::Ptr size_expr = mir_context_->getElementCount(vsae->element_type); oss << "gpu_runtime::create_new_vertex_set("; size_expr->accept(this); - oss << ")" << std::endl; + oss << ")"; } void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { printIndent(); diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index e333e60b..c988a57f 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace graphit { /** @@ -64,6 +65,9 @@ namespace graphit { // This pass extracts the merge field and reduce operator. 
If numa_aware is set to true in // the schedule for the corresponding label, it also adds NUMA optimization MergeReduceLower(mir_context, schedule).lower(); + + // This pass lowers while loops that have fusion schedule attached to them + WhileLoopFusion(mir_context, schedule).lower(); } } diff --git a/src/midend/while_loop_fusion.cpp b/src/midend/while_loop_fusion.cpp new file mode 100644 index 00000000..1231358c --- /dev/null +++ b/src/midend/while_loop_fusion.cpp @@ -0,0 +1,34 @@ +#include + +void graphit::WhileLoopFusion::lower(void) { + std::vector functions = mir_context_->getFunctionList(); + for (auto function : functions) { + function->accept(this); + } +} +void graphit::WhileLoopFusion::visit(mir::WhileStmt::Ptr while_stmt) { + if (while_stmt->stmt_label != "") { + label_scope_.scope(while_stmt->stmt_label); + } + while_stmt->cond->accept(this); + while_stmt->body->accept(this); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule)) { + auto applied_simple_schedule = dynamic_cast(apply_schedule); + if (applied_simple_schedule->kernel_fusion == fir::gpu_schedule::SimpleGPUSchedule::kernel_fusion_type::FUSION_ENABLED) + while_stmt->is_fused = true; + mir_context_->fused_while_loops.push_back(while_stmt); + } + } + } + + if (while_stmt->stmt_label != "") { + label_scope_.unscope(); + } + +} + diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 204026a8..383dc31e 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -17,6 +17,11 @@ static void deleteObject(T &t) { // Currently deleteObject is empty } +template +static __device__ void device_deleteObject(T &t) { + // Currently deleteObject is empty +} +static void * no_args[1]; } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 6d1889fe..88cd8006 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -35,6 +35,9 @@ static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); return curr_size; } +static int32_t __device__ device_builtin_getVertexSetSize(VertexFrontier &frontier) { + return frontier.d_num_elems_input[0]; +} class AccessorSparse { public: static int32_t __device__ getSize(VertexFrontier &frontier) { From 0439720d6293f9124e7b3dd493ddf502328634e5 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 4 Oct 2019 16:48:36 -0400 Subject: [PATCH 17/88] Added hoisted decls for fused kernel generation and printing support --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 5 ++ include/graphit/midend/mir.h | 2 + src/backend/codegen_gpu/codegen_gpu.cpp | 54 ++++++++++++++++++- src/runtime_lib/gpu_intrinsics.h | 1 + src/runtime_lib/infra_gpu/printer.h | 17 ++++++ 5 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 src/runtime_lib/infra_gpu/printer.h diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index cc6bec59..6bcabd82 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ 
-162,6 +162,7 @@ class KernelVariableExtractor: public mir::MIRVisitor { public: using mir::MIRVisitor::visit; std::vector hoisted_vars; + std::vector hoisted_decls; void insertVar(mir::Var var_to_insert) { for (auto var: hoisted_vars) @@ -169,8 +170,12 @@ class KernelVariableExtractor: public mir::MIRVisitor { return; hoisted_vars.push_back(var_to_insert); } + void insertDecl(mir::VarDecl::Ptr decl_to_insert) { + hoisted_decls.push_back(decl_to_insert); + } virtual void visit(mir::VarExpr::Ptr); + virtual void visit(mir::VarDecl::Ptr); }; } diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 7789a492..03d88290 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -416,6 +416,8 @@ namespace graphit { bool is_fused; std::string fused_kernel_name; + std::vector hoisted_vars; + std::vector> hoisted_decls; virtual void accept(MIRVisitor *visitor) { visitor->visit(self()); diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 93125013..1bc3e042 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -101,6 +101,9 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { void KernelVariableExtractor::visit(mir::VarExpr::Ptr var_expr) { insertVar(var_expr->var); } +void KernelVariableExtractor::visit(mir::VarDecl::Ptr var_decl) { + insertDecl(var_decl); +} void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { // First we generate a unique function name for this fused kernel std::string fused_kernel_name = "fused_kernel_body_" + mir_context_->getUniqueNameCounterString(); @@ -110,6 +113,9 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { // So we can hoist them KernelVariableExtractor extractor; while_stmt->accept(&extractor); + + while_stmt->hoisted_vars = extractor.hoisted_vars; + while_stmt->hoisted_decls = extractor.hoisted_decls; CodeGenGPUFusedKernel codegen (oss, mir_context_, module_name, ""); @@ -702,7 +708,15 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { } void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { - // Do nothing for variable declarations on kernel + // Do nothing for variable declarations on kernel only lower the initialization as assignment + if (var_decl->initVal != nullptr) { + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + oss << var_decl->name << " = "; + var_decl->initVal->accept(this); + } } void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { oss << "gpu_runtime::dedup_frontier("; @@ -727,8 +741,41 @@ void CodeGenGPU::visit(mir::ForStmt::Ptr for_stmt) { } void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { if (while_stmt->is_fused == true) { + /* + for (auto decl: while_stmt->hoisted_decls) { + printIndent(); + decl->type->accept(this); + oss << " " << decl->name << ";" << std::endl; + } + */ + for (auto var: while_stmt->hoisted_vars) { + bool to_copy = true; + for (auto decl: while_stmt->hoisted_decls) { + if (decl->name == var.getName()) { + to_copy = false; + break; + } + } + if (!to_copy) + continue; + printIndent(); + oss << "cudaMemcpyToSymbol(" << while_stmt->fused_kernel_name << "_" << var.getName() << ", &" << var.getName() << ", sizeof(" << var.getName() << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } printIndent(); oss << "cudaLaunchCooperativeKernel((void*)" << while_stmt->fused_kernel_name << ", NUM_CTA, CTA_SIZE, gpu_runtime::no_args);" << std::endl; + for (auto var: 
while_stmt->hoisted_vars) { + bool to_copy = true; + for (auto decl: while_stmt->hoisted_decls) { + if (decl->name == var.getName()) { + to_copy = false; + break; + } + } + if (!to_copy) + continue; + printIndent(); + oss << "cudaMemcpyFromSymbol(&" << var.getName() << ", " << while_stmt->fused_kernel_name << "_" << var.getName() << ", sizeof(" << var.getName() << "), 0, cudaMemcpyDeviceToHost);" << std::endl; + } return; } printIndent(); @@ -775,7 +822,10 @@ void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { oss << "if (_thread_id == 0)" << std::endl; indent(); printIndent(); - oss << "printf(\"There is supposed to be a print here\\n\");" << std::endl; + //oss << "printf(\"There is supposed to be a print here\\n\");" << std::endl; + oss << "gpu_runtime::print("; + print_stmt->expr->accept(this); + oss << ");" << std::endl; dedent(); } void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index ba8c95b8..66b197f4 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -9,6 +9,7 @@ #include "infra_gpu/load_balance.h" #include "graphit_timer.h" #include "infra_gpu/support.h" +#include "infra_gpu/printer.h" #include "infra_gpu/gpu_priority_queue.h" namespace gpu_runtime { diff --git a/src/runtime_lib/infra_gpu/printer.h b/src/runtime_lib/infra_gpu/printer.h new file mode 100644 index 00000000..5b9cda01 --- /dev/null +++ b/src/runtime_lib/infra_gpu/printer.h @@ -0,0 +1,17 @@ +#ifndef GRAPHIT_GPU_PRINTER +#define GRAPHIT_GPU_PRINTER +#include + +namespace gpu_runtime { +void __device__ print(int32_t val) { + printf("%d\n", val); +} +void __device__ print(float val) { + printf("%f\n", val); +} +void __device__ print(const char* val) { + printf("%s\n", val); +} +} + +#endif From 7198946eec00d21fa059b0191fd8cb1ae3a35a47 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 7 Oct 2019 15:59:38 -0400 Subject: [PATCH 18/88] adding correctness test for sssp, setting up the first verifier --- src/runtime_lib/infra_gpu/graph.h | 2 +- test/gpu_tests/all_gpu_tests.py | 19 +++++++++++++------ .../test_input/sssp_delta_stepping.cu | 18 ++++++++++-------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 04574e16..1fcaaf8c 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -112,7 +112,7 @@ static void load_graph(GraphT &graph, std::string filename, bool cudaMemcpy(graph.d_edge_dst, graph.h_edge_dst, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); cudaMemcpy(graph.d_edge_weight, graph.h_edge_weight, sizeof(EdgeWeightType) * graph.num_edges, cudaMemcpyHostToDevice); cudaMemcpy(graph.d_src_offsets, graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); - std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; + //std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; } template diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 8d116356..662e417a 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -33,6 +33,12 @@ def get_command_output(self, command): self.assertEqual(exitcode, 0) return output + def sssp_verified_test(self, input_file_name): + self.cpp_compile_test(input_file_name, []) + self.get_command_output(self.executable_name + " " + 
self.graph_directory + "/4.wel v > verifier_input ") + self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") + + @classmethod def setUpClass(cls): if NVCC_COMPILER == "CUDA_NVCC_EXECUTABLE-NOTFOUND": @@ -41,6 +47,7 @@ def setUpClass(cls): cls.build_directory = GRAPHIT_BUILD_DIRECTORY cls.scratch_directory = GRAPHIT_BUILD_DIRECTORY + "/scratch" + cls.verifier_directory = cls.build_directory + "/bin" if os.path.isdir(cls.scratch_directory): shutil.rmtree(cls.scratch_directory) os.mkdir(cls.scratch_directory) @@ -62,9 +69,7 @@ def setUpClass(cls): shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") cls.graph_directory = cls.scratch_directory + "/graphs" - - cls.executable_name = cls.scratch_directory + "/test_exectuable" - + cls.executable_name = cls.scratch_directory + "/test_exectuable" def cpp_compile_test(self, input_file_name, extra_cpp_args=[]): compile_command = self.nvcc_command + self.test_input_directory + "/" + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) self.get_command_output(compile_command) @@ -78,14 +83,16 @@ def test_basic_compile(self): def test_basic_load_graph(self): output = self.cpp_exec_test("basic_load_graph.cu", [], [self.graph_directory + "/4.mtx"]) output = output.split("\n") - self.assertEqual(len(output), 3) - self.assertEqual(output[1], "14, 106") + self.assertEqual(len(output), 2) + self.assertEqual(output[0], "14, 106") def test_runtime_library(self): print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) def test_sssp_delta_stepping(self): - self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "2"]) + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "v"]) + def test_sssp_delta_stepping_verified(self): + self.sssp_verified_test("sssp_delta_stepping.cu") if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 9374888c..8bcf7daa 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -72,6 +72,7 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic } if (thread_id == 0) { device_state.SP[0] = 0; + //starting point is set to 0 device_state.frontier1[0] = 0; *device_state.frontier1_size = 1; *device_state.frontier2_size = 0; @@ -293,7 +294,7 @@ int main(int argc, char *argv[]) { int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); - printf("Init time = %f\n", t); + //printf("Init time = %f\n", t); iter_total+=t; host_state.frontier1_size[0] = 1; @@ -314,22 +315,23 @@ int main(int argc, char *argv[]) { cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t), cudaMemcpyDeviceToHost); t = stopTimer(); - printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); iter_total += t; } - printf("Num iters = %d\n", iters); - printf("Time elapsed = %f\n", iter_total); + //printf("Num iters = %d\n", iters); + //printf("Time elapsed = %f\n", iter_total); total_time += iter_total; } - printf("Total time = %f\n", total_time); + //printf("Total time = 
%f\n", total_time); if (argc > 2) - if (argv[2][0] == 'o'){ - FILE *output = fopen("output.txt", "w"); + if (argv[2][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); for (int i = 0; i < graph.num_vertices; i++) - fprintf(output, "%d, %d\n", i, host_state.SP[i]); + //fprintf(output, "%d, %d\n", i, host_state.SP[i]); + printf("%d\n", host_state.SP[i]); }else if (argv[2][0] == 'c'){ /* for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) From 4f29384ec6867dae0c6f83abde5fb3f149abaebe Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 7 Oct 2019 17:41:36 -0400 Subject: [PATCH 19/88] Basic Kernel fusion working for SSSP TWCE push unfused --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 8 +- src/backend/codegen_gpu/codegen_gpu.cpp | 285 +++++++++++++----- src/runtime_lib/gpu_intrinsics.h | 1 + src/runtime_lib/infra_gpu/load_balance.h | 32 +- src/runtime_lib/infra_gpu/support.h | 4 + src/runtime_lib/infra_gpu/vertex_frontier.h | 120 ++------ .../infra_gpu/vertex_representation.h | 120 ++++++++ 7 files changed, 406 insertions(+), 164 deletions(-) create mode 100644 src/runtime_lib/infra_gpu/vertex_representation.h diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 6bcabd82..a077584a 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -65,13 +65,14 @@ class CodeGenGPU : public mir::MIRVisitor{ void genPropertyArrayAlloca(mir::VarDecl::Ptr); void genFusedWhileLoop(mir::WhileStmt::Ptr); + void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_; virtual std::string getBackendFunctionLabel(void) { return "__device__"; } - +protected: std::vector kernel_hoisted_vars; std::string current_kernel_name; bool is_hoisted_var (mir::Var var) { @@ -152,10 +153,15 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { public: using CodeGenGPU::CodeGenGPU; using CodeGenGPU::visit; + void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); virtual void visit(mir::StmtBlock::Ptr) override; virtual void visit(mir::AssignStmt::Ptr) override; virtual void visit(mir::VarDecl::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; + + std::string var_name (std::string var) { + return current_kernel_name + "_" + var; + } }; class KernelVariableExtractor: public mir::MIRVisitor { diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 1bc3e042..ccf78ad2 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -416,107 +416,246 @@ void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { } oss << var_expr->var.getName(); } -void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { - if (mir::isa(assign_stmt->expr)) { - mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); - if (esae->from_func == "") { - assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); - } - // We will assume that the output frontier can reuse the input frontier. 
- // TOOD: Add liveness analysis for this - printIndent(); - oss << "{" << std::endl; - indent(); - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; - if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { - load_balance_function = "gpu_runtime::TWCE_load_balance"; - } - - if (mir::isa(esae)) { +void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { + if (target != nullptr && esae->from_func == "") { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + // We will assume that the output frontier can reuse the input frontier. + // TOOD: Add liveness analysis for this + printIndent(); + oss << "{" << std::endl; + indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } + + if (mir::isa(esae)) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << esae->from_func; + oss << ");" << std::endl; + } else if (mir::isa(esae)) { + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { printIndent(); - oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << "gpu_runtime::vertex_set_prepare_boolmap("; oss << esae->from_func; oss << ");" << std::endl; - } else if (mir::isa(esae)) { - if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_boolmap("; - oss << esae->from_func; - oss << ");" << std::endl; - } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_bitmap("; - oss << esae->from_func; - oss << ");" << std::endl; - } + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap("; + oss << esae->from_func; + oss << ");" << std::endl; + } - std::string to_func = esae->to_func; - if (to_func != "") { - printIndent(); - oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; - oss << esae->from_func << ");" << std::endl; - } - + std::string to_func = esae->to_func; + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; + oss << esae->from_func << ");" << std::endl; } + + } + if (target != nullptr) { printIndent(); - assign_stmt->lhs->accept(this); + target->accept(this); oss << " = " << esae->from_func << ";" << std::endl; + } - printIndent(); - oss << load_balance_function << "_host<"; - - mir::Var target_var = mir::to(esae->target)->var; - mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); - if (target_type->weight_type == nullptr) - oss << "int32_t"; - else - target_type->weight_type->accept(this); - - std::string accessor_type = "gpu_runtime::AccessorSparse"; - if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") - accessor_type = "gpu_runtime::AccessorAll"; - std::string src_filter = "gpu_runtime::true_function"; - if 
(esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func != "") - src_filter = esae->to_func; - - oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; - esae->target->accept(this); - oss << ", " << esae->from_func << ", "; - assign_stmt->lhs->accept(this); - oss << ");" << std::endl; - + printIndent(); + oss << load_balance_function << "_host<"; - printIndent(); - oss << "cudaDeviceSynchronize();" << std::endl; + mir::Var target_var = mir::to(esae->target)->var; + mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); + if (target_type->weight_type == nullptr) + oss << "int32_t"; + else + target_type->weight_type->accept(this); + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") + accessor_type = "gpu_runtime::AccessorAll"; + std::string src_filter = "gpu_runtime::true_function"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func != "") + src_filter = esae->to_func; + + oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; + esae->target->accept(this); + oss << ", " << esae->from_func << ", "; + if (target != nullptr) + target->accept(this); + else + oss << "gpu_runtime::sentinel_frontier"; + oss << ");" << std::endl; + + + printIndent(); + oss << "cudaDeviceSynchronize();" << std::endl; + if (target != nullptr) { if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { printIndent(); oss << "gpu_runtime::swap_queues("; - assign_stmt->lhs->accept(this); + target->accept(this); oss << ");" << std::endl; printIndent(); - assign_stmt->lhs->accept(this); + target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; - + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { printIndent(); oss << "gpu_runtime::swap_bitmaps("; - assign_stmt->lhs->accept(this); + target->accept(this); oss << ");" << std::endl; printIndent(); - assign_stmt->lhs->accept(this); + target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { printIndent(); oss << "gpu_runtime::swap_bytemaps("; - assign_stmt->lhs->accept(this); + target->accept(this); oss << ");" << std::endl; printIndent(); - assign_stmt->lhs->accept(this); + target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; } + } + dedent(); + printIndent(); + oss << "}" << std::endl; + +} +void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { + if (target != nullptr && esae->from_func == "") { + assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); + } + printIndent(); + oss << "{" << std::endl; + indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; + if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { + load_balance_function = "gpu_runtime::TWCE_load_balance"; + } + if (mir::isa(esae)) { + printIndent(); + 
oss << "gpu_runtime::vertex_set_prepare_sparse_device("; + oss << var_name(esae->from_func); + oss << ");" << std::endl; + } else if (mir::isa(esae)) { + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap_device("; + oss << var_name(esae->from_func); + oss << ");" << std::endl; + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap_device("; + oss << var_name(esae->from_func); + oss << ");" << std::endl; + } + std::string to_func = esae->to_func; + if (to_func != "") { + printIndent(); + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_device<" << to_func << ">("; + oss << var_name(esae->from_func) << ");" << std::endl; + } + } + printIndent(); + oss << "_grid.sync();" << std::endl; + if (target != nullptr) { + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + target->accept(this); + oss << " = " << var_name(esae->from_func) << ";" << std::endl; dedent(); printIndent(); - oss << "}" << std::endl; + oss << "_grid.sync();" << std::endl; + } + printIndent(); + oss << load_balance_function << "_device<"; + + mir::Var target_var = mir::to(esae->target)->var; + mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); + if (target_type->weight_type == nullptr) + oss << "int32_t"; + else + target_type->weight_type->accept(this); + + std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") + accessor_type = "gpu_runtime::AcessorAll"; + std::string src_filter = "gpu_runtime::true_function"; + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func != "") + src_filter = esae->to_func; + + oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; + esae->target->accept(this); + oss << ", " << var_name(esae->from_func) << ", "; + if (target != nullptr) + target->accept(this); + else + oss << "gpu_runtime::sentinel_frontier"; + oss << ");" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + + if (target != nullptr) { + if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + printIndent(); + oss << "gpu_runtime::swap_queues_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + dedent(); + } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + printIndent(); + oss << "gpu_runtime::swap_bitmaps_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; + dedent(); + } else if (esae->applied_schedule.frontier_creation == 
fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + printIndent(); + oss << "gpu_runtime::swap_bytemaps_device("; + target->accept(this); + oss << ");" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + printIndent(); + oss << "if (_thread_id == 0)" << std::endl; + indent(); + printIndent(); + target->accept(this); + oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; + dedent(); + } + printIndent(); + oss << "_grid.sync();" << std::endl; + } + dedent(); + printIndent(); + oss << "}" << std::endl; + +} +void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + genEdgeSetApplyExpr(esae, assign_stmt->lhs); } else { printIndent(); assign_stmt->lhs->accept(this); @@ -528,7 +667,8 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { void CodeGenGPUFusedKernel::visit(mir::AssignStmt::Ptr assign_stmt) { if (mir::isa(assign_stmt->expr)) { - // Will be handled later + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + genEdgeSetApplyExpr(esae, assign_stmt->lhs); } else { printIndent(); oss << "if (_thread_id == 0) " << std::endl; @@ -822,7 +962,6 @@ void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { oss << "if (_thread_id == 0)" << std::endl; indent(); printIndent(); - //oss << "printf(\"There is supposed to be a print here\\n\");" << std::endl; oss << "gpu_runtime::print("; print_stmt->expr->accept(this); oss << ");" << std::endl; diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 66b197f4..f487814a 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -6,6 +6,7 @@ #include "infra_gpu/graph.h" #include "infra_gpu/vertex_frontier.h" +#include "infra_gpu/vertex_representation.h" #include "infra_gpu/load_balance.h" #include "graphit_timer.h" #include "infra_gpu/support.h" diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index ca28cf26..f93a3f85 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -3,6 +3,8 @@ #include "infra_gpu/graph.h" #include "infra_gpu/vertex_frontier.h" +#include +using namespace cooperative_groups; namespace gpu_runtime { @@ -54,12 +56,19 @@ void __host__ vertex_based_load_balance_host(GraphT &graph, Vert vertex_based_load_balance_kernel<<>>(graph, input_frontier, output_frontier); } +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ vertex_based_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + //int32_t num_cta, cta_size; + //vertex_based_load_balance_info_device(input_frontier, num_cta, cta_size); + // Do the actual processing +} + // TWCE LOAD BALANCE FUNCTIONS #define STAGE_1_SIZE (8) #define WARP_SIZE (32) template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> -static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { - int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; +static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier, int32_t cta_id, int32_t total_cta) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; int32_t lane_id = thread_id % 32; @@ -166,6 +175,12 @@ void __host__ 
TWCE_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } +template +void __device__ TWCE_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier) * STAGE_1_SIZE; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __global__ TWCE_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { TWCE_load_balance(graph, input_frontier, output_frontier); @@ -175,9 +190,20 @@ template loa void __host__ TWCE_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { int32_t num_cta, cta_size; TWCE_load_balance_info(input_frontier, num_cta, cta_size); - TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier); + TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); } +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWCE_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + TWCE_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWCE_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} } #endif diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index be3e2770..be990644 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -35,6 +35,10 @@ template static bool __device__ CAS(T *dst, T old_val, const T &new_val) { return old_val == atomicCAS(dst, old_val, new_val); } +static void __device__ parallel_memset(unsigned char* dst, unsigned char val, size_t total_bytes) { + for (size_t index = threadIdx.x + blockDim.x * blockIdx.x; index < total_bytes; index += blockDim.x * gridDim.x) + dst[index] = val; +} } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 88cd8006..65dfe207 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -2,6 +2,8 @@ #define GPU_VERTEX_FRONTIER_H #include "infra_gpu/support.h" +#include +using namespace cooperative_groups; namespace gpu_runtime { struct VertexFrontier { int32_t max_num_elems; @@ -108,14 +110,8 @@ static void builtin_addVertex(VertexFrontier &frontier, int32_t vid) { cudaMemcpy(frontier.d_num_elems_input, &curr_size, sizeof(int32_t), cudaMemcpyHostToDevice); } static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id) { - // Simple enqueuVertex implementation - // Each thread adds on it's own - // TODO: Optimize with warp reduce - - //int32_t pos = atomicAdd(sparse_queue_size, 1); int32_t pos = atomicAggInc(sparse_queue_size); sparse_queue[pos] = vertex_id; - } static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *byte_map_size, int32_t vertex_id) { // We are not using atomic operation here because races are benign here @@ -150,6 +146,20 @@ static void swap_queues(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); } +static void __device__ 
swap_queues_device(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + + frontier.d_num_elems_output[0] = 0; + } +} + static void swap_bytemaps(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; @@ -163,6 +173,23 @@ static void swap_bytemaps(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); } + +static void __device__ swap_bytemaps_device(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + + frontier.d_num_elems_output[0] = 0; + } + this_grid().sync(); + parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); +} static void swap_bitmaps(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; @@ -195,87 +222,6 @@ static void dedup_frontier(VertexFrontier &frontier) { dedup_frontier_kernel<<>>(frontier); swap_queues(frontier); } -static void __global__ prepare_sparse_from_bytemap(VertexFrontier frontier) { - for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { - if (frontier.d_byte_map_input[node_id] == 1) { - enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); - } - } -} -static void __global__ prepare_sparse_from_bitmap(VertexFrontier frontier) { - for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { - if (checkBit(frontier.d_bit_map_input, node_id)) { - enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); - } - } -} - -static void __global__ prepare_bytemap_from_sparse(VertexFrontier frontier) { - for (int32_t node_idx = blockDim.x * blockIdx.x + threadIdx.x; node_idx < frontier.d_num_elems_input[0]; node_idx += blockDim.x * gridDim.x) { - int32_t node_id = frontier.d_sparse_queue_input[node_idx]; - enqueueVertexBytemap(frontier.d_byte_map_output, frontier.d_num_elems_output, node_id); - } -} -static void __global__ prepare_bytemap_from_bitmap(VertexFrontier frontier) { - for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { - if (checkBit(frontier.d_bit_map_input, node_id)) { - enqueueVertexBytemap(frontier.d_byte_map_output, frontier.d_num_elems_output, node_id); - } - } -} - -static void __global__ prepare_bitmap_from_sparse(VertexFrontier frontier) { - for (int32_t node_idx = blockDim.x * blockIdx.x + threadIdx.x; node_idx < frontier.d_num_elems_input[0]; node_idx += blockDim.x * gridDim.x) { - int32_t node_id = frontier.d_sparse_queue_input[node_idx]; - enqueueVertexBitmap(frontier.d_bit_map_output, 
frontier.d_num_elems_output, node_id); - } -} -static void __global__ prepare_bitmap_from_bytemap(VertexFrontier frontier) { - for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { - if (frontier.d_byte_map_input[node_id] == 1) { - enqueueVertexBitmap(frontier.d_bit_map_output, frontier.d_num_elems_output, node_id); - } - } -} -static void vertex_set_prepare_sparse(VertexFrontier &frontier) { - if (frontier.format_ready == VertexFrontier::SPARSE) - return; - else if (frontier.format_ready == VertexFrontier::BYTEMAP) { - prepare_sparse_from_bytemap<<>>(frontier); - swap_queues(frontier); - return; - } else if (frontier.format_ready == VertexFrontier::BITMAP) { - prepare_sparse_from_bitmap<<>>(frontier); - swap_queues(frontier); - return; - } -} -static void vertex_set_prepare_boolmap(VertexFrontier &frontier) { - if (frontier.format_ready == VertexFrontier::SPARSE) { - prepare_bytemap_from_sparse<<>>(frontier); - swap_bytemaps(frontier); - return; - } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { - return; - } else if (frontier.format_ready == VertexFrontier::BITMAP) { - prepare_bytemap_from_bitmap<<>>(frontier); - swap_bytemaps(frontier); - return; - } -} -static void vertex_set_prepare_bitmap(VertexFrontier &frontier) { - if (frontier.format_ready == VertexFrontier::SPARSE) { - prepare_bitmap_from_sparse<<>>(frontier); - swap_bitmaps(frontier); - return; - } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { - prepare_bitmap_from_bytemap<<>>(frontier); - swap_bitmaps(frontier); - return; - } else if (frontier.format_ready == VertexFrontier::BITMAP) { - return; - } -} bool __device__ true_function(int32_t _) { return true; } diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h new file mode 100644 index 00000000..36cf2146 --- /dev/null +++ b/src/runtime_lib/infra_gpu/vertex_representation.h @@ -0,0 +1,120 @@ +#ifndef VERTEX_REPRESENTATION_H +#define VERTEX_REPRESENTATION_H + +#include "infra_gpu/vertex_frontier.h" +#include +using namespace cooperative_groups; +namespace gpu_runtime { +template +static void __device__ generalized_prepare_from_to(VertexFrontier &frontier) { + int32_t total_work = AccessorType::getSize(frontier); + for (int32_t index = threadIdx.x + blockIdx.x * blockDim.x; index < total_work; index += gridDim.x * blockDim.x) { + int32_t node_id = AccessorType::getElement(frontier, index); + if (condition(frontier, node_id)) + update(frontier, node_id); + } +} + +template +static void __global__ generalized_prepare_from_to_kernel(VertexFrontier frontier) { + generalized_prepare_from_to(frontier); +} + +static bool __device__ condition_sparse(VertexFrontier &frontier, int32_t node_id) { + return true; +} +static bool __device__ condition_bytemap(VertexFrontier &frontier, int32_t node_id) { + return frontier.d_byte_map_input[node_id] == 1; +} +static bool __device__ condition_bitmap(VertexFrontier &frontier, int32_t node_id) { + return checkBit(frontier.d_bit_map_input, node_id); +} + + +static void __device__ update_sparse(VertexFrontier &frontier, int32_t node_id) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); +} + +static void __device__ update_bytemap(VertexFrontier &frontier, int32_t node_id) { + enqueueVertexBytemap(frontier.d_byte_map_output, frontier.d_num_elems_output, node_id); +} + +static void __device__ update_bitmap(VertexFrontier &frontier, 
int32_t node_id) { + enqueueVertexBitmap(frontier.d_bit_map_output, frontier.d_num_elems_output, node_id); +} + +static void vertex_set_prepare_sparse(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_queues(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_queues(frontier); + return; + } +} +static void __device__ vertex_set_prepare_sparse_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_queues_device(frontier); + this_grid().sync(); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_queues_device(frontier); + this_grid().sync(); + return; + } +} +static void vertex_set_prepare_boolmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bytemaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bytemaps(frontier); + return; + } +} +static void __device__ vertex_set_prepare_boolmap_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bytemaps_device(frontier); + this_grid().sync(); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bytemaps_device(frontier); + this_grid().sync(); + return; + } +} +static void vertex_set_prepare_bitmap(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to_kernel<<>>(frontier); + swap_bitmaps(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + return; + } +} +} +#endif From 537c5fe67aa5b929951620cbf3510fae68e55afc Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 7 Oct 2019 18:47:55 -0400 Subject: [PATCH 20/88] Changed file name for 4.mtx and relevant test cases --- src/graphitc.py | 2 +- test/gpu_tests/all_gpu_tests.py | 22 +++++++++---------- .../gpu_tests/test_input/runtime_lib_tests.cu | 4 ++-- test/graphs/{4.mtx => simple_mtx.mtx} | 0 test/python/pybind_test.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) rename test/graphs/{4.mtx => simple_mtx.mtx} (100%) diff --git a/src/graphitc.py b/src/graphitc.py index 468f277c..97a4cd3b 100644 --- a/src/graphitc.py +++ b/src/graphitc.py @@ -75,7 +75,7 @@ def parseArgs(): compile_file.write("#include \n") compile_file.write("namespace graphit {\n") - compile_file.write("using namespace graphit::fir::gpu_schedule;\n"); + compile_file.write("using namespace graphit::fir::gpu_schedule;\n") compile_file.write("void user_defined_schedule 
(graphit::fir::high_level_schedule::ProgramScheduleNode::Ptr program) {\n") for schedule_cmd in schedule_cmd_list: compile_file.write(schedule_cmd) diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 662e417a..f4801228 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -17,7 +17,7 @@ def get_command_output_class(self, command): if isinstance(command, list): proc = subprocess.Popen(command, stdout=subprocess.PIPE) else: - print(command) + print(command) proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) exitcode = proc.wait() for line in proc.stdout.readlines(): @@ -33,10 +33,10 @@ def get_command_output(self, command): self.assertEqual(exitcode, 0) return output - def sssp_verified_test(self, input_file_name): - self.cpp_compile_test(input_file_name, []) - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") - self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") + def sssp_verified_test(self, input_file_name): + self.cpp_compile_test(input_file_name, []) + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") + self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") @classmethod @@ -47,7 +47,7 @@ def setUpClass(cls): cls.build_directory = GRAPHIT_BUILD_DIRECTORY cls.scratch_directory = GRAPHIT_BUILD_DIRECTORY + "/scratch" - cls.verifier_directory = cls.build_directory + "/bin" + cls.verifier_directory = cls.build_directory + "/bin" if os.path.isdir(cls.scratch_directory): shutil.rmtree(cls.scratch_directory) os.mkdir(cls.scratch_directory) @@ -81,18 +81,18 @@ def cpp_exec_test(self, input_file_name, extra_cpp_args=[], extra_exec_args=[]): def test_basic_compile(self): self.cpp_compile_test("basic_compile.cu") def test_basic_load_graph(self): - output = self.cpp_exec_test("basic_load_graph.cu", [], [self.graph_directory + "/4.mtx"]) + output = self.cpp_exec_test("basic_load_graph.cu", [], [self.graph_directory + "/simple_mtx.mtx"]) output = output.split("\n") self.assertEqual(len(output), 2) self.assertEqual(output[0], "14, 106") def test_runtime_library(self): print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) - def test_sssp_delta_stepping(self): - self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "v"]) + def test_sssp_delta_stepping(self): + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "v"]) - def test_sssp_delta_stepping_verified(self): - self.sssp_verified_test("sssp_delta_stepping.cu") + def test_sssp_delta_stepping_verified(self): + self.sssp_verified_test("sssp_delta_stepping.cu") if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu index da3f1467..4bc6a82a 100644 --- a/test/gpu_tests/test_input/runtime_lib_tests.cu +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -13,13 +13,13 @@ protected: }; TEST_F(GPURuntimeLibTest, SimpleLoadGraphFromFileTest) { gpu_runtime::GraphT edges; - gpu_runtime::load_graph(edges, graph_directory + "/4.mtx", false); + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); EXPECT_EQ (14, 
edges.num_vertices); } TEST_F(GPURuntimeLibTest, SimplePriorityQueueTest){ gpu_runtime::GraphT edges; - gpu_runtime::load_graph(edges, graph_directory + "/4.mtx", false); + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); int num_vertices = gpu_runtime::builtin_getVertices(edges); int* priorities = new int[num_vertices]; gpu_runtime::GPUPriorityQueue pq = gpu_runtime::GPUPriorityQueue(priorities); diff --git a/test/graphs/4.mtx b/test/graphs/simple_mtx.mtx similarity index 100% rename from test/graphs/4.mtx rename to test/graphs/simple_mtx.mtx diff --git a/test/python/pybind_test.py b/test/python/pybind_test.py index 6344d1a6..432090da 100644 --- a/test/python/pybind_test.py +++ b/test/python/pybind_test.py @@ -96,7 +96,7 @@ def test_pybind_pr_with_vector_input(self): def test_pybind_pr_load_file(self): module = graphit.compile_and_load(self.root_test_input_dir + "export_pr_with_return.gt") - graph = csr_matrix(scipy.io.mmread(self.root_test_graph_dir+"4.mtx")) + graph = csr_matrix(scipy.io.mmread(self.root_test_graph_dir+"simple_mtx.mtx")) ranks = module.export_func(graph) self.assertEqual(len(ranks), graph.shape[0]) self.assertTrue(abs(np.sum(ranks)-1.0) < 0.1) From 19ba657eb05c473e8f2c1b9c591cd1b3a3197111 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 7 Oct 2019 20:53:04 -0400 Subject: [PATCH 21/88] Added generating Function declaration before kernel --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 1 + src/backend/codegen_gpu/codegen_gpu.cpp | 31 +++++++++++++++++++ src/runtime_lib/infra_gpu/load_balance.h | 8 ++--- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index a077584a..5f2b43b3 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -29,6 +29,7 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { void visit(mir::PullEdgeSetApplyExpr::Ptr); void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); + void genFuncDecl(mir::FuncDecl::Ptr); }; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index ccf78ad2..1a079eea 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -161,8 +161,35 @@ void CodeGenGPUFusedKernel::visit(mir::StmtBlock::Ptr stmt_block) { } } } +void CodeGenGPUKernelEmitter::genFuncDecl(mir::FuncDecl::Ptr func_decl) { + if (func_decl->result.isInitialized()) { + func_decl->result.getType()->accept(this); + assert(mir::isa(func_decl->result.getType())); + assert(mir::to(func_decl->result.getType())->type == mir::ScalarType::Type::BOOL); + oss << "bool"; + } else { + oss << "void"; + } + oss << " " << "__device__" << " " << func_decl->name << "("; + bool printDelimeter = false; + for (auto arg: func_decl->args) { + if (printDelimeter) + oss << ", "; + assert(mir::isa(arg.getType()) || mir::isa(arg.getType())); + if (mir::isa(arg.getType())) + assert(mir::to(arg.getType())->type == mir::ScalarType::Type::INT); + oss << "int32_t"; + oss << " " << arg.getName(); + printDelimeter = true; + } + oss << ");" << std::endl; +} void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { + + // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF + mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); + 
genFuncDecl(input_function_decl); // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); @@ -205,6 +232,10 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { } void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { + // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF + mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); + genFuncDecl(input_function_decl); + // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index f93a3f85..f4c6231d 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -26,7 +26,7 @@ static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { // VERTEX BASED LOAD BALANCE FUNCTIONS template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> -void __device__ vertex_based_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { +void __device__ vertex_based_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; if (vid >= AccessorType::getSize(input_frontier)) return; @@ -67,7 +67,7 @@ void __device__ vertex_based_load_balance_device(GraphT &graph, #define STAGE_1_SIZE (8) #define WARP_SIZE (32) template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> -static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier, int32_t cta_id, int32_t total_cta) { +static void __device__ TWCE_load_balance(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { int32_t thread_id = blockDim.x * cta_id + threadIdx.x; int32_t lane_id = thread_id % 32; @@ -183,14 +183,14 @@ void __device__ TWCE_load_balance_info_device(VertexFrontier &frontier, int32_t } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __global__ TWCE_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { - TWCE_load_balance(graph, input_frontier, output_frontier); + TWCE_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __host__ TWCE_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { int32_t num_cta, cta_size; TWCE_load_balance_info(input_frontier, num_cta, cta_size); - TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + TWCE_load_balance_kernel<<>>(graph, input_frontier, output_frontier); } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> From ffddce898f7260a449466667d5da58245a15ef6a Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 7 Oct 2019 22:56:00 -0400 Subject: [PATCH 22/88] Added destination filter to push edge set apply 
and fixed the bitmap implementation --- src/backend/codegen_gpu/codegen_gpu.cpp | 12 ++++++++++++ src/runtime_lib/infra_gpu/vertex_frontier.h | 12 ++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 1a079eea..d7d73bde 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -190,6 +190,10 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); genFuncDecl(input_function_decl); + if (apply_expr->to_func != "") { + mir::FuncDecl::Ptr to_function_decl = mir_context_->getFunction(apply_expr->to_func); + genFuncDecl(to_function_decl); + } // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); @@ -199,6 +203,14 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { indent(); printIndent(); oss << "// Body of the actual operator code" << std::endl; + if (apply_expr->to_func != "") { + printIndent(); + oss << "if (!" << apply_expr->to_func << "(dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); if (input_function->args.size() == 3) { printIndent(); diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 65dfe207..d5fe6222 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -80,7 +80,7 @@ static VertexFrontier create_new_vertex_set(int32_t num_vertices) { cudaMemset(frontier.d_byte_map_input, 0, sizeof(unsigned char) * num_vertices); cudaMemset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * num_vertices); - int32_t num_byte_for_bitmap = (num_vertices + sizeof(uint32_t) - 1)/sizeof(uint32_t); + int32_t num_byte_for_bitmap = (num_vertices + sizeof(uint32_t) * 8 - 1)/(sizeof(uint32_t) * 8); cudaMalloc(&frontier.d_bit_map_input, sizeof(uint32_t) * num_byte_for_bitmap); cudaMalloc(&frontier.d_bit_map_output, sizeof(uint32_t) * num_byte_for_bitmap); @@ -121,12 +121,12 @@ static void __device__ enqueueVertexBytemap(unsigned char* byte_map, int32_t *by atomicAggInc(byte_map_size); } static bool __device__ checkBit(uint32_t* array, int32_t index) { - uint32_t * address = array + index / sizeof(uint32_t); - return (*address & (1 << (index % sizeof(uint32_t)))); + uint32_t * address = array + index / (8 * sizeof(uint32_t)); + return (*address & (1 << (index % (8 * sizeof(uint32_t))))); } static bool __device__ setBit(uint32_t* array, int32_t index) { - uint32_t * address = array + index / sizeof(uint32_t); - return atomicOr(address, (1 << (index % sizeof(uint32_t)))) & (1 << (index % sizeof(uint32_t))); + uint32_t * address = array + index / (8 * sizeof(uint32_t)); + return atomicOr(address, (1 << (index % (8 * sizeof(uint32_t))))) & (1 << (index % (8 * sizeof(uint32_t)))); } static void __device__ enqueueVertexBitmap(uint32_t* bit_map, int32_t * bit_map_size, int32_t vertex_id) { // We need atomics here because of bit manipulations @@ -201,7 +201,7 @@ static void swap_bitmaps(VertexFrontier &frontier) { 
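A note on the checkBit/setBit change above: the old indexing divided the vertex index by sizeof(uint32_t) == 4, treating each 32-bit word as if it held only four usable bits, so the bitmap spent a full word on every four vertices. The corrected code divides by 8 * sizeof(uint32_t) == 32 bits per word (for example, vertex 37 maps to word 37 / 32 = 1, bit 37 % 32 = 5), and the allocation and clearing sizes below are rounded up the same way, to (max_num_elems + 31) / 32 words.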
frontier.d_bit_map_output = temp2; cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); - int32_t num_byte_for_bitmap = (frontier.max_num_elems + sizeof(uint32_t) - 1)/sizeof(uint32_t); + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); cudaCheckLastError(); } From ed487f1711b9620bcf32427a7cdfd6937350516e Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Tue, 8 Oct 2019 15:36:12 -0400 Subject: [PATCH 23/88] moving the sssp runtime lib into a sssp_lp (label propagation) file, and adding a real delta stepping version --- test/gpu_tests/all_gpu_tests.py | 6 + .../test_input/sssp_delta_stepping.cu | 277 +++++++++----- test/gpu_tests/test_input/sssp_lp.cu | 357 ++++++++++++++++++ 3 files changed, 552 insertions(+), 88 deletions(-) create mode 100644 test/gpu_tests/test_input/sssp_lp.cu diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 662e417a..3b624078 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -88,6 +88,12 @@ def test_basic_load_graph(self): def test_runtime_library(self): print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) + def test_sssp_lp_runtime_lib(self): + self.cpp_exec_test("sssp_lp.cu", [], [self.graph_directory + "/4.mtx", "v"]) + + def test_sssp_lp_verified(self): + self.sssp_verified_test("sssp_lp.cu") + def test_sssp_delta_stepping(self): self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "v"]) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 8bcf7daa..b7411c4a 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -1,4 +1,5 @@ -#include "gpu_intrinsics.h" + +#include "graph.h" #include #define ITER_COUNT (1) @@ -13,11 +14,13 @@ typedef struct { int32_t *SP; int32_t *frontier1; - int32_t *frontier2; + + + char *frontier2; int32_t *frontier1_size; int32_t *frontier2_size; - int32_t *iters; + int32_t *output_size; @@ -28,22 +31,28 @@ typedef struct { int32_t *worklist; int32_t *old_indices; + + int32_t window_lower; + int32_t window_upper; + + int32_t *more_elems; + + int32_t *new_window_start; }algo_state; -//struct timeval start_time_; -//struct timeval elapsed_time_; +struct timeval start_time_; +struct timeval elapsed_time_; -// void startTimer(){ -// gettimeofday(&start_time_, NULL); -// } - -// float stopTimer(){ -// gettimeofday(&elapsed_time_, NULL); -// elapsed_time_.tv_sec -= start_time_.tv_sec; -// elapsed_time_.tv_usec -= start_time_.tv_usec; -// return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; -// } +void startTimer(){ + gettimeofday(&start_time_, NULL); +} +float stopTimer(){ + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; +} void cudaCheckLastError(void) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -53,32 +62,33 @@ void cudaCheckLastError(void) { #define VIRTUAL_WARP_SIZE (32) #define NUM_THREADS (1024) -#define NUM_BLOCKS (80) #define CTA_SIZE (1024) #define WARP_SIZE (32) #define STAGE_1_SIZE (8) -void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { +void 
__global__ init_kernel(GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; - int total_work = graph.num_vertices; + int total_work = graph.num_nodes; int work_per_thread = (total_work + num_threads - 1)/num_threads; for (int i = 0; i < work_per_thread; i++) { int id = num_threads * i + thread_id; if (id < total_work) { device_state.SP[id] = INT_MAX; - device_state.iters[id] = 0; + device_state.frontier2[id] = 0; } } if (thread_id == 0) { device_state.SP[0] = 0; - //starting point is set to 0 - device_state.frontier1[0] = 0; - *device_state.frontier1_size = 1; - *device_state.frontier2_size = 0; + device_state.frontier1[graph.num_nodes] = 0; + device_state.frontier1_size[0] = 1; + device_state.frontier1_size[1] = 1; + device_state.frontier1_size[2] = 0; + device_state.frontier1_size[3] = 0; + device_state.frontier1_size[4] = 0; } } -__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(__activemask(), v, leader); } +__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(-1, v, leader); } __device__ inline int atomicAggInc(int *ctr) { int32_t lane_id = threadIdx.x % 32; @@ -91,17 +101,13 @@ __device__ inline int atomicAggInc(int *ctr) { return (res + __popc(mask & ((1 << lane_id) - 1))); } -__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t curr_iter) { - if (device_state.iters[v] == curr_iter) - return; - device_state.iters[v] = curr_iter; - int32_t pos = atomicAggInc(device_state.frontier2_size); - device_state.frontier2[pos] = v; +__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t new_dist) { + if (new_dist < device_state.window_upper) + device_state.frontier2[v] = 1 ; } - -void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { +void __global__ update_edges (GraphT graph, algo_state device_state, int32_t curr_iter) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - + int num_threads = blockDim.x * gridDim.x; int lane_id = thread_id % 32; __shared__ int32_t stage2_queue[CTA_SIZE]; @@ -130,11 +136,20 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev int32_t my_vertex; int32_t row_offset; if (my_vertex_idx < total_vertices) { - my_vertex = device_state.frontier1[my_vertex_idx]; + //my_vertex = device_state.frontier1[my_vertex_idx]; + if (my_vertex_idx < device_state.frontier1_size[1]) { + my_vertex = device_state.frontier1[graph.num_nodes + my_vertex_idx]; + } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2]) { + my_vertex = device_state.frontier1[graph.num_nodes * 2 + my_vertex_idx - device_state.frontier1_size[1]]; + } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2] + device_state.frontier1_size[3]) { + my_vertex = device_state.frontier1[graph.num_nodes * 3 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2]]; + } else { + my_vertex = device_state.frontier1[graph.num_nodes * 4 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2] - device_state.frontier1_size[3]]; + } // Step 1 segreggate vertices into shared buffers if (thread_id % (STAGE_1_SIZE) == 0 ) { - d = graph.d_get_degree(my_vertex); - row_offset = graph.d_src_offsets[my_vertex]; + d = graph.degrees_d[my_vertex]; + row_offset = graph.row_offsets_d[my_vertex]; int32_t s3_size = d/CTA_SIZE; d = d - s3_size * CTA_SIZE; if (s3_size) { @@ -161,30 
+176,31 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev __syncthreads(); - d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + d = __shfl_sync(-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync(-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); int32_t src_distance; if (my_vertex_idx < total_vertices) { // STAGE 1 - my_vertex = device_state.frontier1[my_vertex_idx]; + //my_vertex = device_state.frontier1[my_vertex_idx]; src_distance = device_state.SP[my_vertex]; for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { // DO ACTUAL SSSP - int32_t dst = graph.d_edge_dst[neigh_id]; - int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; + int32_t dst = graph.edges_d[neigh_id]; + int32_t new_dst = graph.edge_weights_d[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); - enqueueVertex(dst, device_state, curr_iter); + enqueueVertex(dst, device_state, new_dst); } } } // STAGE 2 -- stage 2 is dynamically balanced + __syncwarp(); // SYNC the warp here because ... while (1) { int32_t to_process; if (lane_id == 0) { to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; } - to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + to_process = __shfl_sync(-1, to_process, 0, 32); if (to_process < 0) break; my_vertex = stage2_queue[to_process]; @@ -194,11 +210,11 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { // DO ACTUAL SSSP - int dst = graph.d_edge_dst[neigh_id]; - int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + int dst = graph.edges_d[neigh_id]; + int new_dst = graph.edge_weights_d[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); - enqueueVertex(dst, device_state, curr_iter); + enqueueVertex(dst, device_state, new_dst); } } } @@ -212,55 +228,93 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { // DO ACTUAL SSSP - int dst = graph.d_edge_dst[neigh_id]; - int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + int dst = graph.edges_d[neigh_id]; + int new_dst = graph.edge_weights_d[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); - enqueueVertex(dst, device_state, curr_iter); + enqueueVertex(dst, device_state, new_dst); } } } } -void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { +void __global__ update_nodes (GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; - - int total_work = graph.num_vertices; + int warp_id = thread_id / 32; + int total_work = graph.num_nodes; int work_per_thread = (total_work + num_threads - 1)/num_threads; - for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_vertices) { + if (node_id < graph.num_nodes) { if (device_state.frontier2[node_id]) { device_state.frontier2[node_id] = 0; - int pos = atomicAdd(device_state.frontier1_size, 1); - device_state.frontier1[pos] = node_id; + int pos = 
atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); + device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_nodes] = node_id; } } - } + } +} +void __global__ update_nodes_identify_min(GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + + int total_work = graph.num_nodes; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + int32_t my_minimum = INT_MAX; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < graph.num_nodes) { + if (device_state.SP[node_id] >= device_state.window_upper && device_state.SP[node_id] != INT_MAX && device_state.SP[node_id] < my_minimum) { + my_minimum = device_state.SP[node_id]; + } + } + } + if (my_minimum < device_state.new_window_start[0]) { + atomicMin(device_state.new_window_start, my_minimum); + } } -void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { - host_state.SP = new int[graph.num_vertices]; +void __global__ update_nodes_special(GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int warp_id = thread_id / 32; + + int total_work = graph.num_nodes; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < graph.num_nodes) { + if(device_state.SP[node_id] >= device_state.window_lower && device_state.SP[node_id] < device_state.window_upper) { + int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); + device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_nodes] = node_id; + } + } + } +} +void allocate_state(algo_state &host_state, algo_state &device_state, GraphT &graph) { + host_state.SP = new int[graph.num_nodes]; host_state.output_size = new int32_t[1]; + host_state.new_window_start = new int32_t[1]; host_state.frontier1_size = new int32_t[1]; - host_state.frontier1 = new int32_t[graph.num_vertices]; + host_state.frontier1 = new int32_t[graph.num_nodes]; - - cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); - cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 6); - cudaMalloc(&device_state.frontier2, sizeof(int32_t)*graph.num_vertices * 6); - cudaMalloc(&device_state.iters, sizeof(int32_t)*graph.num_vertices); + host_state.more_elems = new int32_t(); + cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_nodes); - cudaMalloc(&device_state.frontier1_size, sizeof(int32_t)); - cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); + cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_nodes * 5); + cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_nodes ); + + cudaMalloc(&device_state.frontier1_size, 5*sizeof(int32_t)); + //cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); cudaMalloc(&device_state.output_size, sizeof(int32_t)); cudaMalloc(&device_state.worklist, sizeof(int32_t)); + cudaMalloc(&device_state.more_elems, sizeof(int32_t)); + cudaMalloc(&device_state.new_window_start, sizeof(int32_t)); } void swap_pointers(int32_t **a, int32_t **b) { @@ -269,19 +323,27 @@ void swap_pointers(int32_t **a, int32_t **b) { *b = t; } void swap_queues(algo_state &device_state) { - swap_pointers(&device_state.frontier1, &device_state.frontier2); - swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); + 
//swap_pointers(&device_state.frontier1, &device_state.frontier2); + //swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); } int main(int argc, char *argv[]) { cudaSetDevice(0); cudaThreadSetCacheConfig(cudaFuncCachePreferShared); - gpu_runtime::GraphT graph; - gpu_runtime::load_graph(graph, argv[1], false); + GraphT graph; + + int32_t *new_indices = load_graph(argv[1], false, graph); + int32_t delta = atoi(argv[2]); algo_state host_state, device_state; allocate_state(host_state, device_state, graph); + host_state.window_lower = 0; + host_state.window_upper = delta; + device_state.window_lower = 0; + device_state.window_upper = delta; + + cudaDeviceSynchronize(); float total_time = 0; @@ -294,44 +356,83 @@ int main(int argc, char *argv[]) { int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); - //printf("Init time = %f\n", t); + printf("Init time = %f\n", t); iter_total+=t; host_state.frontier1_size[0] = 1; + + + while(*host_state.frontier1_size) { startTimer(); iters++; + int num_blocks = NUM_BLOCKS; + int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; update_edges<<>>(graph, device_state, iters); host_state.frontier1_size[0] = 0; - cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + host_state.frontier1_size[1] = 0; + host_state.frontier1_size[2] = 0; + host_state.frontier1_size[3] = 0; + host_state.frontier1_size[4] = 0; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); - swap_queues(device_state); - - cudaCheckLastError(); - cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t), cudaMemcpyDeviceToHost); + update_nodes<<>>(graph, device_state); + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); + host_state.frontier1_size[0] = host_state.frontier1_size[1]; + host_state.frontier1_size[0] += host_state.frontier1_size[2]; + host_state.frontier1_size[0] += host_state.frontier1_size[3]; + host_state.frontier1_size[0] += host_state.frontier1_size[4]; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + + + if (host_state.frontier1_size[0] == 0) { + host_state.new_window_start[0] = INT_MAX; + cudaMemcpy(device_state.new_window_start, host_state.new_window_start, sizeof(int32_t), cudaMemcpyHostToDevice); + update_nodes_identify_min<<>>(graph, device_state); + cudaMemcpy(host_state.new_window_start, device_state.new_window_start, sizeof(int32_t), cudaMemcpyDeviceToHost); + if (host_state.new_window_start[0] == INT_MAX) { + break; + } + device_state.window_lower = host_state.new_window_start[0]; + device_state.window_upper = host_state.new_window_start[0] + delta; + host_state.frontier1_size[0] = 0; + + host_state.frontier1_size[0] = 0; + host_state.frontier1_size[1] = 0; + host_state.frontier1_size[2] = 0; + host_state.frontier1_size[3] = 0; + host_state.frontier1_size[4] = 0; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); + update_nodes_special<<>>( graph, device_state); + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); + host_state.frontier1_size[0] = host_state.frontier1_size[1]; + host_state.frontier1_size[0] += host_state.frontier1_size[2]; + host_state.frontier1_size[0] += host_state.frontier1_size[3]; 
+ host_state.frontier1_size[0] += host_state.frontier1_size[4]; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + } t = stopTimer(); - //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); iter_total += t; } - //printf("Num iters = %d\n", iters); - //printf("Time elapsed = %f\n", iter_total); + printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); total_time += iter_total; } - //printf("Total time = %f\n", total_time); - if (argc > 2) - if (argv[2][0] == 'v'){ - //FILE *output = fopen("output.txt", "w"); - cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); - for (int i = 0; i < graph.num_vertices; i++) - //fprintf(output, "%d, %d\n", i, host_state.SP[i]); - printf("%d\n", host_state.SP[i]); + printf("Total time = %f\n", total_time); + if (argc > 3) + if (argv[3][0] == 'o'){ + FILE *output = fopen("output.txt", "w"); + cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_nodes, cudaMemcpyDeviceToHost); + for (int i = 0; i < graph.num_nodes; i++) + fprintf(output, "%d, %d\n", i, host_state.SP[i]); }else if (argv[2][0] == 'c'){ /* for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) diff --git a/test/gpu_tests/test_input/sssp_lp.cu b/test/gpu_tests/test_input/sssp_lp.cu new file mode 100644 index 00000000..89471969 --- /dev/null +++ b/test/gpu_tests/test_input/sssp_lp.cu @@ -0,0 +1,357 @@ +#include "gpu_intrinsics.h" +#include + +#define ITER_COUNT (1) +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include +#include +#include + + +typedef struct { + int32_t *SP; + + int32_t *frontier1; + int32_t *frontier2; + + int32_t *frontier1_size; + int32_t *frontier2_size; + int32_t *iters; + + int32_t *output_size; + + int32_t num_blocks; + + int32_t *node_borders; + int32_t *edge_borders; + + int32_t *worklist; + int32_t *old_indices; +}algo_state; + +//struct timeval start_time_; +//struct timeval elapsed_time_; + +// void startTimer(){ +// gettimeofday(&start_time_, NULL); +// } + +// float stopTimer(){ +// gettimeofday(&elapsed_time_, NULL); +// elapsed_time_.tv_sec -= start_time_.tv_sec; +// elapsed_time_.tv_usec -= start_time_.tv_usec; +// return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; +// } + +void cudaCheckLastError(void) { + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(err)); +} + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int id = num_threads * i + thread_id; + if (id < total_work) { + device_state.SP[id] = INT_MAX; + device_state.iters[id] = 0; + } + } + if (thread_id == 0) { + device_state.SP[0] = 0; + //starting point is set to 0 + device_state.frontier1[0] = 0; + *device_state.frontier1_size = 1; + *device_state.frontier2_size = 0; + } +} +__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(__activemask(), v, leader); 
} +__device__ inline int atomicAggInc(int *ctr) { + int32_t lane_id = threadIdx.x % 32; + + int mask = __activemask(); + int leader = __ffs(mask) - 1; + int res; + if(lane_id == leader) + res = atomicAdd(ctr, __popc(mask)); + res = warp_bcast(res, leader); + + return (res + __popc(mask & ((1 << lane_id) - 1))); +} +__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t curr_iter) { + if (device_state.iters[v] == curr_iter) + return; + device_state.iters[v] = curr_iter; + int32_t pos = atomicAggInc(device_state.frontier2_size); + device_state.frontier2[pos] = v; +} + +void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + + int lane_id = thread_id % 32; + + __shared__ int32_t stage2_queue[CTA_SIZE]; + __shared__ int32_t stage3_queue[CTA_SIZE]; + __shared__ int32_t stage_queue_sizes[3]; + if (threadIdx.x == 0) { + stage_queue_sizes[0] = 0; + stage_queue_sizes[1] = 0; + stage_queue_sizes[2] = 0; + } + __syncthreads(); + + + __shared__ int32_t stage2_offset[CTA_SIZE]; + __shared__ int32_t stage3_offset[CTA_SIZE]; + + __shared__ int32_t stage2_size[CTA_SIZE]; + __shared__ int32_t stage3_size[CTA_SIZE]; + + + int32_t total_vertices = device_state.frontier1_size[0]; + + int32_t my_vertex_idx = thread_id / (STAGE_1_SIZE); + int32_t d; + int32_t s1_offset; + int32_t my_vertex; + int32_t row_offset; + if (my_vertex_idx < total_vertices) { + my_vertex = device_state.frontier1[my_vertex_idx]; + // Step 1 segreggate vertices into shared buffers + if (thread_id % (STAGE_1_SIZE) == 0 ) { + d = graph.d_get_degree(my_vertex); + row_offset = graph.d_src_offsets[my_vertex]; + int32_t s3_size = d/CTA_SIZE; + d = d - s3_size * CTA_SIZE; + if (s3_size) { + int32_t pos = atomicAggInc(&stage_queue_sizes[2]); + stage3_queue[pos] = my_vertex; + stage3_size[pos] = s3_size * CTA_SIZE; + // stage3_offset[pos] = 0; // Not required because always 0 + stage3_offset[pos] = row_offset; + } + + int32_t s2_size = d/WARP_SIZE; + d = d - s2_size * WARP_SIZE; + + if (s2_size) { + int32_t pos = atomicAggInc(&stage_queue_sizes[1]); + stage2_queue[pos] = my_vertex; + stage2_offset[pos] = s3_size * CTA_SIZE + row_offset; + stage2_size[pos] = s2_size * WARP_SIZE; + } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + row_offset; + } + }else + my_vertex = -1; + + __syncthreads(); + + d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + int32_t src_distance; + if (my_vertex_idx < total_vertices) { + // STAGE 1 + my_vertex = device_state.frontier1[my_vertex_idx]; + src_distance = device_state.SP[my_vertex]; + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { + // DO ACTUAL SSSP + int32_t dst = graph.d_edge_dst[neigh_id]; + int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + // STAGE 2 -- stage 2 is dynamically balanced + while (1) { + int32_t to_process; + if (lane_id == 0) { + to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; + } + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); + if (to_process < 0) + break; + my_vertex = stage2_queue[to_process]; + d = stage2_size[to_process]; + int32_t s2_offset = stage2_offset[to_process]; + src_distance = 
device_state.SP[my_vertex]; + + for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } + + // STAGE 3 -- all threads have to do all, no need for LB + for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid ++) { + my_vertex = stage3_queue[wid]; + d = stage3_size[wid]; + int32_t s3_offset = stage3_offset[wid]; + src_distance = device_state.SP[my_vertex]; + + for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { + // DO ACTUAL SSSP + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; + if (new_dst < device_state.SP[dst]) { + atomicMin(&device_state.SP[dst], new_dst); + enqueueVertex(dst, device_state, curr_iter); + } + } + } +} +void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < graph.num_vertices) { + if (device_state.frontier2[node_id]) { + device_state.frontier2[node_id] = 0; + int pos = atomicAdd(device_state.frontier1_size, 1); + device_state.frontier1[pos] = node_id; + } + } + } + +} +void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { + host_state.SP = new int[graph.num_vertices]; + host_state.output_size = new int32_t[1]; + + host_state.frontier1_size = new int32_t[1]; + host_state.frontier1 = new int32_t[graph.num_vertices]; + + + cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); + + cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 6); + cudaMalloc(&device_state.frontier2, sizeof(int32_t)*graph.num_vertices * 6); + cudaMalloc(&device_state.iters, sizeof(int32_t)*graph.num_vertices); + + cudaMalloc(&device_state.frontier1_size, sizeof(int32_t)); + cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); + + cudaMalloc(&device_state.output_size, sizeof(int32_t)); + + + cudaMalloc(&device_state.worklist, sizeof(int32_t)); +} + +void swap_pointers(int32_t **a, int32_t **b) { + int32_t* t = *a; + *a = *b; + *b = t; +} +void swap_queues(algo_state &device_state) { + swap_pointers(&device_state.frontier1, &device_state.frontier2); + swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); +} + + +template +void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { + // Body of the actual operator code + EdgeWeightType weight = graph.d_edge_weight[edge_id]; + //if (updateEdge(src, dst, weight)) { + gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst); + //} +} + + +int main(int argc, char *argv[]) { + cudaSetDevice(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + gpu_runtime::GraphT graph; + gpu_runtime::load_graph(graph, argv[1], false); + + algo_state host_state, device_state; + + allocate_state(host_state, device_state, 
graph); + + cudaDeviceSynchronize(); + + float total_time = 0; + for (int outer = 0; outer < ITER_COUNT; outer++) { + float iter_total = 0; + startTimer(); + + startTimer(); + init_kernel<<>>(graph, device_state); + int iters = 0; + cudaDeviceSynchronize(); + float t = stopTimer(); + //printf("Init time = %f\n", t); + iter_total+=t; + + host_state.frontier1_size[0] = 1; + while(*host_state.frontier1_size) { + startTimer(); + iters++; + int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + update_edges<<>>(graph, device_state, iters); + //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); + + host_state.frontier1_size[0] = 0; + cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + + swap_queues(device_state); + + cudaCheckLastError(); + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t), cudaMemcpyDeviceToHost); + + t = stopTimer(); + //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + iter_total += t; + } + + //printf("Num iters = %d\n", iters); + //printf("Time elapsed = %f\n", iter_total); + total_time += iter_total; + + } + //printf("Total time = %f\n", total_time); + if (argc > 2) + if (argv[2][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); + cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + for (int i = 0; i < graph.num_vertices; i++) + //fprintf(output, "%d, %d\n", i, host_state.SP[i]); + printf("%d\n", host_state.SP[i]); + }else if (argv[2][0] == 'c'){ + /* + for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) + printf("%d: %d\n", i, counters[i]); + */ + } + + return 0; + +} From 57bea68434bf5815e2682157d0ed47ddd1619560 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Tue, 8 Oct 2019 18:03:37 -0400 Subject: [PATCH 24/88] converting a true delta stepping version to use the runtime libraries --- test/gpu_tests/all_gpu_tests.py | 11 +- .../test_input/sssp_delta_stepping.cu | 118 ++++++++---------- 2 files changed, 59 insertions(+), 70 deletions(-) diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 3b624078..9de9164a 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -33,9 +33,12 @@ def get_command_output(self, command): self.assertEqual(exitcode, 0) return output - def sssp_verified_test(self, input_file_name): + def sssp_verified_test(self, input_file_name, use_delta=False): self.cpp_compile_test(input_file_name, []) - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") + if use_delta: + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 2 v > verifier_input ") + else: + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") @@ -95,10 +98,10 @@ def test_sssp_lp_verified(self): self.sssp_verified_test("sssp_lp.cu") def test_sssp_delta_stepping(self): - self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.mtx", "v"]) + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/4.wel", "v"]) def test_sssp_delta_stepping_verified(self): - self.sssp_verified_test("sssp_delta_stepping.cu") + 
self.sssp_verified_test("sssp_delta_stepping.cu", True) if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index b7411c4a..b7b68bf5 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -1,5 +1,4 @@ - -#include "graph.h" +#include "gpu_intrinsics.h" #include #define ITER_COUNT (1) @@ -40,19 +39,7 @@ typedef struct { int32_t *new_window_start; }algo_state; -struct timeval start_time_; -struct timeval elapsed_time_; - -void startTimer(){ - gettimeofday(&start_time_, NULL); -} -float stopTimer(){ - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - return elapsed_time_.tv_sec + elapsed_time_.tv_usec/1e6; -} void cudaCheckLastError(void) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -62,14 +49,15 @@ void cudaCheckLastError(void) { #define VIRTUAL_WARP_SIZE (32) #define NUM_THREADS (1024) +#define NUM_BLOCKS (80) #define CTA_SIZE (1024) #define WARP_SIZE (32) #define STAGE_1_SIZE (8) -void __global__ init_kernel(GraphT graph, algo_state device_state) { +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; - int total_work = graph.num_nodes; + int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; for (int i = 0; i < work_per_thread; i++) { int id = num_threads * i + thread_id; @@ -80,7 +68,7 @@ void __global__ init_kernel(GraphT graph, algo_state device_state) { } if (thread_id == 0) { device_state.SP[0] = 0; - device_state.frontier1[graph.num_nodes] = 0; + device_state.frontier1[graph.num_vertices] = 0; device_state.frontier1_size[0] = 1; device_state.frontier1_size[1] = 1; device_state.frontier1_size[2] = 0; @@ -88,7 +76,7 @@ void __global__ init_kernel(GraphT graph, algo_state device_state) { device_state.frontier1_size[4] = 0; } } -__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync(-1, v, leader); } +__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync((uint32_t)-1, v, leader); } __device__ inline int atomicAggInc(int *ctr) { int32_t lane_id = threadIdx.x % 32; @@ -105,9 +93,9 @@ __device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t new_d if (new_dist < device_state.window_upper) device_state.frontier2[v] = 1 ; } -void __global__ update_edges (GraphT graph, algo_state device_state, int32_t curr_iter) { +void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; + //int num_threads = blockDim.x * gridDim.x; int lane_id = thread_id % 32; __shared__ int32_t stage2_queue[CTA_SIZE]; @@ -138,18 +126,18 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur if (my_vertex_idx < total_vertices) { //my_vertex = device_state.frontier1[my_vertex_idx]; if (my_vertex_idx < device_state.frontier1_size[1]) { - my_vertex = device_state.frontier1[graph.num_nodes + my_vertex_idx]; + my_vertex = device_state.frontier1[graph.num_vertices + my_vertex_idx]; } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2]) { - my_vertex = device_state.frontier1[graph.num_nodes * 2 + my_vertex_idx - 
device_state.frontier1_size[1]]; + my_vertex = device_state.frontier1[graph.num_vertices * 2 + my_vertex_idx - device_state.frontier1_size[1]]; } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2] + device_state.frontier1_size[3]) { - my_vertex = device_state.frontier1[graph.num_nodes * 3 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2]]; + my_vertex = device_state.frontier1[graph.num_vertices * 3 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2]]; } else { - my_vertex = device_state.frontier1[graph.num_nodes * 4 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2] - device_state.frontier1_size[3]]; + my_vertex = device_state.frontier1[graph.num_vertices * 4 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2] - device_state.frontier1_size[3]]; } // Step 1 segreggate vertices into shared buffers if (thread_id % (STAGE_1_SIZE) == 0 ) { - d = graph.degrees_d[my_vertex]; - row_offset = graph.row_offsets_d[my_vertex]; + d = graph.d_get_degree(my_vertex); + row_offset = graph.d_src_offsets[my_vertex]; int32_t s3_size = d/CTA_SIZE; d = d - s3_size * CTA_SIZE; if (s3_size) { @@ -176,8 +164,8 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur __syncthreads(); - d = __shfl_sync(-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - s1_offset = __shfl_sync(-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); + s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); int32_t src_distance; if (my_vertex_idx < total_vertices) { // STAGE 1 @@ -185,8 +173,8 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur src_distance = device_state.SP[my_vertex]; for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { // DO ACTUAL SSSP - int32_t dst = graph.edges_d[neigh_id]; - int32_t new_dst = graph.edge_weights_d[neigh_id] + src_distance; + int32_t dst = graph.d_edge_dst[neigh_id]; + int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); @@ -200,7 +188,7 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur if (lane_id == 0) { to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; } - to_process = __shfl_sync(-1, to_process, 0, 32); + to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); if (to_process < 0) break; my_vertex = stage2_queue[to_process]; @@ -210,8 +198,8 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { // DO ACTUAL SSSP - int dst = graph.edges_d[neigh_id]; - int new_dst = graph.edge_weights_d[neigh_id] + src_distance; + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); @@ -228,8 +216,8 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { // DO ACTUAL SSSP - int dst = graph.edges_d[neigh_id]; - int 
new_dst = graph.edge_weights_d[neigh_id] + src_distance; + int dst = graph.d_edge_dst[neigh_id]; + int new_dst = graph.d_edge_weight[neigh_id] + src_distance; if (new_dst < device_state.SP[dst]) { atomicMin(&device_state.SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); @@ -237,34 +225,34 @@ void __global__ update_edges (GraphT graph, algo_state device_state, int32_t cur } } } -void __global__ update_nodes (GraphT graph, algo_state device_state) { +void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int warp_id = thread_id / 32; - int total_work = graph.num_nodes; + int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_nodes) { + if (node_id < graph.num_vertices) { if (device_state.frontier2[node_id]) { device_state.frontier2[node_id] = 0; int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); - device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_nodes] = node_id; + device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; } } } } -void __global__ update_nodes_identify_min(GraphT graph, algo_state device_state) { +void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; - int total_work = graph.num_nodes; + int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; int32_t my_minimum = INT_MAX; for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_nodes) { + if (node_id < graph.num_vertices) { if (device_state.SP[node_id] >= device_state.window_upper && device_state.SP[node_id] != INT_MAX && device_state.SP[node_id] < my_minimum) { my_minimum = device_state.SP[node_id]; } @@ -274,37 +262,37 @@ void __global__ update_nodes_identify_min(GraphT graph, algo_state device_state) atomicMin(device_state.new_window_start, my_minimum); } } -void __global__ update_nodes_special(GraphT graph, algo_state device_state) { +void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int warp_id = thread_id / 32; - int total_work = graph.num_nodes; + int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_nodes) { + if (node_id < graph.num_vertices) { if(device_state.SP[node_id] >= device_state.window_lower && device_state.SP[node_id] < device_state.window_upper) { int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); - device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_nodes] = node_id; + device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; } } } } -void allocate_state(algo_state &host_state, algo_state &device_state, GraphT &graph) { - host_state.SP = new int[graph.num_nodes]; +void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { + host_state.SP = new int[graph.num_vertices]; host_state.output_size = new int32_t[1]; 
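   // Allocation layout used below (annotation): device frontier1 is 5 * num_vertices ints so
   // the four warp-group buckets (regions 1-4, indexed by warp_id % 4 in update_nodes and
   // update_nodes_special) each get a num_vertices-wide region; frontier1_size holds the
   // matching five counters; frontier2 is a one-byte-per-vertex membership map set by
   // enqueueVertex rather than a compacted queue.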
host_state.new_window_start = new int32_t[1]; host_state.frontier1_size = new int32_t[1]; - host_state.frontier1 = new int32_t[graph.num_nodes]; + host_state.frontier1 = new int32_t[graph.num_vertices]; host_state.more_elems = new int32_t(); - cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_nodes); + cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); - cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_nodes * 5); - cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_nodes ); + cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 5); + cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_vertices ); cudaMalloc(&device_state.frontier1_size, 5*sizeof(int32_t)); //cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); @@ -329,9 +317,9 @@ void swap_queues(algo_state &device_state) { int main(int argc, char *argv[]) { cudaSetDevice(0); cudaThreadSetCacheConfig(cudaFuncCachePreferShared); - GraphT graph; + gpu_runtime::GraphT graph; - int32_t *new_indices = load_graph(argv[1], false, graph); + gpu_runtime::load_graph(graph, argv[1], false); int32_t delta = atoi(argv[2]); algo_state host_state, device_state; @@ -356,7 +344,7 @@ int main(int argc, char *argv[]) { int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); - printf("Init time = %f\n", t); + //printf("Init time = %f\n", t); iter_total+=t; host_state.frontier1_size[0] = 1; @@ -366,8 +354,6 @@ int main(int argc, char *argv[]) { while(*host_state.frontier1_size) { startTimer(); iters++; - int num_blocks = NUM_BLOCKS; - int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; @@ -417,22 +403,22 @@ int main(int argc, char *argv[]) { } t = stopTimer(); - printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); iter_total += t; } - printf("Num iters = %d\n", iters); - printf("Time elapsed = %f\n", iter_total); + //printf("Num iters = %d\n", iters); + //printf("Time elapsed = %f\n", iter_total); total_time += iter_total; } - printf("Total time = %f\n", total_time); + //printf("Total time = %f\n", total_time); if (argc > 3) - if (argv[3][0] == 'o'){ - FILE *output = fopen("output.txt", "w"); - cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_nodes, cudaMemcpyDeviceToHost); - for (int i = 0; i < graph.num_nodes; i++) - fprintf(output, "%d, %d\n", i, host_state.SP[i]); + if (argv[3][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); + cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + for (int i = 0; i < graph.num_vertices; i++) + printf("%d\n", host_state.SP[i]); }else if (argv[2][0] == 'c'){ /* for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) From 09c6e745e227e823155c2f40affc844cfd079193 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 11:54:57 -0400 Subject: [PATCH 25/88] gradually adding library rountines for frontier and load balance --- .../test_input/sssp_delta_stepping.cu | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index b7b68bf5..3195d5e3 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -39,6 +39,10 @@ typedef struct { 
int32_t *new_window_start; }algo_state; +int32_t __device__ *SP; +int32_t *__host_SP; +int32_t *__device_SP; + void cudaCheckLastError(void) { cudaError_t err = cudaGetLastError(); @@ -93,6 +97,26 @@ __device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t new_d if (new_dist < device_state.window_upper) device_state.frontier2[v] = 1 ; } + +bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { + bool output2; + bool SP_trackving_var_1 = 0; + SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); + output2 = SP_trackving_var_1; + return output2; +} + +template +void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { + // Body of the actual operator code + EdgeWeightType weight = graph.d_edge_weight[edge_id]; + if (updateEdge(src, dst, weight)) { + gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst); + } +} + + + void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; //int num_threads = blockDim.x * gridDim.x; @@ -330,6 +354,10 @@ int main(int argc, char *argv[]) { host_state.window_upper = delta; device_state.window_lower = 0; device_state.window_upper = delta; + + gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + gpu_runtime::builtin_addVertex(frontier, 0); + cudaDeviceSynchronize(); @@ -339,7 +367,6 @@ int main(int argc, char *argv[]) { float iter_total = 0; startTimer(); - startTimer(); init_kernel<<>>(graph, device_state); int iters = 0; cudaDeviceSynchronize(); @@ -358,7 +385,9 @@ int main(int argc, char *argv[]) { int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; update_edges<<>>(graph, device_state, iters); - + //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); + //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); + host_state.frontier1_size[0] = 0; host_state.frontier1_size[1] = 0; host_state.frontier1_size[2] = 0; From e31115c99d9b6b4aa075ed9534da801e25ed0049 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 15:37:45 -0400 Subject: [PATCH 26/88] adding code to allocate the SP array --- .../test_input/sssp_delta_stepping.cu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 3195d5e3..15a8ee4d 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -338,16 +338,26 @@ void swap_queues(algo_state &device_state) { //swap_pointers(&device_state.frontier1, &device_state.frontier2); //swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); } + +void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { + SP[v] = 2147483647; +} + + int main(int argc, char *argv[]) { cudaSetDevice(0); cudaThreadSetCacheConfig(cudaFuncCachePreferShared); gpu_runtime::GraphT graph; - gpu_runtime::load_graph(graph, argv[1], false); int32_t delta = atoi(argv[2]); - algo_state host_state, device_state; + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); + cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); + __host_SP = new 
int32_t[gpu_runtime::builtin_getVertices(graph)]; + gpu_runtime::vertex_set_apply_kernel<<>>(gpu_runtime::builtin_getVertices(graph)); + + algo_state host_state, device_state; allocate_state(host_state, device_state, graph); host_state.window_lower = 0; @@ -357,8 +367,6 @@ int main(int argc, char *argv[]) { gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); gpu_runtime::builtin_addVertex(frontier, 0); - - cudaDeviceSynchronize(); @@ -386,7 +394,7 @@ int main(int argc, char *argv[]) { update_edges<<>>(graph, device_state, iters); //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); - //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); + gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); host_state.frontier1_size[0] = 0; host_state.frontier1_size[1] = 0; From d5923d26f7fc9fe66fb4b9cdda4efb55817d8411 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 14 Oct 2019 17:04:37 -0400 Subject: [PATCH 27/88] Made the vertexset apply operator more general --- include/graphit/midend/mir_context.h | 12 ++++++++++++ src/backend/codegen_gpu/codegen_gpu.cpp | 21 +++++++++++++++------ src/runtime_lib/infra_gpu/graph.h | 6 ++++++ src/runtime_lib/infra_gpu/load_balance.h | 14 ++++++++------ 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 48293cbe..eae515ff 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -152,6 +152,7 @@ namespace graphit { std::vector getEdgeSets() { return const_edge_sets_; } + mir::VarDecl::Ptr getConstEdgeSetByName(std::string var_name) { @@ -257,6 +258,17 @@ namespace graphit { } } + mir::VarDecl::Ptr getEdgeSetFromElementType(mir::ElementType::Ptr element_type) { + for (auto decl: getEdgeSets()) { + mir::Type::Ptr type = decl->type; + assert(mir::isa(type)); + mir::EdgeSetType::Ptr edge_set_type = mir::to(type); + if (edge_set_type->element == element_type) + return decl; + } + return nullptr; + } + bool updateElementInputFilename(mir::ElementType::Ptr element_type, mir::Expr::Ptr file_name) { input_filename_map_[element_type->ident] = file_name; return true; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index d7d73bde..7fe31397 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -1081,17 +1081,26 @@ void CodeGenGPU::visit(mir::BreakStmt::Ptr break_stmt) { oss << "break;" << std::endl; } void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { - oss << "gpu_runtime::vertex_set_apply_kernel"; - oss << "<" << vsae->input_function_name << ">"; - oss << "<<>>"; + oss << "gpu_runtime::vertex_set_apply_kernel<"; auto mir_var = mir::to (vsae->target); + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "gpu_runtime::AccessorAll"; + } else { + oss << "gpu_runtime::AccessorSparse"; + } + oss << ", "; + oss << vsae->input_function_name << ">"; + oss << "<<>>"; if (mir_context_->isConstVertexSet(mir_var->var.getName())) { auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(mir_var->var.getName()); assert(associated_element_type != nullptr); - auto associated_element_type_size = mir_context_->getElementCount(associated_element_type); - assert(associated_element_type_size != nullptr); + //auto associated_element_type_size = 
mir_context_->getElementCount(associated_element_type); + //assert(associated_element_type_size != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); oss << "("; - associated_element_type_size->accept(this); + //associated_element_type_size->accept(this); + oss << associated_edge_set->name << ".getFullFrontier()"; oss << ")"; } else { oss << "("; diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 1fcaaf8c..f5fbe316 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -7,6 +7,7 @@ // GraphT data structure #define IGNORE_JULIENNE_TYPES #include "infra_gapbs/benchmark.h" +#include "infra_gpu/vertex_frontier.h" namespace gpu_runtime { template @@ -33,6 +34,11 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t __device__ d_get_degree(int32_t vertex_id) { return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; } + VertexFrontier getFullFrontier(void) { + VertexFrontier frontier; + frontier.max_num_elems = num_vertices; + return frontier; + } }; void consume(int32_t _) { } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index f4c6231d..a7bf9ff3 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -13,15 +13,17 @@ using load_balance_payload_type = void (GraphT, int32_t, int32_t // VERTEX SET APPLY FUNCTIONS -template -static void __device__ vertex_set_apply(int32_t num_vertices) { - for(int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; vid < num_vertices; vid+= blockDim.x * gridDim.x) { +template +static void __device__ vertex_set_apply(VertexFrontier &frontier) { + int32_t total_vertices = AccessorType::getSize(frontier); + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < total_vertices; vidx += blockDim.x * gridDim.x) { + int32_t vid = AccessorType::getElement(frontier, vidx); body(vid); } } -template -static void __global__ vertex_set_apply_kernel(int32_t num_vertices) { - vertex_set_apply(num_vertices); +template +static void __global__ vertex_set_apply_kernel(VertexFrontier frontier) { + vertex_set_apply(frontier); } // VERTEX BASED LOAD BALANCE FUNCTIONS From d4d0ec53e1881d95ce2544e4a4b64757fd511384 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 14 Oct 2019 17:23:17 -0400 Subject: [PATCH 28/88] Added test cases for vertex set apply kernel and fixed the call in sssp_delta_stepping test --- .../gpu_tests/test_input/runtime_lib_tests.cu | 60 +++++++++++++++++++ .../test_input/sssp_delta_stepping.cu | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu index 4bc6a82a..c16a6173 100644 --- a/test/gpu_tests/test_input/runtime_lib_tests.cu +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -26,6 +26,66 @@ TEST_F(GPURuntimeLibTest, SimplePriorityQueueTest){ EXPECT_EQ (14, num_vertices); } +__device__ int32_t* test_array_1; +void __device__ vertex_set_apply_all_test_function(int32_t vid) { + test_array_1[vid] += 1; +} + +TEST_F(GPURuntimeLibTest, VertexSetApplyAllTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + EXPECT_EQ (14, num_vertices); + + int32_t *test_array; + cudaMalloc(&test_array, num_vertices * sizeof(int32_t)); + 
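+	// test_array_1 is a __device__ global pointer, so the freshly allocated buffer is bound
+	// to it via cudaMemcpyToSymbol below (copying just the pointer value, not the data);
+	// the kernel's body function then executes test_array_1[vid] += 1 for every vertex.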
cudaMemcpyToSymbol(test_array_1, &test_array, sizeof(int32_t*), 0); + + int32_t *test_array_host = new int32_t[num_vertices]; + cudaMemset(test_array, 0, sizeof(int32_t) * num_vertices); + + gpu_runtime::vertex_set_apply_kernel<<>>(edges.getFullFrontier()); + + cudaMemcpy(test_array_host, test_array, sizeof(int32_t) * num_vertices, cudaMemcpyDeviceToHost); + cudaFree(test_array); + for (int32_t index = 0; index < num_vertices; index++) { + EXPECT_EQ(1, test_array_host[index]); + } +} + + +TEST_F(GPURuntimeLibTest, VertexSetApplySparseTest) { + gpu_runtime::GraphT edges; + gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); + int num_vertices = gpu_runtime::builtin_getVertices(edges); + EXPECT_EQ (14, num_vertices); + + int32_t *test_array; + cudaMalloc(&test_array, num_vertices * sizeof(int32_t)); + cudaMemcpyToSymbol(test_array_1, &test_array, sizeof(int32_t*), 0); + + int32_t *test_array_host = new int32_t[num_vertices]; + cudaMemset(test_array, 0, sizeof(int32_t) * num_vertices); + + gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(num_vertices); + + builtin_addVertex(frontier, 0); + builtin_addVertex(frontier, 7); + builtin_addVertex(frontier, 13); + + + gpu_runtime::vertex_set_apply_kernel<<>>(frontier); + + cudaMemcpy(test_array_host, test_array, sizeof(int32_t) * num_vertices, cudaMemcpyDeviceToHost); + cudaFree(test_array); + for (int32_t index = 0; index < num_vertices; index++) { + if (index == 0 || index == 7 || index == 13) + EXPECT_EQ(1, test_array_host[index]); + else + EXPECT_EQ(0, test_array_host[index]); + } +} + int main(int argc, char* argv[]) { if (argc < 2) { std::cout << "Test needs path to graph directory as first argument" << std::endl; diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 15a8ee4d..1bb4a33a 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -354,7 +354,7 @@ int main(int argc, char *argv[]) { cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; - gpu_runtime::vertex_set_apply_kernel<<>>(gpu_runtime::builtin_getVertices(graph)); + gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); algo_state host_state, device_state; From de32c7a4aebe623396c062a00d12f89db0bc8822 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 17:24:57 -0400 Subject: [PATCH 29/88] fixing the initialization of SP on device --- test/gpu_tests/all_gpu_tests.py | 10 ++++- .../test_input/sssp_delta_stepping.cu | 38 ++++++++++++------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 52d47e86..0084b8b7 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -39,8 +39,14 @@ def sssp_verified_test(self, input_file_name, use_delta=False): self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 2 v > verifier_input ") else: self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") - self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") - + output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t 
verifier_input -r 0") + test_flag = False + for line in output.rstrip().split("\n"): + if line.rstrip().find("SUCCESSFUL") != -1: + test_flag = True + break; + self.assertEqual(test_flag, True) + @classmethod def setUpClass(cls): if NVCC_COMPILER == "CUDA_NVCC_EXECUTABLE-NOTFOUND": diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 15a8ee4d..9c86c634 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -71,6 +71,9 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic } } if (thread_id == 0) { + //reset with the new data structure + SP[0] = 0; + device_state.SP[0] = 0; device_state.frontier1[graph.num_vertices] = 0; device_state.frontier1_size[0] = 1; @@ -194,13 +197,19 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev if (my_vertex_idx < total_vertices) { // STAGE 1 //my_vertex = device_state.frontier1[my_vertex_idx]; - src_distance = device_state.SP[my_vertex]; + + //src_distance = device_state.SP[my_vertex]; + src_distance = SP[my_vertex]; + for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { // DO ACTUAL SSSP int32_t dst = graph.d_edge_dst[neigh_id]; int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; - if (new_dst < device_state.SP[dst]) { - atomicMin(&device_state.SP[dst], new_dst); + + //if (new_dst < device_state.SP[dst]) { + if (new_dst < SP[dst]) { + //atomicMin(&device_state.SP[dst], new_dst); + atomicMin(&SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); } } @@ -218,14 +227,17 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev my_vertex = stage2_queue[to_process]; d = stage2_size[to_process]; int32_t s2_offset = stage2_offset[to_process]; - src_distance = device_state.SP[my_vertex]; + + //src_distance = device_state.SP[my_vertex]; + src_distance = SP[my_vertex]; for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { // DO ACTUAL SSSP int dst = graph.d_edge_dst[neigh_id]; int new_dst = graph.d_edge_weight[neigh_id] + src_distance; - if (new_dst < device_state.SP[dst]) { - atomicMin(&device_state.SP[dst], new_dst); + //if (new_dst < device_state.SP[dst]) { + if (new_dst < SP[dst]) { + atomicMin(&SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); } } @@ -236,14 +248,14 @@ void __global__ update_edges (gpu_runtime::GraphT graph, algo_state dev my_vertex = stage3_queue[wid]; d = stage3_size[wid]; int32_t s3_offset = stage3_offset[wid]; - src_distance = device_state.SP[my_vertex]; + src_distance = SP[my_vertex]; for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { // DO ACTUAL SSSP int dst = graph.d_edge_dst[neigh_id]; int new_dst = graph.d_edge_weight[neigh_id] + src_distance; - if (new_dst < device_state.SP[dst]) { - atomicMin(&device_state.SP[dst], new_dst); + if (new_dst < SP[dst]) { + atomicMin(&SP[dst], new_dst); enqueueVertex(dst, device_state, new_dst); } } @@ -277,8 +289,8 @@ void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, al for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { - if (device_state.SP[node_id] >= device_state.window_upper && device_state.SP[node_id] != INT_MAX && device_state.SP[node_id] < my_minimum) { - my_minimum = device_state.SP[node_id]; + if (SP[node_id] >= 
device_state.window_upper && SP[node_id] != INT_MAX && SP[node_id] < my_minimum) { + my_minimum = SP[node_id]; } } } @@ -296,7 +308,7 @@ void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_st for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { - if(device_state.SP[node_id] >= device_state.window_lower && device_state.SP[node_id] < device_state.window_upper) { + if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; } @@ -453,7 +465,7 @@ int main(int argc, char *argv[]) { if (argc > 3) if (argv[3][0] == 'v'){ //FILE *output = fopen("output.txt", "w"); - cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + cudaMemcpy(host_state.SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); for (int i = 0; i < graph.num_vertices; i++) printf("%d\n", host_state.SP[i]); }else if (argv[2][0] == 'c'){ From 8fe8e412bea5c7a9947ac84c7b6df1a67768f2eb Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 14 Oct 2019 20:01:08 -0400 Subject: [PATCH 30/88] Fixed the broken runtime test and added graphit compile and exec tests --- include/graphit/midend/mir_context.h | 6 +++- test/c++/test.cpp | 1 + test/gpu_tests/all_gpu_tests.py | 35 ++++++++++++++++++- .../test_input/default_gpu_schedule.gt | 3 ++ .../gpu_tests/test_input/simple_graph_load.gt | 8 +++++ 5 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test/gpu_tests/test_input/default_gpu_schedule.gt create mode 100644 test/gpu_tests/test_input/simple_graph_load.gt diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index eae515ff..4fb571ff 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -263,7 +263,11 @@ namespace graphit { mir::Type::Ptr type = decl->type; assert(mir::isa(type)); mir::EdgeSetType::Ptr edge_set_type = mir::to(type); - if (edge_set_type->element == element_type) + if (edge_set_type->vertex_element_type_list == nullptr) + continue; + if (edge_set_type->vertex_element_type_list->size() !=2) + continue; + if ((*(edge_set_type->vertex_element_type_list))[0]->ident == element_type->ident && (*(edge_set_type->vertex_element_type_list))[1]->ident == element_type->ident) return decl; } return nullptr; diff --git a/test/c++/test.cpp b/test/c++/test.cpp index ef86d510..3c281351 100644 --- a/test/c++/test.cpp +++ b/test/c++/test.cpp @@ -93,6 +93,7 @@ int main(int argc, char **argv) { // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRCCPullParallelDifferentSegments"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRPullParallelNumaAware"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.PRPullParallelNumaAware"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.BFSBasicSimpleGPUScheduleTest"; return RUN_ALL_TESTS(); diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 52d47e86..2b1a86f3 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -72,13 +72,39 @@ def setUpClass(cls): shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") cls.graph_directory = cls.scratch_directory + "/graphs" cls.executable_name = cls.scratch_directory + "/test_exectuable" + 
cls.cuda_filename = cls.scratch_directory + "/test_cpp.cu" + + cls.graphitc_py = GRAPHIT_BUILD_DIRECTORY + "/bin/graphitc.py" + def cpp_compile_test(self, input_file_name, extra_cpp_args=[]): - compile_command = self.nvcc_command + self.test_input_directory + "/" + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) + if input_file_name[0] == "/": + compile_command = self.nvcc_command + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) + else: + compile_command = self.nvcc_command + self.test_input_directory + "/" + input_file_name + " -o " + self.executable_name + " " + " ".join(extra_cpp_args) self.get_command_output(compile_command) def cpp_exec_test(self, input_file_name, extra_cpp_args=[], extra_exec_args=[]): self.cpp_compile_test(input_file_name, extra_cpp_args) return self.get_command_output(self.executable_name + " " + " ".join(extra_exec_args)) + + def graphit_generate_test(self, input_file_name, input_schedule_name=""): + if input_file_name[0] != "/": + input_file_name = self.test_input_directory + "/" + input_file_name + if input_schedule_name != "" and input_schedule_name[0] != "/": + input_schedule_name = self.test_input_directory + "/" + input_schedule_name + + if input_schedule_name != "": + self.get_command_output("python " + self.graphitc_py + " -a " + input_file_name + " -f " + input_schedule_name + " -o " + self.cuda_filename) + else: + self.get_command_output("python " + self.graphitc_py + " -f " + input_file_name + " -o " + self.cuda_filename) + + def graphit_compile_test(self, input_file_name, input_schedule_name="", extra_cpp_args=[]): + self.graphit_generate_test(input_file_name, input_schedule_name) + self.cpp_compile_test(self.cuda_filename, extra_cpp_args) + + def graphit_exec_test(self, input_file_name, input_schedule_name="", extra_cpp_args=[], extra_exec_args=[]): + self.graphit_generate_test(input_file_name, input_schedule_name) + return self.cpp_exec_test(self.cuda_filename, extra_cpp_args, extra_exec_args) def test_basic_compile(self): self.cpp_compile_test("basic_compile.cu") @@ -101,6 +127,13 @@ def test_sssp_delta_stepping(self): def test_sssp_delta_stepping_verified(self): self.sssp_verified_test("sssp_delta_stepping.cu", True) + + def test_simple_graphit_exec(self): + output = self.graphit_exec_test("simple_graph_load.gt", "default_gpu_schedule.gt", [], [self.graph_directory + "/simple_mtx.mtx"]) + output = output.split("\n") + self.assertEqual(len(output), 2) + self.assertEqual(output[0], "14") + if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/default_gpu_schedule.gt b/test/gpu_tests/test_input/default_gpu_schedule.gt new file mode 100644 index 00000000..ae99ff2b --- /dev/null +++ b/test/gpu_tests/test_input/default_gpu_schedule.gt @@ -0,0 +1,3 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/simple_graph_load.gt b/test/gpu_tests/test_input/simple_graph_load.gt new file mode 100644 index 00000000..9af25ff8 --- /dev/null +++ b/test/gpu_tests/test_input/simple_graph_load.gt @@ -0,0 +1,8 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex, Vertex, int) = load (argv[1]); + +func main() + #s1# print edges.getVertices(); +end From 51ccc4ddbb53b34aa8cdeb073aa44c13cee6643e Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 20:44:12 -0400 Subject: [PATCH 31/88] switching to the frontier and prepare sparse APIs --- .../test_input/sssp_delta_stepping.cu | 61 
++++++++++++++----- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 8a520809..aabb95fd 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -106,6 +106,11 @@ bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { bool SP_trackving_var_1 = 0; SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; + + //if (SP[dst] < device_state.window_upper){ + //output2 = true; + //} + return output2; } @@ -113,8 +118,9 @@ template void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { // Body of the actual operator code EdgeWeightType weight = graph.d_edge_weight[edge_id]; - if (updateEdge(src, dst, weight)) { - gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst); + if (updateEdge(src, dst, weight)){ + //gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst); + gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); } } @@ -298,7 +304,7 @@ void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, al atomicMin(device_state.new_window_start, my_minimum); } } -void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state) { +void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state, gpu_runtime::VertexFrontier output_frontier) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int warp_id = thread_id / 32; @@ -309,8 +315,9 @@ void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_st int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { - int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); - device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; + gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); + //int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); + //device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; } } } @@ -377,11 +384,13 @@ int main(int argc, char *argv[]) { device_state.window_lower = 0; device_state.window_upper = delta; + //this sets it to Sparse gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); gpu_runtime::builtin_addVertex(frontier, 0); cudaDeviceSynchronize(); - + + float total_time = 0; for (int outer = 0; outer < ITER_COUNT; outer++) { float iter_total = 0; @@ -396,16 +405,18 @@ int main(int argc, char *argv[]) { host_state.frontier1_size[0] = 1; - - - while(*host_state.frontier1_size) { + //while(*host_state.frontier1_size) { + while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ startTimer(); iters++; int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - update_edges<<>>(graph, device_state, iters); - //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); + 
//update_edges<<>>(graph, device_state, iters); + //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); + + gpu_runtime::vertex_set_prepare_sparse(frontier); + gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); host_state.frontier1_size[0] = 0; @@ -415,7 +426,15 @@ int main(int argc, char *argv[]) { host_state.frontier1_size[4] = 0; cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); - update_nodes<<>>(graph, device_state); + //update_nodes<<>>(graph, device_state); + + + gpu_runtime::swap_bytemaps(frontier); + // set the input to the prepare function + frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + + + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); host_state.frontier1_size[0] = host_state.frontier1_size[1]; host_state.frontier1_size[0] += host_state.frontier1_size[2]; @@ -424,14 +443,20 @@ int main(int argc, char *argv[]) { cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); - if (host_state.frontier1_size[0] == 0) { + //if (host_state.frontier1_size[0] == 0) { + if (gpu_runtime::builtin_getVertexSetSize(frontier) == (0)) { host_state.new_window_start[0] = INT_MAX; cudaMemcpy(device_state.new_window_start, host_state.new_window_start, sizeof(int32_t), cudaMemcpyHostToDevice); + + //should not need to change update_nodes_identify_min<<>>(graph, device_state); cudaMemcpy(host_state.new_window_start, device_state.new_window_start, sizeof(int32_t), cudaMemcpyDeviceToHost); + + //this is for termination when it is all finished if (host_state.new_window_start[0] == INT_MAX) { break; - } + } + device_state.window_lower = host_state.new_window_start[0]; device_state.window_upper = host_state.new_window_start[0] + delta; host_state.frontier1_size[0] = 0; @@ -442,7 +467,13 @@ int main(int argc, char *argv[]) { host_state.frontier1_size[3] = 0; host_state.frontier1_size[4] = 0; cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); - update_nodes_special<<>>( graph, device_state); + + + update_nodes_special<<>>( graph, device_state, frontier); + gpu_runtime::swap_queues(frontier); + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + + cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); host_state.frontier1_size[0] = host_state.frontier1_size[1]; host_state.frontier1_size[0] += host_state.frontier1_size[2]; From f5f90335bf28b943c1e10048cfd2ada3164efa74 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 20:48:01 -0400 Subject: [PATCH 32/88] comment out the unncessary parts of the code --- .../test_input/sssp_delta_stepping.cu | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index aabb95fd..1aab7db0 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -307,7 +307,7 @@ void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, al void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state, gpu_runtime::VertexFrontier output_frontier) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; - int warp_id = thread_id / 32; + //int warp_id = 
thread_id / 32; int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; @@ -409,8 +409,8 @@ int main(int argc, char *argv[]) { while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ startTimer(); iters++; - int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); - int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + //int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); + //int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; //update_edges<<>>(graph, device_state, iters); //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); @@ -419,12 +419,12 @@ int main(int argc, char *argv[]) { gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); - host_state.frontier1_size[0] = 0; - host_state.frontier1_size[1] = 0; - host_state.frontier1_size[2] = 0; - host_state.frontier1_size[3] = 0; - host_state.frontier1_size[4] = 0; - cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); + // host_state.frontier1_size[0] = 0; + // host_state.frontier1_size[1] = 0; + // host_state.frontier1_size[2] = 0; + // host_state.frontier1_size[3] = 0; + // host_state.frontier1_size[4] = 0; + // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); //update_nodes<<>>(graph, device_state); @@ -435,12 +435,12 @@ int main(int argc, char *argv[]) { - cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); - host_state.frontier1_size[0] = host_state.frontier1_size[1]; - host_state.frontier1_size[0] += host_state.frontier1_size[2]; - host_state.frontier1_size[0] += host_state.frontier1_size[3]; - host_state.frontier1_size[0] += host_state.frontier1_size[4]; - cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + // cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); + // host_state.frontier1_size[0] = host_state.frontier1_size[1]; + // host_state.frontier1_size[0] += host_state.frontier1_size[2]; + // host_state.frontier1_size[0] += host_state.frontier1_size[3]; + // host_state.frontier1_size[0] += host_state.frontier1_size[4]; + // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); //if (host_state.frontier1_size[0] == 0) { @@ -459,14 +459,14 @@ int main(int argc, char *argv[]) { device_state.window_lower = host_state.new_window_start[0]; device_state.window_upper = host_state.new_window_start[0] + delta; - host_state.frontier1_size[0] = 0; + // host_state.frontier1_size[0] = 0; - host_state.frontier1_size[0] = 0; - host_state.frontier1_size[1] = 0; - host_state.frontier1_size[2] = 0; - host_state.frontier1_size[3] = 0; - host_state.frontier1_size[4] = 0; - cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); + // host_state.frontier1_size[0] = 0; + // host_state.frontier1_size[1] = 0; + // host_state.frontier1_size[2] = 0; + // host_state.frontier1_size[3] = 0; + // host_state.frontier1_size[4] = 0; + // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); update_nodes_special<<>>( graph, device_state, frontier); @@ -474,12 +474,12 @@ int main(int argc, char *argv[]) { frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; - cudaMemcpy(host_state.frontier1_size, 
device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); - host_state.frontier1_size[0] = host_state.frontier1_size[1]; - host_state.frontier1_size[0] += host_state.frontier1_size[2]; - host_state.frontier1_size[0] += host_state.frontier1_size[3]; - host_state.frontier1_size[0] += host_state.frontier1_size[4]; - cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); + // cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); + // host_state.frontier1_size[0] = host_state.frontier1_size[1]; + // host_state.frontier1_size[0] += host_state.frontier1_size[2]; + // host_state.frontier1_size[0] += host_state.frontier1_size[3]; + // host_state.frontier1_size[0] += host_state.frontier1_size[4]; + // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); } t = stopTimer(); From 653fca83f35742572a1b85ba7b60f13b23ebc10f Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Mon, 14 Oct 2019 21:25:05 -0400 Subject: [PATCH 33/88] adding support for windowing in the updateEdge function --- test/gpu_tests/test_input/sssp_delta_stepping.cu | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 1aab7db0..f9794e85 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -43,6 +43,8 @@ int32_t __device__ *SP; int32_t *__host_SP; int32_t *__device_SP; +int32_t __device__ window_lower; +int32_t __device__ window_upper; void cudaCheckLastError(void) { cudaError_t err = cudaGetLastError(); @@ -107,7 +109,8 @@ bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; - //if (SP[dst] < device_state.window_upper){ + //do not output this if it is not within the current window + if (SP[dst] >= window_upper) return false; //output2 = true; //} @@ -416,7 +419,9 @@ int main(int argc, char *argv[]) { //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); gpu_runtime::vertex_set_prepare_sparse(frontier); - + + + cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t*), 0); gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); // host_state.frontier1_size[0] = 0; @@ -456,9 +461,11 @@ int main(int argc, char *argv[]) { if (host_state.new_window_start[0] == INT_MAX) { break; } - + + //if it is not a pointer, then you can set by value directly device_state.window_lower = host_state.new_window_start[0]; - device_state.window_upper = host_state.new_window_start[0] + delta; + device_state.window_upper = host_state.new_window_start[0] + delta; + // host_state.frontier1_size[0] = 0; // host_state.frontier1_size[0] = 0; From e3ad3b559dddeec76ff00fcb11c12494dd612e86 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Mon, 14 Oct 2019 22:41:13 -0400 Subject: [PATCH 34/88] Changed kernel fusion to use local copies instead of global variables --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 3 +- src/backend/codegen_gpu/codegen_gpu.cpp | 93 ++++++++----------- src/runtime_lib/infra_gpu/vertex_frontier.h | 42 ++++----- .../infra_gpu/vertex_representation.h | 4 - 4 files changed, 63 insertions(+), 79 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h 
b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 5f2b43b3..3ee8d5af 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -161,7 +161,8 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { virtual void visit(mir::PrintStmt::Ptr) override; std::string var_name (std::string var) { - return current_kernel_name + "_" + var; + //return current_kernel_name + "_" + var; + return "__local_" + var; } }; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 7fe31397..8d8dadb4 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -137,6 +137,12 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { oss << "grid_group _grid = this_grid();" << std::endl; codegen.printIndent(); oss << "int32_t _thread_id = threadIdx.x + blockIdx.x * blockDim.x;" << std::endl; + // For all the variables we would also generate local copies in each thread + for (auto var: extractor.hoisted_vars) { + codegen.printIndent(); + oss << "auto __local_" << var.getName() << " = " << fused_kernel_name << "_" << var.getName() << ";" << std::endl; + } + codegen.printIndent(); oss << "while ("; while_stmt->cond->accept(&codegen); @@ -146,6 +152,18 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { codegen.dedent(); codegen.printIndent(); oss << "}" << std::endl; + + // After the kernel has ended, we should copy back all the variables + codegen.printIndent(); + oss << "if (_thread_id == 0) {" << std::endl; + codegen.indent(); + for (auto var: extractor.hoisted_vars) { + codegen.printIndent(); + oss << fused_kernel_name << "_" << var.getName() << " = " << "__local_" << var.getName() << ";" << std::endl; + } + codegen.dedent(); + codegen.printIndent(); + oss << "}" << std::endl; codegen.dedent(); codegen.printIndent(); oss << "}" << std::endl; @@ -155,10 +173,6 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { void CodeGenGPUFusedKernel::visit(mir::StmtBlock::Ptr stmt_block) { for (auto stmt : *(stmt_block->stmts)) { stmt->accept(this); - if (!mir::isa(stmt)) { - printIndent(); - oss << "_grid.sync();" << std::endl; - } } } void CodeGenGPUKernelEmitter::genFuncDecl(mir::FuncDecl::Ptr func_decl) { @@ -454,7 +468,7 @@ void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { } void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { if (is_hoisted_var(var_expr->var)) { - oss << current_kernel_name << "_" << var_expr->var.getName(); + oss << "__local_" << var_expr->var.getName(); return; } oss << var_expr->var.getName(); @@ -602,18 +616,10 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, oss << var_name(esae->from_func) << ");" << std::endl; } } - printIndent(); - oss << "_grid.sync();" << std::endl; if (target != nullptr) { printIndent(); - oss << "if (_thread_id == 0)" << std::endl; - indent(); - printIndent(); - target->accept(this); + target->accept(this); oss << " = " << var_name(esae->from_func) << ";" << std::endl; - dedent(); - printIndent(); - oss << "_grid.sync();" << std::endl; } printIndent(); oss << load_balance_function << "_device<"; @@ -640,8 +646,6 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, else oss << "gpu_runtime::sentinel_frontier"; oss << ");" << std::endl; - printIndent(); - oss << "_grid.sync();" << std::endl; if (target != nullptr) { if (esae->applied_schedule.frontier_creation == 
fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { @@ -650,45 +654,25 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, target->accept(this); oss << ");" << std::endl; printIndent(); - oss << "_grid.sync();" << std::endl; - printIndent(); - oss << "if (_thread_id == 0)" << std::endl; - indent(); - printIndent(); target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; - dedent(); } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { printIndent(); oss << "gpu_runtime::swap_bitmaps_device("; target->accept(this); oss << ");" << std::endl; printIndent(); - oss << "_grid.sync();" << std::endl; - printIndent(); - oss << "if (_thread_id == 0)" << std::endl; - indent(); - printIndent(); target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::BITMAP;" << std::endl; - dedent(); } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { printIndent(); oss << "gpu_runtime::swap_bytemaps_device("; target->accept(this); oss << ");" << std::endl; printIndent(); - oss << "_grid.sync();" << std::endl; - printIndent(); - oss << "if (_thread_id == 0)" << std::endl; - indent(); - printIndent(); target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::BYTEMAP;" << std::endl; - dedent(); } - printIndent(); - oss << "_grid.sync();" << std::endl; } dedent(); printIndent(); @@ -713,15 +697,25 @@ void CodeGenGPUFusedKernel::visit(mir::AssignStmt::Ptr assign_stmt) { mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); genEdgeSetApplyExpr(esae, assign_stmt->lhs); } else { - printIndent(); - oss << "if (_thread_id == 0) " << std::endl; - indent(); - printIndent(); - assign_stmt->lhs->accept(this); - oss << " = "; - assign_stmt->expr->accept(this); - oss << ";" << std::endl; - dedent(); + if (mir::isa(assign_stmt->lhs) && is_hoisted_var(mir::to(assign_stmt->lhs)->var)) { + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + } else { + printIndent(); + oss << "if (_thread_id == 0) " << std::endl; + indent(); + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + assign_stmt->expr->accept(this); + oss << ";" << std::endl; + dedent(); + printIndent(); + oss << "_grid.sync();" << std::endl; + } } } @@ -924,13 +918,6 @@ void CodeGenGPU::visit(mir::ForStmt::Ptr for_stmt) { } void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { if (while_stmt->is_fused == true) { - /* - for (auto decl: while_stmt->hoisted_decls) { - printIndent(); - decl->type->accept(this); - oss << " " << decl->name << ";" << std::endl; - } - */ for (auto var: while_stmt->hoisted_vars) { bool to_copy = true; for (auto decl: while_stmt->hoisted_decls) { @@ -1009,6 +996,8 @@ void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { print_stmt->expr->accept(this); oss << ");" << std::endl; dedent(); + printIndent(); + oss << "_grid.sync();" << std::endl; } void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index d5fe6222..084d38ff 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -38,6 
+38,7 @@ static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { return curr_size; } static int32_t __device__ device_builtin_getVertexSetSize(VertexFrontier &frontier) { + this_grid().sync(); return frontier.d_num_elems_input[0]; } class AccessorSparse { @@ -147,17 +148,16 @@ static void swap_queues(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); } static void __device__ swap_queues_device(VertexFrontier &frontier) { - if (threadIdx.x + blockIdx.x * blockDim.x == 0) { - int32_t *temp = frontier.d_num_elems_input; - frontier.d_num_elems_input = frontier.d_num_elems_output; - frontier.d_num_elems_output = temp; - - temp = frontier.d_sparse_queue_input; - frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; - frontier.d_sparse_queue_output = temp; - + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) frontier.d_num_elems_output[0] = 0; - } + this_grid().sync(); } static void swap_bytemaps(VertexFrontier &frontier) { @@ -175,20 +175,18 @@ static void swap_bytemaps(VertexFrontier &frontier) { } static void __device__ swap_bytemaps_device(VertexFrontier &frontier) { - if (threadIdx.x + blockIdx.x * blockDim.x == 0) { - int32_t *temp = frontier.d_num_elems_input; - frontier.d_num_elems_input = frontier.d_num_elems_output; - frontier.d_num_elems_output = temp; - - unsigned char* temp2; - temp2 = frontier.d_byte_map_input; - frontier.d_byte_map_input = frontier.d_byte_map_output; - frontier.d_byte_map_output = temp2; - + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) frontier.d_num_elems_output[0] = 0; - } - this_grid().sync(); parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); + this_grid().sync(); } static void swap_bitmaps(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h index 36cf2146..9855a056 100644 --- a/src/runtime_lib/infra_gpu/vertex_representation.h +++ b/src/runtime_lib/infra_gpu/vertex_representation.h @@ -63,13 +63,11 @@ static void __device__ vertex_set_prepare_sparse_device(VertexFrontier &frontier generalized_prepare_from_to(frontier); this_grid().sync(); swap_queues_device(frontier); - this_grid().sync(); return; } else if (frontier.format_ready == VertexFrontier::BITMAP) { generalized_prepare_from_to(frontier); this_grid().sync(); swap_queues_device(frontier); - this_grid().sync(); return; } } @@ -91,7 +89,6 @@ static void __device__ vertex_set_prepare_boolmap_device(VertexFrontier &frontie generalized_prepare_from_to(frontier); this_grid().sync(); swap_bytemaps_device(frontier); - this_grid().sync(); return; } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { return; @@ -99,7 +96,6 @@ static void __device__ vertex_set_prepare_boolmap_device(VertexFrontier &frontie generalized_prepare_from_to(frontier); this_grid().sync(); 
swap_bytemaps_device(frontier); - this_grid().sync(); return; } } From 625ab3940676e1e57de367720ba78528603d56ba Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 15 Oct 2019 15:51:34 -0400 Subject: [PATCH 35/88] Working Kernel fusion for BFS --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 3 +- src/backend/codegen_gpu/codegen_gpu.cpp | 49 ++++++++++++---- src/runtime_lib/gpu_intrinsics.h | 1 + .../infra_gpu/gpu_priority_queue.h | 58 +++++++++---------- src/runtime_lib/infra_gpu/vertex_frontier.h | 32 +++++++++- .../infra_gpu/vertex_representation.h | 16 +++++ 6 files changed, 115 insertions(+), 44 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 3ee8d5af..443ae836 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -29,7 +29,6 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { void visit(mir::PullEdgeSetApplyExpr::Ptr); void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); - void genFuncDecl(mir::FuncDecl::Ptr); }; @@ -57,6 +56,7 @@ class CodeGenGPU : public mir::MIRVisitor{ private: void genIncludeStmts(void); void genEdgeSets(void); + void genFuncDecl(mir::FuncDecl::Ptr); void genPropertyArrayImplementationWithInitialization(mir::VarDecl::Ptr shared_ptr); @@ -159,6 +159,7 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { virtual void visit(mir::AssignStmt::Ptr) override; virtual void visit(mir::VarDecl::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; + virtual void visit(mir::HybridGPUStmt::Ptr) override; std::string var_name (std::string var) { //return current_kernel_name + "_" + var; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 8d8dadb4..445a21b8 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -33,6 +33,11 @@ int CodeGenGPU::genGPU() { } std::vector functions = mir_context_->getFunctionList(); + // Before we generate any functions or kernels, we generate the function declarations + for (auto function: functions) { + if (function->name != "main") + genFuncDecl(function); + } // Every operator requires a kernel to be generated // Create that first because all the actual functions will be calling these kernels @@ -175,24 +180,23 @@ void CodeGenGPUFusedKernel::visit(mir::StmtBlock::Ptr stmt_block) { stmt->accept(this); } } -void CodeGenGPUKernelEmitter::genFuncDecl(mir::FuncDecl::Ptr func_decl) { +void CodeGenGPU::genFuncDecl(mir::FuncDecl::Ptr func_decl) { if (func_decl->result.isInitialized()) { func_decl->result.getType()->accept(this); - assert(mir::isa(func_decl->result.getType())); - assert(mir::to(func_decl->result.getType())->type == mir::ScalarType::Type::BOOL); - oss << "bool"; } else { oss << "void"; } - oss << " " << "__device__" << " " << func_decl->name << "("; + + if (func_decl->function_context & mir::FuncDecl::function_context_type::CONTEXT_DEVICE) + oss << " " << "__device__" << " " << func_decl->name << "("; + else + oss << " " << func_decl->name << "("; + bool printDelimeter = false; for (auto arg: func_decl->args) { if (printDelimeter) oss << ", "; - assert(mir::isa(arg.getType()) || mir::isa(arg.getType())); - if (mir::isa(arg.getType())) - assert(mir::to(arg.getType())->type == mir::ScalarType::Type::INT); - oss << "int32_t"; + arg.getType()->accept(this); oss << " " << arg.getName(); printDelimeter = true; } @@ -200,7 +204,7 @@ void 
CodeGenGPUKernelEmitter::genFuncDecl(mir::FuncDecl::Ptr func_decl) { } void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { - + /* // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); genFuncDecl(input_function_decl); @@ -208,6 +212,7 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { mir::FuncDecl::Ptr to_function_decl = mir_context_->getFunction(apply_expr->to_func); genFuncDecl(to_function_decl); } + */ // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); @@ -258,9 +263,11 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { } void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { + /* // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); genFuncDecl(input_function_decl); + */ // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); @@ -508,7 +515,7 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: std::string to_func = esae->to_func; if (to_func != "") { printIndent(); - oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue<" << to_func << ">("; + oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_host<" << to_func << ">("; oss << esae->from_func << ");" << std::endl; } @@ -1144,7 +1151,25 @@ void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { printIndent(); - oss << "if (builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; + oss << "if (gpu_runtime::builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; + indent(); + stmt->stmt1->accept(this); + dedent(); + printIndent(); + oss << "} else {" << std::endl; + indent(); + stmt->stmt2->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; + } else { + assert(false && "Invalid criteria for Hybrid Statement\n"); + } +} +void CodeGenGPUFusedKernel::visit(mir::HybridGPUStmt::Ptr stmt) { + if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { + printIndent(); + oss << "if (gpu_runtime::device_builtin_getVertexSetSize(" << var_name(stmt->input_frontier_name) << ") < " << var_name(stmt->input_frontier_name) << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; indent(); stmt->stmt1->accept(this); dedent(); diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index f487814a..09377d80 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -26,5 +26,6 @@ static __device__ void device_deleteObject(T &t) { } static void * no_args[1]; + } #endif diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h 
b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 1ed2aa52..78fc72d2 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -7,35 +7,35 @@ namespace gpu_runtime { - template - class GPUPriorityQueue { - - public: - explicit GPUPriorityQueue(PriorityT_* priorities, PriorityT_ delta=1) - : priorities_(priorities), delta_(delta){ - } - - size_t get_current_priority(){ - return current_priority_; - } - - void update_current_priority(PriorityT_ priority_change_){ - - } - - bool finished() { - //TODO - return true; - } - - bool finishedNode(NodeID v){ - return priorities_[v]/delta_ < get_current_priority();; - } - - PriorityT_* priorities_; - PriorityT_ delta_; - PriorityT_ current_priority_; - }; +template + class GPUPriorityQueue { + + public: + explicit GPUPriorityQueue(PriorityT_* priorities, PriorityT_ delta=1) + : priorities_(priorities), delta_(delta){ + } + + size_t get_current_priority(){ + return current_priority_; + } + + void update_current_priority(PriorityT_ priority_change_){ + + } + + bool finished() { + //TODO + return true; + } + + bool finishedNode(NodeID v){ + return priorities_[v]/delta_ < get_current_priority();; + } + + PriorityT_* priorities_; + PriorityT_ delta_; + PriorityT_ current_priority_; + }; } diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 084d38ff..8b4254f1 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -203,6 +203,23 @@ static void swap_bitmaps(VertexFrontier &frontier) { cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); cudaCheckLastError(); } +static void __device__ swap_bitmaps_device(VertexFrontier &frontier) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); + + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + parallel_memset((unsigned char*)frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + this_grid().sync(); +} static void __device__ dedup_frontier_device(VertexFrontier &frontier) { for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { int32_t vid = frontier.d_sparse_queue_input[vidx]; @@ -224,19 +241,30 @@ bool __device__ true_function(int32_t _) { return true; } template -static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier frontier) { +static void __device__ vertex_set_create_reverse_sparse_queue(VertexFrontier &frontier) { for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < frontier.max_num_elems; node_id += blockDim.x * gridDim.x) { if ((to_func(node_id))) enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); } } +template +static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier &frontier) { + vertex_set_create_reverse_sparse_queue(frontier); +} template -static void vertex_set_create_reverse_sparse_queue(VertexFrontier &frontier) { +static void vertex_set_create_reverse_sparse_queue_host(VertexFrontier &frontier) { 
vertex_set_create_reverse_sparse_queue_kernel<<>>(frontier); swap_queues(frontier); } +template +static void __device__ vertex_set_create_reverse_sparse_queue_device(VertexFrontier &frontier) { + vertex_set_create_reverse_sparse_queue(frontier); + this_grid().sync(); + swap_queues_device(frontier); +} + } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h index 9855a056..576182b5 100644 --- a/src/runtime_lib/infra_gpu/vertex_representation.h +++ b/src/runtime_lib/infra_gpu/vertex_representation.h @@ -112,5 +112,21 @@ static void vertex_set_prepare_bitmap(VertexFrontier &frontier) { return; } } +static void __device__ vertex_set_prepare_bitmap_device(VertexFrontier &frontier) { + if (frontier.format_ready == VertexFrontier::SPARSE) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bitmaps_device(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BYTEMAP) { + generalized_prepare_from_to(frontier); + this_grid().sync(); + swap_bitmaps_device(frontier); + return; + } else if (frontier.format_ready == VertexFrontier::BITMAP) { + return; + } + +} } #endif From 4fac36fcf2d253383c3a4152b600e90cd6120321 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 16 Oct 2019 14:20:24 -0400 Subject: [PATCH 36/88] Changed the binary extension in the loader and the loader can directly load the binary file format --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 2 ++ src/backend/codegen_gpu/codegen_gpu.cpp | 14 ++++++++++++- src/runtime_lib/infra_gpu/graph.h | 21 ++++++++++++++++--- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 443ae836..5041746c 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -63,6 +63,7 @@ class CodeGenGPU : public mir::MIRVisitor{ void genPropertyArrayDecl(mir::VarDecl::Ptr); + void genScalarDecl(mir::VarDecl::Ptr); void genPropertyArrayAlloca(mir::VarDecl::Ptr); void genFusedWhileLoop(mir::WhileStmt::Ptr); @@ -104,6 +105,7 @@ class CodeGenGPU : public mir::MIRVisitor{ virtual void visit(mir::TensorArrayReadExpr::Ptr) override; virtual void visit(mir::IntLiteral::Ptr) override; + virtual void visit(mir::FloatLiteral::Ptr) override; virtual void visit(mir::BoolLiteral::Ptr) override; virtual void visit(mir::StringLiteral::Ptr) override; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 445a21b8..09b046ac 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -28,7 +28,8 @@ int CodeGenGPU::genGPU() { // This is some vertex data genPropertyArrayDecl(constant); } else { - assert(false && "Constant type not handled yet in GPU backend\n"); + // This is some scalar variable w or w/o initialization + genScalarDecl(constant); } } @@ -59,6 +60,14 @@ int CodeGenGPU::genGPU() { oss << std::endl; return 0; } + +void CodeGenGPU::genScalarDecl(mir::VarDecl::Ptr var_decl) { + var_decl->type->accept(this); + oss << " __device__ " << var_decl->name << "; " << std::endl; + + var_decl->type->accept(this); + oss << " __host_" << var_decl->name << ";" << std::endl; +} void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { mir::VectorType::Ptr vector_type = mir::to(constant->type); vector_type->vector_element_type->accept(this); @@ -773,6 +782,9 @@ void 
CodeGenGPUHost::visit(mir::TensorArrayReadExpr::Ptr expr) { void CodeGenGPU::visit(mir::IntLiteral::Ptr expr) { oss << expr->val; } +void CodeGenGPU::visit(mir::FloatLiteral::Ptr expr) { + oss << expr->val; +} void CodeGenGPU::visit(mir::StringLiteral::Ptr expr) { oss << "\""; for (auto ch : expr->val) diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index f5fbe316..dc7ee374 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -48,15 +48,30 @@ void static sort_with_degree(GraphT &graph) { assert(false && "Sort with degree not yet implemented\n"); return; } +static bool string_ends_with(const char* str, const char* sub_str) { + if (strlen(sub_str) > strlen(str)) + return false; + int32_t len1 = strlen(str); + int32_t len2 = strlen(sub_str); + if (strcmp(str + len1 - len2, sub_str) == 0) + return true; + return false; +} template static void load_graph(GraphT &graph, std::string filename, bool to_sort = false) { int flen = strlen(filename.c_str()); - const char* bin_extension = to_sort?".graphit.sbin":".graphit.bin"; - char bin_filename[100]; + const char* bin_extension = to_sort?".graphit_sbin":".graphit_bin"; + char bin_filename[1024]; strcpy(bin_filename, filename.c_str()); - strcat(bin_filename, bin_extension); + + if (string_ends_with(filename.c_str(), bin_extension) == false) + strcat(bin_filename, bin_extension); FILE *bin_file = fopen(bin_filename, "rb"); + if (!bin_file && string_ends_with(filename.c_str(), bin_extension)) { + std::cout << "Binary file not found" << std::endl; + exit(-1); + } if (bin_file) { CONSUME(fread(&graph.num_vertices, sizeof(int32_t), 1, bin_file)); CONSUME(fread(&graph.num_edges, sizeof(int32_t), 1, bin_file)); From f85d8a5ebf6f0a2263de0635dfacff0638a06578 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Wed, 16 Oct 2019 18:20:22 -0400 Subject: [PATCH 37/88] fixing a bug with copying window_upper --- test/gpu_tests/test_input/sssp_delta_stepping.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index f9794e85..0eddd9fb 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -8,6 +8,7 @@ #include #include +//#define DEBUG typedef struct { int32_t *SP; @@ -421,7 +422,8 @@ int main(int argc, char *argv[]) { gpu_runtime::vertex_set_prepare_sparse(frontier); - cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t*), 0); + cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); + gpu_runtime::cudaCheckLastError(); gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); // host_state.frontier1_size[0] = 0; @@ -493,8 +495,11 @@ int main(int argc, char *argv[]) { //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); iter_total += t; } - - //printf("Num iters = %d\n", iters); + + + #ifdef DEBUG + printf("Num iters = %d\n", iters); + #endif //printf("Time elapsed = %f\n", iter_total); total_time += iter_total; From 07bd866a8ff602dcd479f20bb39c53f2c9009d9f Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 17 Oct 2019 15:11:42 -0400 Subject: [PATCH 38/88] adding support to pass start vertex as a commandline argument, cleaning up the sssp delta_stepping code a bit --- test/gpu_tests/all_gpu_tests.py | 5 +- .../test_input/sssp_delta_stepping.cu | 50 ++++++++----------- 2 files 
changed, 25 insertions(+), 30 deletions(-) diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 29f601f7..5dfa697c 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -36,7 +36,8 @@ def get_command_output(self, command): def sssp_verified_test(self, input_file_name, use_delta=False): self.cpp_compile_test(input_file_name, []) if use_delta: - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 2 v > verifier_input ") + #start point 0, delta 10, verified + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > verifier_input ") else: self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") @@ -129,7 +130,7 @@ def test_sssp_lp_verified(self): self.sssp_verified_test("sssp_lp.cu") def test_sssp_delta_stepping(self): - self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "v"]) + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "0", "10", "v"]) def test_sssp_delta_stepping_verified(self): self.sssp_verified_test("sssp_delta_stepping.cu", True) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 0eddd9fb..3b335a67 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -47,6 +47,7 @@ int32_t *__device_SP; int32_t __device__ window_lower; int32_t __device__ window_upper; + void cudaCheckLastError(void) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) @@ -61,7 +62,7 @@ void cudaCheckLastError(void) { #define WARP_SIZE (32) #define STAGE_1_SIZE (8) -void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state, int start_v) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int total_work = graph.num_vertices; @@ -75,9 +76,9 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic } if (thread_id == 0) { //reset with the new data structure - SP[0] = 0; + SP[start_v] = 0; - device_state.SP[0] = 0; + device_state.SP[start_v] = 0; device_state.frontier1[graph.num_vertices] = 0; device_state.frontier1_size[0] = 1; device_state.frontier1_size[1] = 1; @@ -330,25 +331,8 @@ void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtim host_state.SP = new int[graph.num_vertices]; host_state.output_size = new int32_t[1]; host_state.new_window_start = new int32_t[1]; - - host_state.frontier1_size = new int32_t[1]; - host_state.frontier1 = new int32_t[graph.num_vertices]; - - - host_state.more_elems = new int32_t(); cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); - - cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices * 5); - cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_vertices ); - - cudaMalloc(&device_state.frontier1_size, 5*sizeof(int32_t)); - //cudaMalloc(&device_state.frontier2_size, sizeof(int32_t)); - cudaMalloc(&device_state.output_size, sizeof(int32_t)); - - - cudaMalloc(&device_state.worklist, sizeof(int32_t)); - cudaMalloc(&device_state.more_elems, sizeof(int32_t)); cudaMalloc(&device_state.new_window_start, sizeof(int32_t)); } 
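For orientation, the host driver loop that the following hunks converge to has roughly this shape. This is a minimal sketch assembled only from calls that appear in the surrounding hunks (gpu_runtime helpers from gpu_intrinsics.h, the window_upper device symbol, and the update_nodes_* kernels); the load-balance template arguments and kernel launch configurations are elided in the patch text, so the <<<NUM_BLOCKS, NUM_THREADS>>> configuration below is an assumption, not the committed code:

// Sketch only: frontier starts SPARSE, containing just start_vertex.
while (gpu_runtime::builtin_getVertexSetSize(frontier) != 0) {
	gpu_runtime::vertex_set_prepare_sparse(frontier);
	// updateEdge() filters on window_upper, so publish the current bound first.
	cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0);
	// Template arguments elided here; the operator enqueues into the byte map.
	gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier);
	// The edge updates wrote the next frontier into the byte map; make it the input.
	gpu_runtime::swap_bytemaps(frontier);
	frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP;
	if (gpu_runtime::builtin_getVertexSetSize(frontier) == 0) {
		// Current bucket drained: find the smallest remaining distance,
		// terminate if none is left, otherwise slide the window by delta.
		update_nodes_identify_min<<<NUM_BLOCKS, NUM_THREADS>>>(graph, device_state);
		// (copy new_window_start back, break on INT_MAX, update window bounds)
		// Rebuild a sparse frontier from vertices inside the new window.
		update_nodes_special<<<NUM_BLOCKS, NUM_THREADS>>>(graph, device_state, frontier);
		gpu_runtime::swap_queues(frontier);
		frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE;
	}
}
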
@@ -372,8 +356,9 @@ int main(int argc, char *argv[]) { cudaThreadSetCacheConfig(cudaFuncCachePreferShared); gpu_runtime::GraphT graph; gpu_runtime::load_graph(graph, argv[1], false); - int32_t delta = atoi(argv[2]); - + int32_t delta = atoi(argv[3]); + int32_t start_vertex = atoi(argv[2]); + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; @@ -388,9 +373,11 @@ int main(int argc, char *argv[]) { device_state.window_lower = 0; device_state.window_upper = delta; + + //this sets it to Sparse gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - gpu_runtime::builtin_addVertex(frontier, 0); + gpu_runtime::builtin_addVertex(frontier, start_vertex); cudaDeviceSynchronize(); @@ -400,16 +387,13 @@ int main(int argc, char *argv[]) { float iter_total = 0; startTimer(); - init_kernel<<>>(graph, device_state); + init_kernel<<>>(graph, device_state, start_vertex); int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); //printf("Init time = %f\n", t); iter_total+=t; - host_state.frontier1_size[0] = 1; - - //while(*host_state.frontier1_size) { while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ startTimer(); iters++; @@ -493,6 +477,12 @@ int main(int argc, char *argv[]) { t = stopTimer(); //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); + + + #ifdef DEBUG + //printf("Iter %d output_size = %d \n", iters, gpu_runtime::builtin_getVertexSetSize(frontier)); + #endif + iter_total += t; } @@ -506,11 +496,15 @@ int main(int argc, char *argv[]) { } //printf("Total time = %f\n", total_time); if (argc > 3) - if (argv[3][0] == 'v'){ + if (argv[4][0] == 'v'){ //FILE *output = fopen("output.txt", "w"); cudaMemcpy(host_state.SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); for (int i = 0; i < graph.num_vertices; i++) + #ifdef DEBUG + printf("%d, %d\n", i, host_state.SP[i]); + #else printf("%d\n", host_state.SP[i]); + #endif }else if (argv[2][0] == 'c'){ /* for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) From d6f000013ccad26061ddccdcbe5dc0606f724bb2 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 17 Oct 2019 15:45:25 -0400 Subject: [PATCH 39/88] fixing a bug caused by previous cleanup --- .../test_input/sssp_delta_stepping.cu | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 3b335a67..cd229202 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -71,7 +71,7 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic int id = num_threads * i + thread_id; if (id < total_work) { device_state.SP[id] = INT_MAX; - device_state.frontier2[id] = 0; + //device_state.frontier2[id] = 0; } } if (thread_id == 0) { @@ -79,12 +79,12 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic SP[start_v] = 0; device_state.SP[start_v] = 0; - device_state.frontier1[graph.num_vertices] = 0; - device_state.frontier1_size[0] = 1; - device_state.frontier1_size[1] = 1; - device_state.frontier1_size[2] = 0; - device_state.frontier1_size[3] = 0; - device_state.frontier1_size[4] = 0; + // device_state.frontier1[graph.num_vertices] = 0; + // 
device_state.frontier1_size[0] = 1; + // device_state.frontier1_size[1] = 1; + // device_state.frontier1_size[2] = 0; + // device_state.frontier1_size[3] = 0; + // device_state.frontier1_size[4] = 0; } } __device__ inline int warp_bcast(int v, int leader) { return __shfl_sync((uint32_t)-1, v, leader); } @@ -358,6 +358,8 @@ int main(int argc, char *argv[]) { gpu_runtime::load_graph(graph, argv[1], false); int32_t delta = atoi(argv[3]); int32_t start_vertex = atoi(argv[2]); + + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); @@ -368,6 +370,16 @@ int main(int argc, char *argv[]) { algo_state host_state, device_state; allocate_state(host_state, device_state, graph); + //host_state.frontier1_size = new int32_t[1]; + //host_state.frontier1 = new int32_t[graph.num_vertices]; + //host_state.more_elems = new int32_t(); + + //cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices*5); + //cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_vertices); + //cudaMalloc(&device_state.frontier1_size, 5*sizeof(int32_t)); + + + host_state.window_lower = 0; host_state.window_upper = delta; device_state.window_lower = 0; @@ -387,7 +399,9 @@ int main(int argc, char *argv[]) { float iter_total = 0; startTimer(); - init_kernel<<>>(graph, device_state, start_vertex); + init_kernel<<>>(graph, device_state, start_vertex); + gpu_runtime::cudaCheckLastError(); + int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); From 8ee221297f29ea05f9912093fbab570e45e785b0 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 17 Oct 2019 16:03:28 -0400 Subject: [PATCH 40/88] further cleaning up the gpu sssp delta stepping code to get ready for refactoring and switching to more load balance schemes --- .../test_input/sssp_delta_stepping.cu | 301 +----------------- 1 file changed, 6 insertions(+), 295 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index cd229202..0b0fe441 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -12,31 +12,13 @@ typedef struct { int32_t *SP; - - int32_t *frontier1; - - - char *frontier2; - - int32_t *frontier1_size; - int32_t *frontier2_size; - - int32_t *output_size; - int32_t num_blocks; - int32_t *node_borders; int32_t *edge_borders; - - int32_t *worklist; int32_t *old_indices; - int32_t window_lower; - int32_t window_upper; - - int32_t *more_elems; - + int32_t window_upper; int32_t *new_window_start; }algo_state; @@ -48,13 +30,6 @@ int32_t __device__ window_lower; int32_t __device__ window_upper; -void cudaCheckLastError(void) { - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - printf("Error: %s\n", cudaGetErrorString(err)); -} - - #define VIRTUAL_WARP_SIZE (32) #define NUM_THREADS (1024) #define NUM_BLOCKS (80) @@ -70,40 +45,15 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic for (int i = 0; i < work_per_thread; i++) { int id = num_threads * i + thread_id; if (id < total_work) { - device_state.SP[id] = INT_MAX; - //device_state.frontier2[id] = 0; + device_state.SP[id] = INT_MAX; } } if (thread_id == 0) { //reset with the new data structure SP[start_v] = 0; - device_state.SP[start_v] = 0; - // device_state.frontier1[graph.num_vertices] = 0; - // device_state.frontier1_size[0] = 1; - // device_state.frontier1_size[1] = 1; - // device_state.frontier1_size[2] = 0; - // 
device_state.frontier1_size[3] = 0; - // device_state.frontier1_size[4] = 0; } } -__device__ inline int warp_bcast(int v, int leader) { return __shfl_sync((uint32_t)-1, v, leader); } -__device__ inline int atomicAggInc(int *ctr) { - int32_t lane_id = threadIdx.x % 32; - - int mask = __activemask(); - int leader = __ffs(mask) - 1; - int res; - if(lane_id == leader) - res = atomicAdd(ctr, __popc(mask)); - res = warp_bcast(res, leader); - - return (res + __popc(mask & ((1 << lane_id) - 1))); -} -__device__ void enqueueVertex(int32_t v, algo_state &device_state, int32_t new_dist) { - if (new_dist < device_state.window_upper) - device_state.frontier2[v] = 1 ; -} bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { bool output2; @@ -111,10 +61,7 @@ bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; - //do not output this if it is not within the current window if (SP[dst] >= window_upper) return false; - //output2 = true; - //} return output2; } @@ -129,167 +76,6 @@ void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, i } } - - -void __global__ update_edges (gpu_runtime::GraphT graph, algo_state device_state, int32_t curr_iter) { - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - //int num_threads = blockDim.x * gridDim.x; - int lane_id = thread_id % 32; - - __shared__ int32_t stage2_queue[CTA_SIZE]; - __shared__ int32_t stage3_queue[CTA_SIZE]; - __shared__ int32_t stage_queue_sizes[3]; - if (threadIdx.x == 0) { - stage_queue_sizes[0] = 0; - stage_queue_sizes[1] = 0; - stage_queue_sizes[2] = 0; - } - __syncthreads(); - - - __shared__ int32_t stage2_offset[CTA_SIZE]; - __shared__ int32_t stage3_offset[CTA_SIZE]; - - __shared__ int32_t stage2_size[CTA_SIZE]; - __shared__ int32_t stage3_size[CTA_SIZE]; - - - int32_t total_vertices = device_state.frontier1_size[0]; - - int32_t my_vertex_idx = thread_id / (STAGE_1_SIZE); - int32_t d; - int32_t s1_offset; - int32_t my_vertex; - int32_t row_offset; - if (my_vertex_idx < total_vertices) { - //my_vertex = device_state.frontier1[my_vertex_idx]; - if (my_vertex_idx < device_state.frontier1_size[1]) { - my_vertex = device_state.frontier1[graph.num_vertices + my_vertex_idx]; - } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2]) { - my_vertex = device_state.frontier1[graph.num_vertices * 2 + my_vertex_idx - device_state.frontier1_size[1]]; - } else if (my_vertex_idx < device_state.frontier1_size[1] + device_state.frontier1_size[2] + device_state.frontier1_size[3]) { - my_vertex = device_state.frontier1[graph.num_vertices * 3 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2]]; - } else { - my_vertex = device_state.frontier1[graph.num_vertices * 4 + my_vertex_idx - device_state.frontier1_size[1] - device_state.frontier1_size[2] - device_state.frontier1_size[3]]; - } - // Step 1 segreggate vertices into shared buffers - if (thread_id % (STAGE_1_SIZE) == 0 ) { - d = graph.d_get_degree(my_vertex); - row_offset = graph.d_src_offsets[my_vertex]; - int32_t s3_size = d/CTA_SIZE; - d = d - s3_size * CTA_SIZE; - if (s3_size) { - int32_t pos = atomicAggInc(&stage_queue_sizes[2]); - stage3_queue[pos] = my_vertex; - stage3_size[pos] = s3_size * CTA_SIZE; - // stage3_offset[pos] = 0; // Not required because always 0 - stage3_offset[pos] = row_offset; - } - - int32_t s2_size = d/WARP_SIZE; - d = d - s2_size * WARP_SIZE; - - if (s2_size) { - int32_t 
pos = atomicAggInc(&stage_queue_sizes[1]); - stage2_queue[pos] = my_vertex; - stage2_offset[pos] = s3_size * CTA_SIZE + row_offset; - stage2_size[pos] = s2_size * WARP_SIZE; - } - s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + row_offset; - } - }else - my_vertex = -1; - - __syncthreads(); - - d = __shfl_sync((uint32_t)-1, d, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - int32_t src_distance; - if (my_vertex_idx < total_vertices) { - // STAGE 1 - //my_vertex = device_state.frontier1[my_vertex_idx]; - - //src_distance = device_state.SP[my_vertex]; - src_distance = SP[my_vertex]; - - for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < d + s1_offset; neigh_id += STAGE_1_SIZE) { - // DO ACTUAL SSSP - int32_t dst = graph.d_edge_dst[neigh_id]; - int32_t new_dst = graph.d_edge_weight[neigh_id] + src_distance; - - //if (new_dst < device_state.SP[dst]) { - if (new_dst < SP[dst]) { - //atomicMin(&device_state.SP[dst], new_dst); - atomicMin(&SP[dst], new_dst); - enqueueVertex(dst, device_state, new_dst); - } - } - } - // STAGE 2 -- stage 2 is dynamically balanced - __syncwarp(); // SYNC the warp here because ... - while (1) { - int32_t to_process; - if (lane_id == 0) { - to_process = atomicSub(&stage_queue_sizes[1], 1) - 1; - } - to_process = __shfl_sync((uint32_t)-1, to_process, 0, 32); - if (to_process < 0) - break; - my_vertex = stage2_queue[to_process]; - d = stage2_size[to_process]; - int32_t s2_offset = stage2_offset[to_process]; - - //src_distance = device_state.SP[my_vertex]; - src_distance = SP[my_vertex]; - - for (int32_t neigh_id = s2_offset + (lane_id); neigh_id < d + s2_offset; neigh_id += WARP_SIZE) { - // DO ACTUAL SSSP - int dst = graph.d_edge_dst[neigh_id]; - int new_dst = graph.d_edge_weight[neigh_id] + src_distance; - //if (new_dst < device_state.SP[dst]) { - if (new_dst < SP[dst]) { - atomicMin(&SP[dst], new_dst); - enqueueVertex(dst, device_state, new_dst); - } - } - } - - // STAGE 3 -- all threads have to do all, no need for LB - for (int32_t wid = 0; wid < stage_queue_sizes[2]; wid ++) { - my_vertex = stage3_queue[wid]; - d = stage3_size[wid]; - int32_t s3_offset = stage3_offset[wid]; - src_distance = SP[my_vertex]; - - for (int32_t neigh_id = s3_offset + (threadIdx.x); neigh_id < d + s3_offset; neigh_id += CTA_SIZE) { - // DO ACTUAL SSSP - int dst = graph.d_edge_dst[neigh_id]; - int new_dst = graph.d_edge_weight[neigh_id] + src_distance; - if (new_dst < SP[dst]) { - atomicMin(&SP[dst], new_dst); - enqueueVertex(dst, device_state, new_dst); - } - } - } -} -void __global__ update_nodes (gpu_runtime::GraphT graph, algo_state device_state) { - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - int warp_id = thread_id / 32; - int total_work = graph.num_vertices; - int work_per_thread = (total_work + num_threads - 1)/num_threads; - for (int i = 0; i < work_per_thread; i++) { - int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_vertices) { - if (device_state.frontier2[node_id]) { - device_state.frontier2[node_id] = 0; - int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); - device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; - } - } - } -} - void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, algo_state device_state) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; @@ -321,8 
+107,6 @@ void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_st if (node_id < graph.num_vertices) { if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); - //int pos = atomicAggInc(device_state.frontier1_size + 1 + (warp_id % 4)); - //device_state.frontier1[pos + (warp_id % 4 + 1) * graph.num_vertices] = node_id; } } } @@ -341,10 +125,6 @@ void swap_pointers(int32_t **a, int32_t **b) { *a = *b; *b = t; } -void swap_queues(algo_state &device_state) { - //swap_pointers(&device_state.frontier1, &device_state.frontier2); - //swap_pointers(&device_state.frontier1_size, &device_state.frontier2_size); -} void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { SP[v] = 2147483647; @@ -358,45 +138,31 @@ int main(int argc, char *argv[]) { gpu_runtime::load_graph(graph, argv[1], false); int32_t delta = atoi(argv[3]); int32_t start_vertex = atoi(argv[2]); - - cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); - algo_state host_state, device_state; allocate_state(host_state, device_state, graph); - - //host_state.frontier1_size = new int32_t[1]; - //host_state.frontier1 = new int32_t[graph.num_vertices]; - //host_state.more_elems = new int32_t(); - - //cudaMalloc(&device_state.frontier1, sizeof(int32_t)*graph.num_vertices*5); - //cudaMalloc(&device_state.frontier2, sizeof(char)*graph.num_vertices); - //cudaMalloc(&device_state.frontier1_size, 5*sizeof(int32_t)); - - host_state.window_lower = 0; host_state.window_upper = delta; device_state.window_lower = 0; device_state.window_upper = delta; - + - //this sets it to Sparse - gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - gpu_runtime::builtin_addVertex(frontier, start_vertex); cudaDeviceSynchronize(); - float total_time = 0; for (int outer = 0; outer < ITER_COUNT; outer++) { float iter_total = 0; + //this sets it to Sparse + gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + gpu_runtime::builtin_addVertex(frontier, start_vertex); startTimer(); init_kernel<<>>(graph, device_state, start_vertex); @@ -411,44 +177,15 @@ int main(int argc, char *argv[]) { while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ startTimer(); iters++; - //int num_threads = *host_state.frontier1_size *(STAGE_1_SIZE); - //int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - - //update_edges<<>>(graph, device_state, iters); - //gpu_runtime::vertex_based_load_balance_host(edges, frontier, frontier); - gpu_runtime::vertex_set_prepare_sparse(frontier); - - cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); gpu_runtime::cudaCheckLastError(); gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); - // host_state.frontier1_size[0] = 0; - // host_state.frontier1_size[1] = 0; - // host_state.frontier1_size[2] = 0; - // host_state.frontier1_size[3] = 0; - // host_state.frontier1_size[4] = 0; - // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); - - //update_nodes<<>>(graph, device_state); - - gpu_runtime::swap_bytemaps(frontier); // set 
the input to the prepare function frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; - - - // cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); - // host_state.frontier1_size[0] = host_state.frontier1_size[1]; - // host_state.frontier1_size[0] += host_state.frontier1_size[2]; - // host_state.frontier1_size[0] += host_state.frontier1_size[3]; - // host_state.frontier1_size[0] += host_state.frontier1_size[4]; - // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); - - - //if (host_state.frontier1_size[0] == 0) { if (gpu_runtime::builtin_getVertexSetSize(frontier) == (0)) { host_state.new_window_start[0] = INT_MAX; cudaMemcpy(device_state.new_window_start, host_state.new_window_start, sizeof(int32_t), cudaMemcpyHostToDevice); @@ -465,33 +202,14 @@ int main(int argc, char *argv[]) { //if it is not a pointer, then you can set by value directly device_state.window_lower = host_state.new_window_start[0]; device_state.window_upper = host_state.new_window_start[0] + delta; - - // host_state.frontier1_size[0] = 0; - - // host_state.frontier1_size[0] = 0; - // host_state.frontier1_size[1] = 0; - // host_state.frontier1_size[2] = 0; - // host_state.frontier1_size[3] = 0; - // host_state.frontier1_size[4] = 0; - // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, 5*sizeof(int32_t), cudaMemcpyHostToDevice); - update_nodes_special<<>>( graph, device_state, frontier); gpu_runtime::swap_queues(frontier); frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; - - // cudaMemcpy(host_state.frontier1_size, device_state.frontier1_size, sizeof(int32_t)*5, cudaMemcpyDeviceToHost); - // host_state.frontier1_size[0] = host_state.frontier1_size[1]; - // host_state.frontier1_size[0] += host_state.frontier1_size[2]; - // host_state.frontier1_size[0] += host_state.frontier1_size[3]; - // host_state.frontier1_size[0] += host_state.frontier1_size[4]; - // cudaMemcpy(device_state.frontier1_size, host_state.frontier1_size, sizeof(int32_t), cudaMemcpyHostToDevice); } t = stopTimer(); - //printf("Iter %d time = %f, output_size = %d <%d, %d>\n", iters, t, *host_state.frontier1_size, num_cta, CTA_SIZE); - #ifdef DEBUG //printf("Iter %d output_size = %d \n", iters, gpu_runtime::builtin_getVertexSetSize(frontier)); @@ -519,13 +237,6 @@ int main(int argc, char *argv[]) { #else printf("%d\n", host_state.SP[i]); #endif - }else if (argv[2][0] == 'c'){ - /* - for (int i = 0; i < NUM_BLOCKS * NUM_THREADS; i++) - printf("%d: %d\n", i, counters[i]); - */ } - return 0; - } From 420f56d8d09bd942f120ccc0a8363dd7a1fdec92 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 17 Oct 2019 17:59:06 -0400 Subject: [PATCH 41/88] Generating working code for PageRank --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 2 +- include/graphit/midend/mir.h | 1 + include/graphit/midend/mir_context.h | 11 + include/graphit/midend/vector_op_lower.h | 4 +- src/backend/codegen_gpu/codegen_gpu.cpp | 208 ++++++++++++------ src/midend/apply_expr_lower.cpp | 2 + src/midend/mir_lower.cpp | 2 +- src/midend/vector_op_lower.cpp | 199 +++++++++-------- src/runtime_lib/infra_gpu/graph.h | 26 ++- src/runtime_lib/infra_gpu/load_balance.h | 65 +++++- src/runtime_lib/infra_gpu/support.h | 6 + src/runtime_lib/infra_gpu/vertex_frontier.h | 3 + 12 files changed, 347 insertions(+), 182 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h 
index 5041746c..d70aa29c 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -144,7 +144,7 @@ class CodeGenGPUHost: public CodeGenGPU { virtual void visit(mir::Call::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; - + virtual void visit(mir::VarExpr::Ptr) override; void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 03d88290..ce5089d0 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -853,6 +853,7 @@ namespace graphit { std::string kernel_function; fir::gpu_schedule::SimpleGPUSchedule applied_schedule; + bool requires_output = false; protected: virtual void copy(MIRNode::Ptr); diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 4fb571ff..7d80ccad 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -176,6 +176,17 @@ namespace graphit { } return false; } + bool isLoweredConst(std::string var_name) { + for (auto var: lowered_constants_) { + if (var->name == var_name) + return true; + } + for (auto var: const_edge_sets_) { + if (var->name == var_name) + return true; + } + return false; + } void addConstVertexSet(mir::VarDecl::Ptr vertexset) { const_vertex_sets_.push_back(vertexset); diff --git a/include/graphit/midend/vector_op_lower.h b/include/graphit/midend/vector_op_lower.h index bce9ff16..6d2f69cb 100644 --- a/include/graphit/midend/vector_op_lower.h +++ b/include/graphit/midend/vector_op_lower.h @@ -6,6 +6,7 @@ #define GRAPHIT_VECTOR_OP_LOWER_H #include +#include namespace graphit { /** @@ -15,7 +16,7 @@ namespace graphit { */ class GlobalFieldVectorLower { public: - GlobalFieldVectorLower(MIRContext *mir_context) : mir_context_(mir_context){ + GlobalFieldVectorLower(MIRContext *mir_context, Schedule *schedule) : mir_context_(mir_context), schedule_(schedule) { } @@ -27,6 +28,7 @@ namespace graphit { private: MIRContext *mir_context_; + Schedule *schedule_ = nullptr; }; } diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 09b046ac..edcbbf4d 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -88,13 +88,19 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { auto size_expr = mir_context_->getElementCount(vector_type->element_type); assert(size_expr != nullptr); - - printIndent(); - oss << "cudaMalloc(&__device_" << var_decl->name << ", "; - size_expr->accept(this); - oss << " * sizeof("; - vector_type->vector_element_type->accept(this); - oss << "));" << std::endl; + if (var_decl->initVal != nullptr && mir::isa(var_decl->initVal)) { + printIndent(); + oss << "__device_" << var_decl->name << " = "; + var_decl->initVal->accept(this); + oss << ";" << std::endl; + } else { + printIndent(); + oss << "cudaMalloc(&__device_" << var_decl->name << ", "; + size_expr->accept(this); + oss << " * sizeof("; + vector_type->vector_element_type->accept(this); + oss << "));" << std::endl; + } printIndent(); oss << "cudaMemcpyToSymbol("; @@ -240,26 +246,38 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { dedent(); } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); - if (input_function->args.size() == 3) { + if (apply_expr->requires_output) { + if (input_function->args.size() == 3) { + printIndent(); + oss << 
"EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; + } else { + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(src, dst)) {" << std::endl; + } + indent(); printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; + dedent(); printIndent(); - oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; + oss << "}" << std::endl; } else { - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(src, dst)) {" << std::endl; + if (input_function->args.size() == 3) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function_name << "(src, dst, weight);" << std::endl; + } else { + printIndent(); + oss << apply_expr->input_function_name << "(src, dst);" << std::endl; + } } - indent(); - printIndent(); - if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; @@ -280,10 +298,6 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); - std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; - if (apply_expr->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { - load_balance_function = "gpu_runtime::TWCE_load_balance"; - } oss << "template " << std::endl; oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, 
gpu_runtime::VertexFrontier output_frontier) {" << std::endl; @@ -309,27 +323,39 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); - if (input_function->args.size() == 3) { + if (apply_expr->requires_output) { + if (input_function->args.size() == 3) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; + } else { + printIndent(); + oss << "if (" << apply_expr->input_function_name << "(dst, src)) {" << std::endl; + } + + indent(); printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, src);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, src);" << std::endl; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, src);" << std::endl; + dedent(); printIndent(); - oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; + oss << "}" << std::endl; } else { - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(dst, src)) {" << std::endl; + if (input_function->args.size() == 3) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function_name << "(dst, src, weight);" << std::endl; + } else { + printIndent(); + oss << apply_expr->input_function_name << "(dst, src);" << std::endl; + } } - - indent(); - printIndent(); - if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, src);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, src);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, src);" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; @@ -352,7 +378,9 @@ void CodeGenGPU::genEdgeSets(void) { for (auto edgeset: mir_context_->getEdgeSets()) { auto edge_set_type = mir::to(edgeset->type); edge_set_type->accept(this); - oss << " " << edgeset->name << ";" << std::endl; + oss << " __device__ " << edgeset->name << ";" << std::endl; + edge_set_type->accept(this); + oss << " " << "__host_" << 
edgeset->name << ";" << std::endl; } } @@ -434,10 +462,14 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { printIndent(); oss << "gpu_runtime::load_graph("; - oss << var_name << ", "; + oss << "__host_" << var_name << ", "; edge_set_load_expr->file_name->accept(this); oss << ", false);" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << ", &__host_" << var_name << ", sizeof(__host_" << var_name << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } for (auto constant: mir_context_->getLoweredConstants()) { if (mir::isa(constant->type)) { @@ -446,9 +478,13 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { } else { if (constant->initVal != nullptr) { printIndent(); - oss << constant->name << " = "; + oss << "__host_" << constant->name << " = "; constant->initVal->accept(this); oss << ";" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol(" << constant->name << ", &__host_" << constant->name << ", sizeof("; + constant->type->accept(this); + oss << "), 0, cudaMemcpyHostToDevice);" << std::endl; } } } @@ -478,16 +514,29 @@ void CodeGenGPU::visit(mir::ElementType::Ptr element_type) { oss << "int32_t"; } void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { - printIndent(); - expr_stmt->expr->accept(this); - oss << ";" << std::endl; + if (mir::isa(expr_stmt->expr)) { + genEdgeSetApplyExpr(mir::to(expr_stmt->expr), nullptr); + } else { + printIndent(); + expr_stmt->expr->accept(this); + oss << ";" << std::endl; + } } + void CodeGenGPU::visit(mir::VarExpr::Ptr var_expr) { if (is_hoisted_var(var_expr->var)) { oss << "__local_" << var_expr->var.getName(); return; - } - oss << var_expr->var.getName(); + } else + oss << var_expr->var.getName(); +} +void CodeGenGPUHost::visit(mir::VarExpr::Ptr var_expr) { + if (mir_context_->isLoweredConst(var_expr->var.getName())) { + oss << "__host_" << var_expr->var.getName(); + return; + } else + oss << var_expr->var.getName(); + } void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { if (target != nullptr && esae->from_func == "") { @@ -495,31 +544,37 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: } // We will assume that the output frontier can reuse the input frontier. 
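// For reference: with a push traversal scheduled for TWCE load balancing and fused sparse frontier
// creation, the host-side code this function emits is expected to look roughly like the sketch below.
// The names "edges", "frontier", and "gpu_operator_body_0" are illustrative stand-ins, not literal
// output of this pass; the actual operator body name comes from the kernel emitter's unique name counter.
//	gpu_runtime::vertex_set_prepare_sparse(frontier);
//	gpu_runtime::TWCE_load_balance_host<int32_t, gpu_operator_body_0, gpu_runtime::AccessorSparse, gpu_runtime::true_function>(edges, frontier, frontier);
//	gpu_runtime::swap_queues(frontier);
//	frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE;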
// TOOD: Add liveness analysis for this - printIndent(); + printIndent(); oss << "{" << std::endl; indent(); std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { load_balance_function = "gpu_runtime::TWCE_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { + load_balance_function = "gpu_runtime::edge_only_load_balance"; } if (mir::isa(esae)) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_sparse("; - oss << esae->from_func; - oss << ");" << std::endl; - } else if (mir::isa(esae)) { - if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + if (esae->from_func != "") { printIndent(); - oss << "gpu_runtime::vertex_set_prepare_boolmap("; - oss << esae->from_func; - oss << ");" << std::endl; - } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { - printIndent(); - oss << "gpu_runtime::vertex_set_prepare_bitmap("; + oss << "gpu_runtime::vertex_set_prepare_sparse("; oss << esae->from_func; oss << ");" << std::endl; } + } else if (mir::isa(esae)) { + if (esae->from_func != "") { + if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_boolmap("; + oss << esae->from_func; + oss << ");" << std::endl; + } else if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "gpu_runtime::vertex_set_prepare_bitmap("; + oss << esae->from_func; + oss << ");" << std::endl; + } + } std::string to_func = esae->to_func; if (to_func != "") { @@ -546,6 +601,8 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: target_type->weight_type->accept(this); std::string accessor_type = "gpu_runtime::AccessorSparse"; + if (esae->from_func == "") + accessor_type = "gpu_runtime::AccessorAll"; if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") accessor_type = "gpu_runtime::AccessorAll"; std::string src_filter = "gpu_runtime::true_function"; @@ -554,7 +611,14 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; esae->target->accept(this); - oss << ", " << esae->from_func << ", "; + oss << ", "; + if (esae->from_func != "") + oss << esae->from_func; + else { + esae->target->accept(this); + oss << ".getFullFrontier()"; + } + oss << ", "; if (target != nullptr) target->accept(this); else @@ -573,7 +637,6 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: printIndent(); target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; - } else if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { printIndent(); oss << "gpu_runtime::swap_bitmaps("; @@ -607,6 +670,8 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; if (esae->applied_schedule.load_balancing == 
fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { load_balance_function = "gpu_runtime::TWCE_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { + load_balance_function = "gpu_runtime::edge_only_load_balance"; } if (mir::isa(esae)) { printIndent(); @@ -697,7 +762,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, } void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { if (mir::isa(assign_stmt->expr)) { - mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); genEdgeSetApplyExpr(esae, assign_stmt->lhs); } else { printIndent(); @@ -708,6 +773,7 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { } } + void CodeGenGPUFusedKernel::visit(mir::AssignStmt::Ptr assign_stmt) { if (mir::isa(assign_stmt->expr)) { mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); @@ -771,8 +837,6 @@ void CodeGenGPU::visit(mir::TensorArrayReadExpr::Ptr expr) { } void CodeGenGPUHost::visit(mir::TensorArrayReadExpr::Ptr expr) { mir::VarExpr::Ptr var_expr = mir::to(expr->target); - if (mir_context_->isLoweredConstTensor(var_expr->var.getName())) - oss << "__host_"; expr->target->accept(this); oss << "["; expr->index->accept(this); @@ -867,7 +931,7 @@ void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { oss << reduce_stmt->tracking_var_name_ << " = true;" << std::endl; } printIndent(); - oss << "writeAdd(&"; + oss << "gpu_runtime::writeAdd(&"; reduce_stmt->lhs->accept(this); oss << ", "; reduce_stmt->expr->accept(this); @@ -1108,7 +1172,7 @@ void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { assert(associated_edge_set != nullptr); oss << "("; //associated_element_type_size->accept(this); - oss << associated_edge_set->name << ".getFullFrontier()"; + oss << "__host_" << associated_edge_set->name << ".getFullFrontier()"; oss << ")"; } else { oss << "("; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 6ca6d315..22984dd6 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -178,6 +178,8 @@ namespace graphit { if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { // Always parallelize all operators for GPU schedules edgeset_apply->is_parallel = true; + if (edgeset_apply->tracking_field != "") + edgeset_apply->requires_output = true; // Check if there is a GPU schedule attached to this statement - auto current_scope_name = label_scope_.getCurrentScope(); auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index c988a57f..b5b5b857 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -23,7 +23,7 @@ namespace graphit { void MIRLower::lower(MIRContext* mir_context, Schedule* schedule){ //lower global vector assignment to vector operations - GlobalFieldVectorLower(mir_context).lower(); + GlobalFieldVectorLower(mir_context, schedule).lower(); //lower global edgeset assignment (from loading) // needed for reading commandline arguments in the main function diff --git a/src/midend/vector_op_lower.cpp b/src/midend/vector_op_lower.cpp index a425b498..6b7d891c 100644 --- a/src/midend/vector_op_lower.cpp +++ b/src/midend/vector_op_lower.cpp @@ -28,103 +28,108 @@ namespace graphit { // do the lowering if the right handside is a call stmt (may be add if the right hand side is part of a struct) if 
(mir::isa(var_decl->initVal)){ - auto orig_init_val = var_decl->initVal; - mir::VectorType::Ptr vector_type = std::dynamic_pointer_cast(var_decl->type); - - if (mir::isa(vector_type->vector_element_type)){ - mir::ScalarType::Ptr element_type = mir::to( - vector_type->vector_element_type); - //reset the initval to something default 0 for integer and float - if (element_type->type == mir::ScalarType::Type::INT){ - //initial value should be a int - auto zero = std::make_shared(); - zero->val = 0; - var_decl->initVal = zero; - } - else if (element_type->type == mir::ScalarType::Type::FLOAT){ - //initial value should be a float - - } - - //insert another const var decl as the temporary holder for the function - //this vector is always going to be assigned a value (pointer) returned from the function call and hence does not need allocation - auto tmp_var_decl = std::make_shared(); - tmp_var_decl->needs_allocation = false; - tmp_var_decl->type = var_decl->type; - tmp_var_decl->initVal = orig_init_val; - tmp_var_decl->name = "generated_tmp_vector_" + mir_context_->getUniqueNameCounterString(); - tmp_var_decl->modifier = var_decl->modifier; - mir_context_->insertNewConstVectorDeclEnd(tmp_var_decl); - - //create a new apply function decl that copies over the vector - if (mir_context_->isVertexElementType(vector_type->element_type->ident)){ - //a vertexset apply function if the element is a vertexset - mir::FuncDecl::Ptr copy_over_apply_func = std::make_shared(); - // create a utility function for creating new vertexset apply - // set up a name - copy_over_apply_func->name = "generated_vector_op_apply_func_" - + mir_context_->getUniqueNameCounterString(); - auto arg_var_type = vector_type->element_type; - mir::Var arg_var = mir::Var("v", arg_var_type); - std::vector arg_var_list = std::vector(); - arg_var_list.push_back(arg_var); - copy_over_apply_func->args = arg_var_list; - - auto mir_stmt_body = std::make_shared(); - auto assign_stmt = std::make_shared(); - - auto lhs = std::make_shared( - var_decl->name, "v", - var_decl->type, - vector_type->element_type - ); - - auto rhs = std::make_shared( - tmp_var_decl->name, "v", - tmp_var_decl->type, - vector_type->element_type - ); - - assign_stmt->lhs = lhs; - assign_stmt->expr = rhs; - mir_stmt_body->insertStmtEnd(assign_stmt); - copy_over_apply_func->body = mir_stmt_body; - //insert the utility function back into function list - mir_context_->insertFuncDeclFront(copy_over_apply_func); - - - // Lastly, insert a vertexset apply expression at the beginning of main - mir::VarDecl::Ptr global_vertex_set_var_decl = mir_context_->getGlobalConstVertexSet(); - mir::VertexSetApplyExpr::Ptr vertex_set_apply_expr = - std::make_shared(global_vertex_set_var_decl->name, - global_vertex_set_var_decl->type, - copy_over_apply_func->name); - mir::ExprStmt::Ptr apply_stmt = std::make_shared(); - apply_stmt->expr = vertex_set_apply_expr; - - //No longer directly add to the main function block - //mir::FuncDecl::Ptr main_func_decl = mir_context_->getMainFuncDecl(); - //main_func_decl->body->insertStmtFront(apply_stmt); - - - //puts the current vector initilization block into the back of the initialization block - //which would be later put into main function declaration - //vector_initialization_block->insertStmtEnd(apply_stmt); - - - auto tmp_var_assign_stmt = std::make_shared(); - auto tmp_var_expr = std::make_shared(); - tmp_var_expr->var = mir::Var(tmp_var_decl->name, tmp_var_decl->type); - tmp_var_assign_stmt->lhs = tmp_var_expr; - tmp_var_assign_stmt->expr = 
tmp_var_decl->initVal; - - mir_context_->field_vector_init_stmts.push_back(tmp_var_assign_stmt); - mir_context_->field_vector_init_stmts.push_back(apply_stmt); - - - } - } - + // Special case if this is GPU lowering + + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + // Do nothing for GPU + } else { + auto orig_init_val = var_decl->initVal; + mir::VectorType::Ptr vector_type = std::dynamic_pointer_cast(var_decl->type); + + if (mir::isa(vector_type->vector_element_type)){ + mir::ScalarType::Ptr element_type = mir::to( + vector_type->vector_element_type); + //reset the initval to something default 0 for integer and float + if (element_type->type == mir::ScalarType::Type::INT){ + //initial value should be a int + auto zero = std::make_shared(); + zero->val = 0; + var_decl->initVal = zero; + } + else if (element_type->type == mir::ScalarType::Type::FLOAT){ + //initial value should be a float + + } + + //insert another const var decl as the temporary holder for the function + //this vector is always going to be assigned a value (pointer) returned from the function call and hence does not need allocation + auto tmp_var_decl = std::make_shared(); + tmp_var_decl->needs_allocation = false; + tmp_var_decl->type = var_decl->type; + tmp_var_decl->initVal = orig_init_val; + tmp_var_decl->name = "generated_tmp_vector_" + mir_context_->getUniqueNameCounterString(); + tmp_var_decl->modifier = var_decl->modifier; + mir_context_->insertNewConstVectorDeclEnd(tmp_var_decl); + + //create a new apply function decl that copies over the vector + if (mir_context_->isVertexElementType(vector_type->element_type->ident)){ + //a vertexset apply function if the element is a vertexset + mir::FuncDecl::Ptr copy_over_apply_func = std::make_shared(); + // create a utility function for creating new vertexset apply + // set up a name + copy_over_apply_func->name = "generated_vector_op_apply_func_" + + mir_context_->getUniqueNameCounterString(); + auto arg_var_type = vector_type->element_type; + mir::Var arg_var = mir::Var("v", arg_var_type); + std::vector arg_var_list = std::vector(); + arg_var_list.push_back(arg_var); + copy_over_apply_func->args = arg_var_list; + + auto mir_stmt_body = std::make_shared(); + auto assign_stmt = std::make_shared(); + + auto lhs = std::make_shared( + var_decl->name, "v", + var_decl->type, + vector_type->element_type + ); + + auto rhs = std::make_shared( + tmp_var_decl->name, "v", + tmp_var_decl->type, + vector_type->element_type + ); + + assign_stmt->lhs = lhs; + assign_stmt->expr = rhs; + mir_stmt_body->insertStmtEnd(assign_stmt); + copy_over_apply_func->body = mir_stmt_body; + //insert the utility function back into function list + mir_context_->insertFuncDeclFront(copy_over_apply_func); + + + // Lastly, insert a vertexset apply expression at the beginning of main + mir::VarDecl::Ptr global_vertex_set_var_decl = mir_context_->getGlobalConstVertexSet(); + mir::VertexSetApplyExpr::Ptr vertex_set_apply_expr = + std::make_shared(global_vertex_set_var_decl->name, + global_vertex_set_var_decl->type, + copy_over_apply_func->name); + mir::ExprStmt::Ptr apply_stmt = std::make_shared(); + apply_stmt->expr = vertex_set_apply_expr; + + //No longer directly add to the main function block + //mir::FuncDecl::Ptr main_func_decl = mir_context_->getMainFuncDecl(); + //main_func_decl->body->insertStmtFront(apply_stmt); + + + //puts the current vector initilization block into the back of the initialization block + //which would be later put into main function declaration + 
//vector_initialization_block->insertStmtEnd(apply_stmt); + + + auto tmp_var_assign_stmt = std::make_shared(); + auto tmp_var_expr = std::make_shared(); + tmp_var_expr->var = mir::Var(tmp_var_decl->name, tmp_var_decl->type); + tmp_var_assign_stmt->lhs = tmp_var_expr; + tmp_var_assign_stmt->expr = tmp_var_decl->initVal; + + mir_context_->field_vector_init_stmts.push_back(tmp_var_assign_stmt); + mir_context_->field_vector_init_stmts.push_back(apply_stmt); + + + } + } + } } else { //field vector property and the initialization is not through a call stmt //create a new apply function decl that initializes every value diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index dc7ee374..63ca6a73 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -34,10 +34,10 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t __device__ d_get_degree(int32_t vertex_id) { return d_src_offsets[vertex_id + 1] - d_src_offsets[vertex_id]; } - VertexFrontier getFullFrontier(void) { - VertexFrontier frontier; - frontier.max_num_elems = num_vertices; - return frontier; + VertexFrontier full_frontier; + VertexFrontier& getFullFrontier(void) { + full_frontier.max_num_elems = num_vertices; + return full_frontier; } }; void consume(int32_t _) { @@ -141,6 +141,24 @@ static int32_t builtin_getVertices(GraphT &graph) { return graph.num_vertices; } +template +static int32_t __device__ device_builtin_getVertices(GraphT &graph) { + return graph.num_vertices; +} + +template +void __global__ init_degrees_kernel(int32_t *degrees, GraphT graph) { + for (int32_t vid = threadIdx.x + blockIdx.x * blockDim.x; vid < graph.num_vertices; vid += gridDim.x * blockDim.x) + degrees[vid] = graph.d_get_degree(vid); +} + +template +static int32_t* builtin_getOutDegrees(GraphT &graph) { + int32_t *degrees = nullptr; + cudaMalloc(&degrees, sizeof(int32_t) * graph.num_vertices); + init_degrees_kernel<<>>(degrees, graph); + return degrees; +} } #endif diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index a7bf9ff3..80f41150 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -28,8 +28,8 @@ static void __global__ vertex_set_apply_kernel(VertexFrontier frontier) { // VERTEX BASED LOAD BALANCE FUNCTIONS template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> -void __device__ vertex_based_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { - int32_t vid = threadIdx.x + blockDim.x * blockIdx.x; +void __device__ vertex_based_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = threadIdx.x + blockDim.x * cta_id; if (vid >= AccessorType::getSize(input_frontier)) return; int32_t src = AccessorType::getElement(input_frontier, vid); @@ -46,9 +46,15 @@ void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t & num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } +template +void __device__ vertex_based_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeDevice(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __global__ vertex_based_load_balance_kernel(GraphT graph, 
VertexFrontier input_frontier, VertexFrontier output_frontier) { - vertex_based_load_balance(graph, input_frontier, output_frontier); + vertex_based_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> @@ -60,9 +66,56 @@ void __host__ vertex_based_load_balance_host(GraphT &graph, Vert template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __device__ vertex_based_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { - //int32_t num_cta, cta_size; - //vertex_based_load_balance_info_device(input_frontier, num_cta, cta_size); - // Do the actual processing + int32_t num_cta, cta_size; + vertex_based_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + vertex_based_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + +// EDGE_ONLY LOAD BALANCE FUNCTIONS + +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> +static void __device__ edge_only_load_balance(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; + int32_t total_threads = blockDim.x * total_cta; + for (int32_t eid = thread_id; eid < graph.num_edges; eid += total_threads) { + int32_t src = graph.d_edge_src[eid]; + if (src_filter(src) == true) { + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +} +template +void __host__ edge_only_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + num_cta = NUM_CTA; + cta_size = CTA_SIZE; +} +template +void __device__ edge_only_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + num_cta = NUM_CTA; + cta_size = CTA_SIZE; +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ edge_only_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + edge_only_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ edge_only_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + edge_only_load_balance_info(input_frontier, num_cta, cta_size); + edge_only_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ edge_only_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + vertex_based_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + this_grid().sync(); } // TWCE LOAD BALANCE FUNCTIONS diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index be990644..226a561a 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -31,6 +31,12 @@ static bool __device__ writeMin(T *dst, T src) { bool ret = (old_value > src); return ret; } + +template +static bool __device__ writeAdd(T *dst, T 
src) { + atomicAdd(dst, src); + return true; +} template static bool __device__ CAS(T *dst, T old_val, const T &new_val) { return old_val == atomicCAS(dst, old_val, new_val); diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 8b4254f1..47ac9694 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -32,6 +32,9 @@ struct VertexFrontier { format_ready_type format_ready; }; + +static VertexFrontier sentinel_frontier; + static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { int32_t curr_size = 0; cudaMemcpy(&curr_size, frontier.d_num_elems_input, sizeof(int32_t), cudaMemcpyDeviceToHost); From 38ed3e84183b52e1e57755222be60013be4e9c8f Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 17 Oct 2019 18:57:29 -0400 Subject: [PATCH 42/88] adding support for timing the code --- .../test_input/sssp_delta_stepping.cu | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 0b0fe441..dbe138ae 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -1,7 +1,7 @@ #include "gpu_intrinsics.h" #include -#define ITER_COUNT (1) + #define USE_DEDUP 0 #define SORT_NODES 0 #include @@ -10,6 +10,12 @@ //#define DEBUG +#ifdef DEBUG + #define ITER_COUNT (4) +#else + #define ITER_COUNT (1) +#endif + typedef struct { int32_t *SP; int32_t *output_size; @@ -142,15 +148,12 @@ int main(int argc, char *argv[]) { cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; - gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); + algo_state host_state, device_state; allocate_state(host_state, device_state, graph); - host_state.window_lower = 0; - host_state.window_upper = delta; - device_state.window_lower = 0; - device_state.window_upper = delta; + @@ -163,7 +166,13 @@ int main(int argc, char *argv[]) { //this sets it to Sparse gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); gpu_runtime::builtin_addVertex(frontier, start_vertex); + gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); startTimer(); + + host_state.window_lower = 0; + host_state.window_upper = delta; + device_state.window_lower = 0; + device_state.window_upper = delta; init_kernel<<>>(graph, device_state, start_vertex); gpu_runtime::cudaCheckLastError(); @@ -180,7 +189,9 @@ int main(int argc, char *argv[]) { gpu_runtime::vertex_set_prepare_sparse(frontier); cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); gpu_runtime::cudaCheckLastError(); - gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); + //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); + gpu_runtime::TWCE_load_balance_host(graph, frontier, frontier); + gpu_runtime::swap_bytemaps(frontier); // set the input to the prepare function @@ -209,6 +220,7 @@ int main(int argc, char *argv[]) { } + cudaDeviceSynchronize(); t = stopTimer(); #ifdef DEBUG @@ -221,22 +233,32 @@ int main(int argc, char *argv[]) { #ifdef DEBUG printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); #endif - //printf("Time elapsed = %f\n", iter_total); + total_time += 
iter_total; } - //printf("Total time = %f\n", total_time); + + #ifdef DEBUG + printf("Total time = %f\n", total_time); + #endif + if (argc > 3) if (argv[4][0] == 'v'){ //FILE *output = fopen("output.txt", "w"); cudaMemcpy(host_state.SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); - for (int i = 0; i < graph.num_vertices; i++) + #ifdef DEBUG + FILE *output = fopen("output.txt", "w"); + #endif + + for (int i = 0; i < graph.num_vertices; i++){ #ifdef DEBUG - printf("%d, %d\n", i, host_state.SP[i]); + fprintf(output, "%d, %d\n", i, host_state.SP[i]); #else printf("%d\n", host_state.SP[i]); #endif + } } return 0; } From 7c0d5aa4fe7b9fc65f9b03aef095747d59538ba3 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 22 Oct 2019 04:30:42 -0400 Subject: [PATCH 43/88] Fixed the indentation issue with sssp_verified_test to be compatible with python3 --- test/gpu_tests/all_gpu_tests.py | 60 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 5dfa697c..564f21dc 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -33,21 +33,21 @@ def get_command_output(self, command): self.assertEqual(exitcode, 0) return output - def sssp_verified_test(self, input_file_name, use_delta=False): - self.cpp_compile_test(input_file_name, []) - if use_delta: - #start point 0, delta 10, verified - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > verifier_input ") - else: - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") - output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") - test_flag = False - for line in output.rstrip().split("\n"): - if line.rstrip().find("SUCCESSFUL") != -1: - test_flag = True - break; - self.assertEqual(test_flag, True) - + def sssp_verified_test(self, input_file_name, use_delta=False): + self.cpp_compile_test(input_file_name, []) + if use_delta: + #start point 0, delta 10, verified + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > verifier_input ") + else: + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") + output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") + test_flag = False + for line in output.rstrip().split("\n"): + if line.rstrip().find("SUCCESSFUL") != -1: + test_flag = True + break; + self.assertEqual(test_flag, True) + @classmethod def setUpClass(cls): if NVCC_COMPILER == "CUDA_NVCC_EXECUTABLE-NOTFOUND": @@ -56,7 +56,7 @@ def setUpClass(cls): cls.build_directory = GRAPHIT_BUILD_DIRECTORY cls.scratch_directory = GRAPHIT_BUILD_DIRECTORY + "/scratch" - cls.verifier_directory = cls.build_directory + "/bin" + cls.verifier_directory = cls.build_directory + "/bin" if os.path.isdir(cls.scratch_directory): shutil.rmtree(cls.scratch_directory) os.mkdir(cls.scratch_directory) @@ -122,18 +122,18 @@ def test_basic_load_graph(self): self.assertEqual(output[0], "14, 106") def test_runtime_library(self): print (self.cpp_exec_test("runtime_lib_tests.cu", ["-I", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest", GRAPHIT_SOURCE_DIRECTORY+"/test/gtest/gtest-all.cc"], [self.graph_directory])) - - def test_sssp_lp_runtime_lib(self): - self.cpp_exec_test("sssp_lp.cu", [], 
[self.graph_directory + "/simple_mtx.mtx", "v"]) + + def test_sssp_lp_runtime_lib(self): + self.cpp_exec_test("sssp_lp.cu", [], [self.graph_directory + "/simple_mtx.mtx", "v"]) - def test_sssp_lp_verified(self): - self.sssp_verified_test("sssp_lp.cu") - - def test_sssp_delta_stepping(self): - self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "0", "10", "v"]) + def test_sssp_lp_verified(self): + self.sssp_verified_test("sssp_lp.cu") + + def test_sssp_delta_stepping(self): + self.cpp_exec_test("sssp_delta_stepping.cu", [], [self.graph_directory + "/simple_mtx.mtx", "0", "10", "v"]) - def test_sssp_delta_stepping_verified(self): - self.sssp_verified_test("sssp_delta_stepping.cu", True) + def test_sssp_delta_stepping_verified(self): + self.sssp_verified_test("sssp_delta_stepping.cu", True) def test_simple_graphit_exec(self): output = self.graphit_exec_test("simple_graph_load.gt", "default_gpu_schedule.gt", [], [self.graph_directory + "/simple_mtx.mtx"]) @@ -141,9 +141,9 @@ def test_simple_graphit_exec(self): self.assertEqual(len(output), 2) self.assertEqual(output[0], "14") - + if __name__ == '__main__': unittest.main() - #suite = unittest.TestSuite() - #suite.addTest(TestGraphitCompiler('test_sssp_delta_stepping')) - #unittest.TextTestRunner(verbosity=2).run(suite) + #suite = unittest.TestSuite() + #suite.addTest(TestGraphitCompiler('test_sssp_delta_stepping')) + #unittest.TextTestRunner(verbosity=2).run(suite) From cd2ae36afd549132e65f78272d3e37374ce1a284 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 22 Oct 2019 18:05:28 -0400 Subject: [PATCH 44/88] updating the priority queue --- .../infra_gpu/gpu_priority_queue.h | 64 ++++++++++--------- test/gpu_tests/all_gpu_tests.py | 2 +- .../gpu_tests/test_input/runtime_lib_tests.cu | 2 +- .../test_input/sssp_delta_stepping.cu | 11 +++- 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 78fc72d2..81e6eb06 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -6,36 +6,40 @@ #include "vertex_frontier.h" namespace gpu_runtime { - -template - class GPUPriorityQueue { - - public: - explicit GPUPriorityQueue(PriorityT_* priorities, PriorityT_ delta=1) - : priorities_(priorities), delta_(delta){ - } - - size_t get_current_priority(){ - return current_priority_; - } - - void update_current_priority(PriorityT_ priority_change_){ - - } - - bool finished() { - //TODO - return true; - } - - bool finishedNode(NodeID v){ - return priorities_[v]/delta_ < get_current_priority();; - } - - PriorityT_* priorities_; - PriorityT_ delta_; - PriorityT_ current_priority_; - }; + + template + class GPUPriorityQueue { + + public: + + size_t get_current_priority(){ + return current_priority_; + } + + void update_current_priority(PriorityT_ priority_change_){ + + } + + bool finished() { + //TODO + return true; + } + + bool host_finishedNode(NodeID v){ + return host_priorities_[v]/delta_ < get_current_priority();; + } + + bool __device__ device_finishedNode(NodeID v){ + + } + + PriorityT_* host_priorities_ = nullptr; + PriorityT_* device_priorities_ = nullptr; + + PriorityT_ delta_ = 1; + PriorityT_ current_priority_ = 0; + + }; } diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 5dfa697c..71e27f71 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -78,7 +78,7 @@ def 
setUpClass(cls): shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") cls.graph_directory = cls.scratch_directory + "/graphs" - cls.executable_name = cls.scratch_directory + "/test_exectuable" + cls.executable_name = cls.scratch_directory + "/test_executable" cls.cuda_filename = cls.scratch_directory + "/test_cpp.cu" cls.graphitc_py = GRAPHIT_BUILD_DIRECTORY + "/bin/graphitc.py" diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu index c16a6173..9e43d808 100644 --- a/test/gpu_tests/test_input/runtime_lib_tests.cu +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -22,7 +22,7 @@ TEST_F(GPURuntimeLibTest, SimplePriorityQueueTest){ gpu_runtime::load_graph(edges, graph_directory + "/simple_mtx.mtx", false); int num_vertices = gpu_runtime::builtin_getVertices(edges); int* priorities = new int[num_vertices]; - gpu_runtime::GPUPriorityQueue pq = gpu_runtime::GPUPriorityQueue(priorities); + gpu_runtime::GPUPriorityQueue pq; EXPECT_EQ (14, num_vertices); } diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index dbe138ae..ef79860c 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -11,11 +11,14 @@ //#define DEBUG #ifdef DEBUG - #define ITER_COUNT (4) + #define ITER_COUNT (5) #else #define ITER_COUNT (1) #endif +gpu_runtime::GPUPriorityQueue host_gpq; +gpu_runtime::GPUPriorityQueue __device__ device_gpq; + typedef struct { int32_t *SP; int32_t *output_size; @@ -147,8 +150,10 @@ int main(int argc, char *argv[]) { cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); - __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; + __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; + + //cudaMemcpyToSymbol(gpq, &host_gpq, sizeof(host_gpq), 0); algo_state host_state, device_state; allocate_state(host_state, device_state, graph); @@ -257,7 +262,7 @@ int main(int argc, char *argv[]) { fprintf(output, "%d, %d\n", i, host_state.SP[i]); #else printf("%d\n", host_state.SP[i]); - #endif + #endif } } return 0; From 7f092a190d5d2f991b5e556586640de240240590 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 22 Oct 2019 18:45:32 -0400 Subject: [PATCH 45/88] Implemented new vector field property analyzer for GPU and changed the lowering to use it when gpu schedule is enabled --- .../gpu_vector_field_properties_analyzer.h | 47 +++++++ .../gpu_vector_field_properties_analyzer.cpp | 133 ++++++++++++++++++ src/midend/mir_lower.cpp | 7 +- 3 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 include/graphit/midend/gpu_vector_field_properties_analyzer.h create mode 100644 src/midend/gpu_vector_field_properties_analyzer.cpp diff --git a/include/graphit/midend/gpu_vector_field_properties_analyzer.h b/include/graphit/midend/gpu_vector_field_properties_analyzer.h new file mode 100644 index 00000000..5555b37c --- /dev/null +++ b/include/graphit/midend/gpu_vector_field_properties_analyzer.h @@ -0,0 +1,47 @@ +#ifndef GPU_VECTOR_FIELD_PROPERTIES_ANALYZER_H +#define GPU_VECTOR_FIELD_PROPERTIES_ANALYZER_H + +#include +#include +#include +#include +namespace graphit { + +class GPUVectorFieldPropertiesAnalyzer { + struct PropertyAnalyzingVisitor: public mir::MIRVisitor { + MIRContext* mir_context_; + + std::unordered_set independent_variables; + mir::FuncDecl::Ptr 
enclosing_function; + + PropertyAnalyzingVisitor(MIRContext* mir_context, std::unordered_set idp, mir::FuncDecl::Ptr ef): mir_context_(mir_context), independent_variables(idp), enclosing_function(ef) { + } + + using mir::MIRVisitor::visit; + + bool is_independent_index(mir::Expr::Ptr); + + virtual void visit(mir::TensorReadExpr::Ptr) override; + virtual void visit(mir::AssignStmt::Ptr) override; + virtual void visit(mir::ReduceStmt::Ptr) override; + + }; + struct ApplyExprVisitor: public mir::MIRVisitor { + MIRContext* mir_context_; + ApplyExprVisitor(MIRContext* mir_context): mir_context_(mir_context) { + } + using mir::MIRVisitor::visit; + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + }; + + MIRContext* mir_context_; +public: + void analyze(void); + GPUVectorFieldPropertiesAnalyzer(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context) { + } +}; + +} +#endif + diff --git a/src/midend/gpu_vector_field_properties_analyzer.cpp b/src/midend/gpu_vector_field_properties_analyzer.cpp new file mode 100644 index 00000000..1e200b2f --- /dev/null +++ b/src/midend/gpu_vector_field_properties_analyzer.cpp @@ -0,0 +1,133 @@ +#include + +namespace graphit { +void GPUVectorFieldPropertiesAnalyzer::analyze(void) { + ApplyExprVisitor visitor(mir_context_); + for (auto func: mir_context_->getFunctionList()) { + func->accept(&visitor); + } +} +void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + // Push apply expression requires synchronization on src when using non vertex based load balance + // Push apply expression always requires synchronization on dst + std::unordered_set idp_set; + mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function_name); + + std::string src_name = func->args[0].getName(); + std::string dst_name = func->args[1].getName(); + + switch (pesae->applied_schedule.load_balancing) { + case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED: + idp_set.insert(src_name); + break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); +} +void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + // Pull apply expression requires synchronization on dst when using non vertex based load balance + // Pull apply expression always requires synchronization on src + std::unordered_set idp_set; + mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function_name); + + std::string src_name = func->args[0].getName(); + std::string dst_name = func->args[1].getName(); + + switch (pesae->applied_schedule.load_balancing) { + case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED: + idp_set.insert(dst_name); + break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); +} + +bool GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::is_independent_index(mir::Expr::Ptr expr) { + if (mir::isa(expr)) { + mir::VarExpr::Ptr var_expr = mir::to(expr); + if (independent_variables.count(var_expr->var.getName()) > 0) { + return true; + } + } + if (mir::isa(expr)) { + mir::AddExpr::Ptr add_expr = mir::to(expr); + if (mir::isa(add_expr->lhs) && is_independent_index(add_expr->rhs)) + return true; + if (mir::isa(add_expr->rhs) && is_independent_index(add_expr->lhs)) + return true; + } + if (mir::isa(expr)) { + 
mir::MulExpr::Ptr mul_expr = mir::to(expr); + if (mir::isa(mul_expr->lhs) && is_independent_index(mul_expr->rhs) && mir::to(mul_expr->lhs)->val != 0) + return true; + if (mir::isa(mul_expr->rhs) && is_independent_index(mul_expr->lhs) && mir::to(mul_expr->rhs)->val != 0) + return true; + } + + return false; + +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::TensorReadExpr::Ptr tre) { + + tre->index->accept(this); + + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::READ_ONLY; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::AssignStmt::Ptr assign_stmt) { + + assign_stmt->expr->accept(this); + + if (!mir::isa(assign_stmt->lhs)) + return; + + + mir::TensorReadExpr::Ptr tre = mir::to(assign_stmt->lhs); + tre->index->accept(this); + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::WRITE_ONLY; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; +} +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::ReduceStmt::Ptr reduce_stmt) { + reduce_stmt->expr->accept(this); + + if (!mir::isa(reduce_stmt->lhs)) + return; + mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); + tre->index->accept(this); + FieldVectorProperty property; + property.read_write_type = FieldVectorProperty::ReadWriteType::READ_AND_WRITE; + if (is_independent_index(tre->index)) { + property.access_type_ = FieldVectorProperty::AccessType::LOCAL; + } else { + property.access_type_ = FieldVectorProperty::AccessType::SHARED; + } + tre->field_vector_prop_ = property; + std::string target = tre->getTargetNameStr(); + enclosing_function->field_vector_properties_map_[target] = property; + +} +} diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index b5b5b857..3f7b5223 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,11 @@ namespace graphit { // Use program analysis to figure out the properties of each tensor access // read write type: read/write/read and write (reduction) // access type: shared or local - VectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + GPUVectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + } else { + VectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + } // The pass on lowering abstract data structures to // concrete data structures with physical layout information (arrays, field of a struct, dictionary) From ce14b3fae7729fb04bae9b2ba3e5383a8b6618cc Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 22 Oct 2019 20:28:01 -0400 Subject: [PATCH 46/88] Added liveness analysis for frontier reuse in edgeset apply expr --- .../graphit/midend/frontier_reuse_analysis.h | 37 
++++++++++ include/graphit/midend/mir.h | 2 + src/backend/codegen_gpu/codegen_gpu.cpp | 6 +- src/midend/frontier_reuse_analysis.cpp | 69 +++++++++++++++++++ src/midend/mir_lower.cpp | 8 ++- 5 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 include/graphit/midend/frontier_reuse_analysis.h create mode 100644 src/midend/frontier_reuse_analysis.cpp diff --git a/include/graphit/midend/frontier_reuse_analysis.h b/include/graphit/midend/frontier_reuse_analysis.h new file mode 100644 index 00000000..396f40b9 --- /dev/null +++ b/include/graphit/midend/frontier_reuse_analysis.h @@ -0,0 +1,37 @@ +#ifndef FRONTIER_REUSE_ANALYSIS_H +#define FRONTIER_REUSE_ANALYSIS_H + +#include +#include +#include +#include +namespace graphit { +class FrontierReuseAnalysis { +public: + MIRContext *mir_context_; + FrontierReuseAnalysis (MIRContext* mir_context): mir_context_(mir_context) { + } + struct ReuseFindingVisitor: public mir::MIRVisitor { + MIRContext *mir_context_; + ReuseFindingVisitor(MIRContext* mir_context): mir_context_(mir_context) { + } + using mir::MIRVisitor::visit; + std::vector to_deletes; + bool is_frontier_reusable(mir::StmtBlock::Ptr, int index, std::string frontier_name); + virtual void visit(mir::StmtBlock::Ptr) override; + }; + struct FrontierUseFinder: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + bool is_used = false; + std::string frontier_name; + + virtual void visit(mir::VarExpr::Ptr) override; + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + + }; + void analyze(void); +}; + +} +#endif diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index ce5089d0..ffab1800 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -914,6 +914,8 @@ namespace graphit { std::string scope_label_name; MergeReduceField::Ptr merge_reduce; + bool frontier_reusable = false; + typedef std::shared_ptr Ptr; virtual void accept(MIRVisitor *visitor) { diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index edcbbf4d..19c2ac4d 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -543,7 +543,11 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); } // We will assume that the output frontier can reuse the input frontier. - // TOOD: Add liveness analysis for this + // Assert that the frontier can be reused + if (target != nullptr && esae->frontier_reusable != true) { + assert(false && "GPU backend currently doesn't support creating frontiers from the apply expressions. 
Could not find opportunity for reuse\n"); + } + printIndent(); oss << "{" << std::endl; indent(); diff --git a/src/midend/frontier_reuse_analysis.cpp b/src/midend/frontier_reuse_analysis.cpp new file mode 100644 index 00000000..49aee9be --- /dev/null +++ b/src/midend/frontier_reuse_analysis.cpp @@ -0,0 +1,69 @@ +#include + +namespace graphit { +void FrontierReuseAnalysis::analyze(void) { + for (auto func: mir_context_->getFunctionList()) { + ReuseFindingVisitor visitor(mir_context_); + func->accept(&visitor); + } +} +bool FrontierReuseAnalysis::ReuseFindingVisitor::is_frontier_reusable(mir::StmtBlock::Ptr stmt_block, int index, std::string frontier_name) { + FrontierUseFinder finder; + finder.frontier_name = frontier_name; + index++; + for (int i = index; i < stmt_block->stmts->size(); i++) { + if (mir::isa((*(stmt_block->stmts))[i])) { + mir::ExprStmt::Ptr expr_stmt = mir::to((*(stmt_block->stmts))[i]); + if (mir::isa(expr_stmt->expr)) { + mir::Call::Ptr call_expr = mir::to(expr_stmt->expr); + if (call_expr->name == "deleteObject" && mir::isa(call_expr->args[0]) && mir::to(call_expr->args[0])->var.getName() == frontier_name) { + to_deletes.push_back(expr_stmt); + return true; + } + } + } + (*(stmt_block->stmts))[i]->accept(&finder); + if (finder.is_used) + return false; + } + return false; +} +void FrontierReuseAnalysis::ReuseFindingVisitor::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + to_deletes.clear(); + for (int i = 0; i < stmt_block->stmts->size(); i++) { + mir::Stmt::Ptr this_stmt = (*(stmt_block->stmts))[i]; + if (mir::isa(this_stmt)) { + mir::AssignStmt::Ptr assign_stmt = mir::to(this_stmt); + if (mir::isa(assign_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); + if (esae->from_func != "" && !mir_context_->isFunction(esae->from_func)) { + std::string frontier_name = esae->from_func; + if (is_frontier_reusable(stmt_block, i, frontier_name)) { + esae->frontier_reusable = true; + } + } + } + } + if (std::find(to_deletes.begin(), to_deletes.end(), this_stmt) == to_deletes.end()) { + new_stmts.push_back(this_stmt); + } + } + (*(stmt_block->stmts)) = new_stmts; + mir::MIRVisitor::visit(stmt_block); +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::VarExpr::Ptr var_expr) { + if (var_expr->var.getName() == frontier_name) + is_used = true; +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + mir::MIRVisitor::visit(pesae); + if (pesae->from_func == frontier_name) + is_used = true; +} +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + mir::MIRVisitor::visit(pesae); + if (pesae->from_func == frontier_name) + is_used = true; +} +} diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index 3f7b5223..cbcdc553 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace graphit { /** @@ -49,9 +50,9 @@ namespace graphit { // read write type: read/write/read and write (reduction) // access type: shared or local if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { - GPUVectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + GPUVectorFieldPropertiesAnalyzer(mir_context, schedule).analyze(); } else { - VectorFieldPropertiesAnalyzer(mir_context,schedule).analyze(); + VectorFieldPropertiesAnalyzer(mir_context, schedule).analyze(); } // The pass on lowering abstract data structures to @@ -73,6 +74,9 @@ namespace graphit { // This pass lowers 
while loops that have fusion schedule attached to them WhileLoopFusion(mir_context, schedule).lower(); + + // This pass finds EdgeSetApplyExpressions that allow frontiers to be reused and removes the corresponding deletes + FrontierReuseAnalysis(mir_context).analyze(); } } From 717cf1f9a1356efe58fed99bb19ff00ae9e8f942 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 22 Oct 2019 23:40:06 -0400 Subject: [PATCH 47/88] Fixed all runtime test issues --- .../graphit/midend/frontier_reuse_analysis.h | 1 + include/graphit/midend/mir.h | 2 + src/midend/frontier_reuse_analysis.cpp | 18 ++++++ src/midend/mir.cpp | 2 + src/midend/mir_lower.cpp | 5 +- test/c++/high_level_schedule_test.cpp | 61 +++++++++++++++++-- 6 files changed, 83 insertions(+), 6 deletions(-) diff --git a/include/graphit/midend/frontier_reuse_analysis.h b/include/graphit/midend/frontier_reuse_analysis.h index 396f40b9..731c2d15 100644 --- a/include/graphit/midend/frontier_reuse_analysis.h +++ b/include/graphit/midend/frontier_reuse_analysis.h @@ -28,6 +28,7 @@ class FrontierReuseAnalysis { virtual void visit(mir::VarExpr::Ptr) override; virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::EdgeSetApplyExpr::Ptr) override; }; void analyze(void); diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index ffab1800..726e7145 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -948,6 +948,7 @@ namespace graphit { enable_deduplication = edgeset_apply->enable_deduplication; applied_schedule = edgeset_apply->applied_schedule; + frontier_reusable = edgeset_apply->frontier_reusable; } virtual void accept(MIRVisitor *visitor) { @@ -975,6 +976,7 @@ namespace graphit { is_parallel = edgeset_apply->is_parallel; enable_deduplication = edgeset_apply->enable_deduplication; applied_schedule = edgeset_apply->applied_schedule; + frontier_reusable = edgeset_apply->frontier_reusable; } virtual void accept(MIRVisitor *visitor) { diff --git a/src/midend/frontier_reuse_analysis.cpp b/src/midend/frontier_reuse_analysis.cpp index 49aee9be..f5e785b3 100644 --- a/src/midend/frontier_reuse_analysis.cpp +++ b/src/midend/frontier_reuse_analysis.cpp @@ -44,6 +44,19 @@ void FrontierReuseAnalysis::ReuseFindingVisitor::visit(mir::StmtBlock::Ptr stmt_ } } } + } else if (mir::isa(this_stmt)) { + mir::VarDecl::Ptr var_decl = mir::to(this_stmt); + if (var_decl->initVal != nullptr) { + if (mir::isa(var_decl->initVal)) { + mir::EdgeSetApplyExpr::Ptr esae = mir::to(var_decl->initVal); + if (esae->from_func != "" && !mir_context_->isFunction(esae->from_func)) { + std::string frontier_name = esae->from_func; + if (is_frontier_reusable(stmt_block, i, frontier_name)) { + esae->frontier_reusable = true; + } + } + } + } } if (std::find(to_deletes.begin(), to_deletes.end(), this_stmt) == to_deletes.end()) { new_stmts.push_back(this_stmt); @@ -66,4 +79,9 @@ void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::PullEdgeSetApplyExpr:: if (pesae->from_func == frontier_name) is_used = true; } +void FrontierReuseAnalysis::FrontierUseFinder::visit(mir::EdgeSetApplyExpr::Ptr esae) { + mir::MIRVisitor::visit(esae); + if (esae->from_func == frontier_name) + is_used = true; +} } diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 820199b9..a17aafbe 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -193,6 +193,8 @@ namespace graphit { enable_deduplication = expr->enable_deduplication; is_weighted = expr->is_weighted; 
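// Note on this copy() hunk: FrontierReuseAnalysis stores its result in
// frontier_reusable before ApplyExprLower runs, so every place an
// EdgeSetApplyExpr is cloned -- the Push/Pull constructors patched in mir.h
// above and this copy() -- must forward the flag, or the lowered expression
// would lose the reuse decision and trip the assert added in codegen_gpu.cpp.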
scope_label_name = expr->scope_label_name; + frontier_reusable = expr->frontier_reusable; + } diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index cbcdc553..1e97ebbc 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -35,6 +35,9 @@ namespace graphit { //This pass needs to happen before ApplyExprLower pass because the default ReduceBeforeUpdate uses ApplyExprLower PriorityFeaturesLower(mir_context, schedule).lower(); + // This pass finds EdgeSetApplyExpressions that allow frontiers to be reused and removes the corresponding deletes + FrontierReuseAnalysis(mir_context).analyze(); + // This pass sets properties of edgeset apply expressions based on the schedules including // edge traversal direction: push, pull, denseforward, hybrid_dense, hybrid_denseforward // deduplication: enable / disable @@ -75,8 +78,6 @@ namespace graphit { // This pass lowers while loops that have fusion schedule attached to them WhileLoopFusion(mir_context, schedule).lower(); - // This pass finds EdgeSetApplyExpressions that allow frontiers to be reused and removes the corresponding deletes - FrontierReuseAnalysis(mir_context).analyze(); } } diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index ad696779..fffc75c6 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -43,6 +43,29 @@ class HighLevelScheduleTest : public ::testing::Test { " print \"finished running BFS\"; \n" "end"); + const char* bfs_char_gpu = ("element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex) = load (\"../../test/graphs/test.el\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const parent : vector{Vertex}(int) = -1;\n" + "func updateEdge(src : Vertex, dst : Vertex) " + " parent[dst] = src; " + "end\n" + "func toFilter(v : Vertex) -> output : bool " + " output = parent[v] == -1; " + "end\n" + "func main() " + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); " + " frontier.addVertex(1); " + " while (frontier.getVertexSetSize() != 0) " + " #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge, parent, true); \n" + " delete frontier;\n" + " frontier = output;\n" + " end\n" + " print \"finished running BFS\"; \n" + "end"); + + const char* pr_char = ("element Vertex end\n" "element Edge end\n" @@ -129,6 +152,32 @@ class HighLevelScheduleTest : public ::testing::Test { " end\n" "end"; + const char * sssp_char_gpu = "element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex, int) = load (\"../test/graphs/test.wel\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const SP : vector{Vertex}(int) = 2147483647; %should be INT_MAX \n" + "func updateEdge(src : Vertex, dst : Vertex, weight : int) -> output : bool\n" + " SP[dst] min= (SP[src] + weight);\n" + "end\n" + "func main() \n" + " var n : int = edges.getVertices();\n" + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);\n" + " frontier.addVertex(0); %add source vertex \n" + " SP[0] = 0;\n" + " var rounds : int = 0;\n" + " while (frontier.getVertexSetSize() != 0)\n" + " #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge, SP);\n" + " delete frontier;\n" + " frontier = output;\n" + " rounds = rounds + 1;\n" + " if rounds == n\n" + " print \"negative cycle\";\n" + " end\n" + " end\n" + "end"; + + const char * sssp_async_char = "element Vertex end\n" "element Edge end\n" 
"const edges : edgeset{Edge}(Vertex,Vertex, int) = load (\"../test/graphs/test.wel\");\n" @@ -715,8 +764,10 @@ class HighLevelScheduleTest : public ::testing::Test { "end"); bfs_str_ = string (bfs_char); + bfs_str_gpu_ = string (bfs_char_gpu); pr_str_ = string(pr_char); sssp_str_ = string (sssp_char); + sssp_str_gpu_ = string (sssp_char_gpu); sssp_async_str_ = string (sssp_async_char); cf_str_ = string (cf_char); cc_str_ = string (cc_char); @@ -805,8 +856,10 @@ class HighLevelScheduleTest : public ::testing::Test { graphit::MIRContext *mir_context_; string bfs_str_; + string bfs_str_gpu_; string pr_str_; string sssp_str_; + string sssp_str_gpu_; string sssp_async_str_; string cf_str_; string cc_str_; @@ -2291,7 +2344,7 @@ TEST_F(HighLevelScheduleTest, SetCoverUintDefaultSchedule){ } TEST_F(HighLevelScheduleTest, BFSBasicSimpleGPUScheduleTest) { - istringstream is (bfs_str_); + istringstream is (bfs_str_gpu_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); @@ -2304,7 +2357,7 @@ TEST_F(HighLevelScheduleTest, BFSBasicSimpleGPUScheduleTest) { } TEST_F(HighLevelScheduleTest, BFSBasicHybridGPUScheduleTest) { - istringstream is (bfs_str_); + istringstream is (bfs_str_gpu_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); @@ -2321,7 +2374,7 @@ TEST_F(HighLevelScheduleTest, BFSBasicHybridGPUScheduleTest) { } TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { - istringstream is (sssp_str_); + istringstream is (sssp_str_gpu_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); @@ -2335,7 +2388,7 @@ TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { using namespace fir::gpu_schedule; - istringstream is (bfs_str_); + istringstream is (bfs_str_gpu_); fe_->parseStream(is, context_, errors_); fir::high_level_schedule::ProgramScheduleNode::Ptr program = std::make_shared(context_); From 7c5b21c8478dc9293f9e42e8f448e8d35ed9b84f Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 23 Oct 2019 21:33:09 -0400 Subject: [PATCH 48/88] Added support for argv for hybrid threshold and scheduling options for edge blocking --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 1 + include/graphit/frontend/gpu_schedule.h | 45 ++++++++++++++++++- include/graphit/midend/mir.h | 1 + src/backend/codegen_gpu/codegen_gpu.cpp | 20 ++++++++- src/midend/apply_expr_lower.cpp | 1 + src/runtime_lib/gpu_intrinsics.h | 2 + 6 files changed, 66 insertions(+), 4 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index d70aa29c..976fbeca 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -54,6 +54,7 @@ class CodeGenGPU : public mir::MIRVisitor{ MIRContext * mir_context_; private: + void genGlobalDeclarations(); void genIncludeStmts(void); void genEdgeSets(void); void genFuncDecl(mir::FuncDecl::Ptr); diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index 793c246e..3157c962 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -30,7 +30,9 @@ enum gpu_schedule_options { VERTEX_BASED, INPUT_VERTEXSET_SIZE, BITMAP, - BOOLMAP + BOOLMAP, + BLOCKED, + UNBLOCKED }; class 
GPUSchedule { @@ -72,6 +74,11 @@ class SimpleGPUSchedule: public GPUSchedule { STRICT, EDGE_ONLY }; + + enum class edge_blocking_type { + BLOCKED, + UNBLOCKED + }; enum class kernel_fusion_type { FUSION_DISABLED, @@ -85,6 +92,8 @@ class SimpleGPUSchedule: public GPUSchedule { frontier_creation_type frontier_creation; deduplication_type deduplication; load_balancing_type load_balancing; + edge_blocking_type edge_blocking; + uint32_t edge_blocking_size; kernel_fusion_type kernel_fusion; SimpleGPUSchedule () { @@ -93,6 +102,8 @@ class SimpleGPUSchedule: public GPUSchedule { frontier_creation = frontier_creation_type::FRONTIER_FUSED; deduplication = deduplication_type::DEDUP_DISABLED; load_balancing = load_balancing_type::VERTEX_BASED; + edge_blocking = edge_blocking_type::UNBLOCKED; + edge_blocking_size = 0; kernel_fusion = kernel_fusion_type::FUSION_DISABLED; } @@ -153,7 +164,7 @@ class SimpleGPUSchedule: public GPUSchedule { } } - void configLoadBalance(enum gpu_schedule_options o) { + void configLoadBalance(enum gpu_schedule_options o, enum gpu_schedule_options blocking = UNBLOCKED, int32_t blocking_size = 1) { switch(o) { case VERTEX_BASED: load_balancing = load_balancing_type::VERTEX_BASED; @@ -175,6 +186,18 @@ class SimpleGPUSchedule: public GPUSchedule { break; case EDGE_ONLY: load_balancing = load_balancing_type::EDGE_ONLY; + switch (blocking) { + case BLOCKED: + edge_blocking = edge_blocking_type::BLOCKED; + edge_blocking_size = blocking_size; + break; + case UNBLOCKED: + edge_blocking = edge_blocking_type::UNBLOCKED; + break; + default: + assert(false && "Invalid option for configLoadBalance"); + break; + } break; default: assert(false && "Invalid option for configLoadBalance"); @@ -206,6 +229,8 @@ class HybridGPUSchedule: public GPUSchedule { SimpleGPUSchedule s2; float threshold; + int32_t argv_index; + enum class hybrid_criteria { INPUT_VERTEXSET_SIZE }; @@ -225,6 +250,22 @@ class HybridGPUSchedule: public GPUSchedule { s1 = _s1; s2 = _s2; } + HybridGPUSchedule (enum gpu_schedule_options o, const char *t, SimpleGPUSchedule &_s1, SimpleGPUSchedule &_s2) { + switch (o) { + case INPUT_VERTEXSET_SIZE: + _hybrid_criteria = hybrid_criteria::INPUT_VERTEXSET_SIZE; + break; + default: + assert(false && "Invalid option for HybridGPUScheduleCriteria\n"); + break; + } + s1 = _s1; + s2 = _s2; + if (sscanf(t, "argv[%i]", &argv_index) != 1) { + assert(false && "Invalid threshold option\n"); + } + threshold = -100; + } }; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 726e7145..ee37e972 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -1548,6 +1548,7 @@ namespace graphit { StmtBlock::Ptr stmt1; StmtBlock::Ptr stmt2; float threshold; + int32_t argv_index; fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria criteria; std::string input_frontier_name; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 19c2ac4d..64ce889e 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -18,6 +18,8 @@ int CodeGenGPU::genGPU() { genIncludeStmts(); + genGlobalDeclarations(); + // This generates all the declarations of type GraphT<...> genEdgeSets(); @@ -371,7 +373,9 @@ void CodeGenGPU::genIncludeStmts(void) { oss << "#include \"gpu_intrinsics.h\"" << std::endl; oss << "#include " << std::endl; oss << "using namespace cooperative_groups;" << std::endl; +} +void CodeGenGPU::genGlobalDeclarations(void) { } void CodeGenGPU::genEdgeSets(void) { @@ -454,6 
+458,8 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { indent(); if (func_decl->name == "main") { + printIndent(); + oss << "gpu_runtime::register_argv(argc, argv);" << std::endl; for (auto stmt: mir_context_->edgeset_alloc_stmts) { mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); mir::EdgeSetLoadExpr::Ptr edge_set_load_expr = mir::to(assign_stmt->expr); @@ -1231,7 +1237,12 @@ void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { printIndent(); - oss << "if (gpu_runtime::builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; + oss << "if (gpu_runtime::builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * "; + if (stmt->threshold > 0) + oss << stmt->threshold; + else + oss << "gpu_runtime::str_to_float(gpu_runtime::get_argv(" << stmt->argv_index << "))"; + oss << ") {" << std::endl; indent(); stmt->stmt1->accept(this); dedent(); @@ -1249,7 +1260,12 @@ void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { void CodeGenGPUFusedKernel::visit(mir::HybridGPUStmt::Ptr stmt) { if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { printIndent(); - oss << "if (gpu_runtime::device_builtin_getVertexSetSize(" << var_name(stmt->input_frontier_name) << ") < " << var_name(stmt->input_frontier_name) << ".max_num_elems * " << stmt->threshold << ") {" << std::endl; + oss << "if (gpu_runtime::device_builtin_getVertexSetSize(" << var_name(stmt->input_frontier_name) << ") < " << var_name(stmt->input_frontier_name) << ".max_num_elems * "; + if (stmt->threshold > 0) + oss << stmt->threshold; + else + oss << "gpu_runtime::device_str_to_float(gpu_runtime::device_get_argv(" << stmt->argv_index << "))"; + oss << ") {" << std::endl; indent(); stmt->stmt1->accept(this); dedent(); diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 22984dd6..c3631251 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -121,6 +121,7 @@ namespace graphit { hybrid_node->stmt1 = stmt_block_1; hybrid_node->stmt2 = stmt_block_2; hybrid_node->threshold = hybrid_schedule->threshold; + hybrid_node->argv_index = hybrid_schedule->argv_index; hybrid_node->criteria = hybrid_schedule->_hybrid_criteria; if (hybrid_node->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE && edgeset_apply->from_func != "") { hybrid_node->input_frontier_name = edgeset_apply->from_func; diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 09377d80..f683ade7 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -27,5 +27,7 @@ static __device__ void device_deleteObject(T &t) { static void * no_args[1]; +void register_argv(int32_t argc, char* argv[]) { +} } #endif From ff9e4ba2137d4b285bc098c338e14c6280b06e40 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Thu, 24 Oct 2019 16:22:09 -0400 Subject: [PATCH 49/88] refactoring to current_priority_ , delta, and window_upper_ in the priority_queue_ data structure --- .../infra_gpu/gpu_priority_queue.h | 1 + .../test_input/sssp_delta_stepping.cu | 81 ++++++++++--------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git 
a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 81e6eb06..011b202f 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -38,6 +38,7 @@ namespace gpu_runtime { PriorityT_ delta_ = 1; PriorityT_ current_priority_ = 0; + PriorityT_ window_upper_ = 0; }; } diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index ef79860c..2d78c2eb 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -35,8 +35,8 @@ int32_t __device__ *SP; int32_t *__host_SP; int32_t *__device_SP; -int32_t __device__ window_lower; -int32_t __device__ window_upper; +//int32_t __device__ window_lower; +//int32_t __device__ window_upper; #define VIRTUAL_WARP_SIZE (32) @@ -70,7 +70,8 @@ bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; - if (SP[dst] >= window_upper) return false; + //if (SP[dst] >= window_upper) return false; + if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; return output2; } @@ -95,13 +96,15 @@ void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, al for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { - if (SP[node_id] >= device_state.window_upper && SP[node_id] != INT_MAX && SP[node_id] < my_minimum) { + if (SP[node_id] >= (device_gpq.window_upper_) && SP[node_id] != INT_MAX && SP[node_id] < my_minimum) { my_minimum = SP[node_id]; } } } - if (my_minimum < device_state.new_window_start[0]) { - atomicMin(device_state.new_window_start, my_minimum); + //if (my_minimum < device_state.new_window_start[0]) { + if (my_minimum < device_gpq.current_priority_){ + //atomicMin(device_state.new_window_start, my_minimum); + atomicMin(&(device_gpq.current_priority_), my_minimum); } } void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state, gpu_runtime::VertexFrontier output_frontier) { @@ -114,7 +117,8 @@ void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_st for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { - if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { + //if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { + if(SP[node_id] >= device_gpq.current_priority_ && SP[node_id] < (device_gpq.current_priority_ + device_gpq.delta_)) { gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); } } @@ -153,15 +157,8 @@ int main(int argc, char *argv[]) { __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; - //cudaMemcpyToSymbol(gpq, &host_gpq, sizeof(host_gpq), 0); - algo_state host_state, device_state; allocate_state(host_state, device_state, graph); - - - - - cudaDeviceSynchronize(); @@ -174,13 +171,15 @@ int main(int argc, char *argv[]) { gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); startTimer(); - host_state.window_lower = 0; - host_state.window_upper = delta; - device_state.window_lower = 0; - device_state.window_upper = delta; + host_gpq.delta_ = delta; + host_gpq.current_priority_ = 0 ; + + cudaMemcpyToSymbol(device_gpq, &host_gpq, 
sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); init_kernel<<>>(graph, device_state, start_vertex); gpu_runtime::cudaCheckLastError(); + //std::cout << "test2" << std::endl; int iters = 0; cudaDeviceSynchronize(); @@ -192,37 +191,39 @@ int main(int argc, char *argv[]) { startTimer(); iters++; gpu_runtime::vertex_set_prepare_sparse(frontier); - cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); + //cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); + //Might not be necessary, always synchronized at this point?? + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); + //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); gpu_runtime::TWCE_load_balance_host(graph, frontier, frontier); - + gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_bytemaps(frontier); // set the input to the prepare function frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; if (gpu_runtime::builtin_getVertexSetSize(frontier) == (0)) { - host_state.new_window_start[0] = INT_MAX; - cudaMemcpy(device_state.new_window_start, host_state.new_window_start, sizeof(int32_t), cudaMemcpyHostToDevice); - - //should not need to change - update_nodes_identify_min<<>>(graph, device_state); - cudaMemcpy(host_state.new_window_start, device_state.new_window_start, sizeof(int32_t), cudaMemcpyDeviceToHost); - - //this is for termination when it is all finished - if (host_state.new_window_start[0] == INT_MAX) { - break; - } - - //if it is not a pointer, then you can set by value directly - device_state.window_lower = host_state.new_window_start[0]; - device_state.window_upper = host_state.new_window_start[0] + delta; - - update_nodes_special<<>>( graph, device_state, frontier); - gpu_runtime::swap_queues(frontier); - frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; - + //host_state.new_window_start[0] = INT_MAX; + host_gpq.window_upper_ = host_gpq.current_priority_ + host_gpq.delta_; + host_gpq.current_priority_ = INT_MAX; + + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + update_nodes_identify_min<<>>(graph, device_state); + gpu_runtime::cudaCheckLastError(); + cudaMemcpyFromSymbol(&host_gpq, device_gpq, sizeof(host_gpq), 0,cudaMemcpyDeviceToHost); + gpu_runtime::cudaCheckLastError(); + + if(host_gpq.current_priority_ == INT_MAX){ + break; + } + update_nodes_special<<>>( graph, device_state, frontier); + gpu_runtime::cudaCheckLastError(); + gpu_runtime::swap_queues(frontier); + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; } cudaDeviceSynchronize(); From 5bc43cb452618363c4d1dfff7d84978d1439ca78 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 24 Oct 2019 18:09:54 -0400 Subject: [PATCH 50/88] Fixed the argv issue not working properly on device --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 4 +- include/graphit/midend/mir.h | 1 + include/graphit/midend/mir_context.h | 1 + src/backend/codegen_gpu/codegen_gpu.cpp | 38 ++++++++++++++----- src/midend/apply_expr_lower.cpp | 1 + src/runtime_lib/gpu_intrinsics.h | 6 ++- src/runtime_lib/infra_gpu/load_balance.h | 2 +- 7 files changed, 40 insertions(+), 13 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 976fbeca..c3fea7b8 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -54,9 +54,10 @@ class 
CodeGenGPU : public mir::MIRVisitor{ MIRContext * mir_context_; private: - void genGlobalDeclarations(); + void genGlobalDeclarations(void); void genIncludeStmts(void); void genEdgeSets(void); + void genHybridThresholds(void); void genFuncDecl(mir::FuncDecl::Ptr); @@ -163,6 +164,7 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { virtual void visit(mir::VarDecl::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; virtual void visit(mir::HybridGPUStmt::Ptr) override; + virtual void visit(mir::VertexSetDedupExpr::Ptr) override; std::string var_name (std::string var) { //return current_kernel_name + "_" + var; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index ee37e972..3aeb948d 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -1549,6 +1549,7 @@ namespace graphit { StmtBlock::Ptr stmt2; float threshold; int32_t argv_index; + std::string threshold_var_name; fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria criteria; std::string input_frontier_name; diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 7d80ccad..398efb62 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -452,6 +452,7 @@ namespace graphit { // Used by kernel fusion optimization std::vector fused_while_loops; + std::vector hybrid_gpu_stmts; }; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 64ce889e..321a27d6 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -376,6 +376,12 @@ void CodeGenGPU::genIncludeStmts(void) { } void CodeGenGPU::genGlobalDeclarations(void) { + for (auto stmt: mir_context_->hybrid_gpu_stmts) { + std::string threshold_var_name = "hybrid_threshold_var" + mir_context_->getUniqueNameCounterString(); + oss << "float " << threshold_var_name << ";" << std::endl; + oss << "float __device__ __device_" << threshold_var_name << ";" << std::endl; + stmt->threshold_var_name = threshold_var_name; + } } void CodeGenGPU::genEdgeSets(void) { @@ -430,6 +436,20 @@ void CodeGenGPU::visit(mir::ScalarType::Ptr scalar_type) { } } +void CodeGenGPU::genHybridThresholds(void) { + for (auto stmt: mir_context_->hybrid_gpu_stmts) { + std::string var_name = stmt->threshold_var_name; + if (stmt->threshold < 0) { + printIndent(); + oss << stmt->threshold_var_name << " = gpu_runtime::str_to_float(argv[" << stmt->argv_index << "])" << std::endl; + } else { + printIndent(); + oss << stmt->threshold_var_name << " = " << stmt->threshold << std::endl; + } + printIndent(); + oss << "cudaMemcpyToSymbol(__device_" << stmt->threshold_var_name << ", &" << stmt->threshold_var_name << ", sizeof(float), 0);" << std::endl; + } +} void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { if (func_decl->type == mir::FuncDecl::Type::EXTERNAL) { assert(false && "GPU backend currently doesn't support external functions\n"); @@ -458,8 +478,7 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { indent(); if (func_decl->name == "main") { - printIndent(); - oss << "gpu_runtime::register_argv(argc, argv);" << std::endl; + genHybridThresholds(); for (auto stmt: mir_context_->edgeset_alloc_stmts) { mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); mir::EdgeSetLoadExpr::Ptr edge_set_load_expr = mir::to(assign_stmt->expr); @@ -993,6 +1012,11 @@ void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { vsde->target->accept(this); oss << ")"; } +void 
CodeGenGPUFusedKernel::visit(mir::VertexSetDedupExpr::Ptr vsde) { + oss << "gpu_runtime::device_dedup_frontier("; + vsde->target->accept(this); + oss << ")"; +} void CodeGenGPU::visit(mir::BoolLiteral::Ptr bool_literal) { oss << bool_literal->val?"true":"false"; } @@ -1238,10 +1262,7 @@ void CodeGenGPU::visit(mir::HybridGPUStmt::Ptr stmt) { if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { printIndent(); oss << "if (gpu_runtime::builtin_getVertexSetSize(" << stmt->input_frontier_name << ") < " << stmt->input_frontier_name << ".max_num_elems * "; - if (stmt->threshold > 0) - oss << stmt->threshold; - else - oss << "gpu_runtime::str_to_float(gpu_runtime::get_argv(" << stmt->argv_index << "))"; + oss << stmt->threshold_var_name; oss << ") {" << std::endl; indent(); stmt->stmt1->accept(this); @@ -1261,10 +1282,7 @@ void CodeGenGPUFusedKernel::visit(mir::HybridGPUStmt::Ptr stmt) { if (stmt->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE) { printIndent(); oss << "if (gpu_runtime::device_builtin_getVertexSetSize(" << var_name(stmt->input_frontier_name) << ") < " << var_name(stmt->input_frontier_name) << ".max_num_elems * "; - if (stmt->threshold > 0) - oss << stmt->threshold; - else - oss << "gpu_runtime::device_str_to_float(gpu_runtime::device_get_argv(" << stmt->argv_index << "))"; + oss << "__device_" << stmt->threshold_var_name; oss << ") {" << std::endl; indent(); stmt->stmt1->accept(this); diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index c3631251..308e0a8e 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -130,6 +130,7 @@ namespace graphit { } node = hybrid_node; + mir_context_->hybrid_gpu_stmts.push_back(hybrid_node); if (assign_stmt->stmt_label != "") { label_scope_.unscope(); } diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index f683ade7..ae16c9aa 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -27,7 +27,11 @@ static __device__ void device_deleteObject(T &t) { static void * no_args[1]; -void register_argv(int32_t argc, char* argv[]) { +float str_to_float(const char* str) { + float val; + if (sscanf(str, "%f", &val) != 1) + return 0.0; + return val; } } #endif diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 80f41150..915fdf42 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -48,7 +48,7 @@ void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t & } template void __device__ vertex_based_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { - int32_t num_threads = AccessorType::getSizeDevice(frontier); + int32_t num_threads = AccessorType::getSize(frontier); num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; cta_size = CTA_SIZE; } From e26f331697fb61095c6f3140cc5c122acef49e44 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Thu, 24 Oct 2019 19:48:22 -0400 Subject: [PATCH 51/88] refactoring out the terminate condition with finished --- src/runtime_lib/infra_gpu/gpu_priority_queue.h | 9 ++++----- test/gpu_tests/test_input/sssp_delta_stepping.cu | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 011b202f..6cc789d9 100644 --- 
a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -12,21 +12,20 @@ namespace gpu_runtime { public: - size_t get_current_priority(){ + size_t getCurrentPriority(){ return current_priority_; } - void update_current_priority(PriorityT_ priority_change_){ + void updatePriorityMin(PriorityT_ priority_change_){ } bool finished() { - //TODO - return true; + return current_priority_ == INT_MAX; } bool host_finishedNode(NodeID v){ - return host_priorities_[v]/delta_ < get_current_priority();; + return host_priorities_[v]/delta_ < current_priority_; } bool __device__ device_finishedNode(NodeID v){ diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 2d78c2eb..dfcb64a5 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -179,7 +179,6 @@ int main(int argc, char *argv[]) { init_kernel<<>>(graph, device_state, start_vertex); gpu_runtime::cudaCheckLastError(); - //std::cout << "test2" << std::endl; int iters = 0; cudaDeviceSynchronize(); @@ -187,7 +186,8 @@ int main(int argc, char *argv[]) { //printf("Init time = %f\n", t); iter_total+=t; - while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ + //while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ + while(! host_gpq.finished()){ startTimer(); iters++; gpu_runtime::vertex_set_prepare_sparse(frontier); @@ -217,9 +217,9 @@ int main(int argc, char *argv[]) { cudaMemcpyFromSymbol(&host_gpq, device_gpq, sizeof(host_gpq), 0,cudaMemcpyDeviceToHost); gpu_runtime::cudaCheckLastError(); - if(host_gpq.current_priority_ == INT_MAX){ - break; - } + //if(host_gpq.current_priority_ == INT_MAX){ + // break; + //} update_nodes_special<<>>( graph, device_state, frontier); gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_queues(frontier); From bd3ef98f8bede06895d06549a9da05970a6a8052 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 24 Oct 2019 21:19:22 -0400 Subject: [PATCH 52/88] Merged TWC, CM and WM with test cases for all load balance schemes --- include/graphit/midend/mir.h | 2 + src/backend/codegen_gpu/codegen_gpu.cpp | 16 +- src/runtime_lib/infra_gpu/graph.h | 15 + src/runtime_lib/infra_gpu/load_balance.h | 425 ++++++++++++++++++ src/runtime_lib/infra_gpu/vertex_frontier.h | 1 + test/gpu_tests/all_gpu_tests.py | 26 +- test/gpu_tests/test_input/inputs/sssp.gt | 40 ++ .../test_input/schedules/sssp_CM_schedule.gt | 4 + .../schedules/sssp_TWCE_schedule.gt | 4 + .../test_input/schedules/sssp_TWC_schedule.gt | 4 + .../test_input/schedules/sssp_WM_schedule.gt | 4 + .../schedules/sssp_default_schedule.gt | 3 + 12 files changed, 539 insertions(+), 5 deletions(-) create mode 100644 test/gpu_tests/test_input/inputs/sssp.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_default_schedule.gt diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 3aeb948d..be08d86a 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -949,6 +949,7 @@ namespace graphit { applied_schedule = edgeset_apply->applied_schedule; frontier_reusable = edgeset_apply->frontier_reusable; + requires_output = 
edgeset_apply->requires_output; } virtual void accept(MIRVisitor *visitor) { @@ -977,6 +978,7 @@ namespace graphit { enable_deduplication = edgeset_apply->enable_deduplication; applied_schedule = edgeset_apply->applied_schedule; frontier_reusable = edgeset_apply->frontier_reusable; + requires_output = edgeset_apply->requires_output; } virtual void accept(MIRVisitor *visitor) { diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 321a27d6..63e6d658 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -441,7 +441,7 @@ void CodeGenGPU::genHybridThresholds(void) { std::string var_name = stmt->threshold_var_name; if (stmt->threshold < 0) { printIndent(); - oss << stmt->threshold_var_name << " = gpu_runtime::str_to_float(argv[" << stmt->argv_index << "])" << std::endl; + oss << stmt->threshold_var_name << " = gpu_runtime::str_to_float(argv[" << stmt->argv_index << "]);" << std::endl; } else { printIndent(); oss << stmt->threshold_var_name << " = " << stmt->threshold << std::endl; @@ -581,6 +581,12 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: load_balance_function = "gpu_runtime::TWCE_load_balance"; } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { load_balance_function = "gpu_runtime::edge_only_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWC) { + load_balance_function = "gpu_runtime::TWC_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::CM) { + load_balance_function = "gpu_runtime::CM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { + load_balance_function = "gpu_runtime::WM_load_balance"; } if (mir::isa(esae)) { @@ -701,6 +707,12 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, load_balance_function = "gpu_runtime::TWCE_load_balance"; } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY) { load_balance_function = "gpu_runtime::edge_only_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWC) { + load_balance_function = "gpu_runtime::TWC_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::CM) { + load_balance_function = "gpu_runtime::CM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { + load_balance_function = "gpu_runtime::WM_load_balance"; } if (mir::isa(esae)) { printIndent(); @@ -1013,7 +1025,7 @@ void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { oss << ")"; } void CodeGenGPUFusedKernel::visit(mir::VertexSetDedupExpr::Ptr vsde) { - oss << "gpu_runtime::device_dedup_frontier("; + oss << "gpu_runtime::dedup_frontier_device("; vsde->target->accept(this); oss << ")"; } diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 63ca6a73..f60ed836 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -39,6 +39,16 @@ struct GraphT { // Field names are according to CSR, reuse for CSC full_frontier.max_num_elems = num_vertices; return 
full_frontier; } + + + // Load balance scratch pads + // TWC bins + int32_t *twc_small_bin; + int32_t *twc_mid_bin; + int32_t *twc_large_bin; + + int32_t *twc_bin_sizes; + }; void consume(int32_t _) { } @@ -135,6 +145,11 @@ static void load_graph(GraphT &graph, std::string filename, bool cudaMemcpy(graph.d_src_offsets, graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); //std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; + cudaMalloc(&graph.twc_small_bin, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.twc_mid_bin, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.twc_large_bin, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.twc_bin_sizes, 3 * sizeof(int32_t)); + } template static int32_t builtin_getVertices(GraphT &graph) { diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 915fdf42..c6d9b56d 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -259,6 +259,431 @@ void __device__ TWCE_load_balance_device(GraphT &graph, VertexFr } this_grid().sync(); } + +// CM load balance functions +int32_t __device__ binary_search_upperbound(int32_t *array, int32_t len, int32_t key){ + int32_t s = 0; + while(len>0){ + int32_t half = len>>1; + int32_t mid = s + half; + if(array[mid] > key){ + len = half; + }else{ + s = mid+1; + len = len-half-1; + } + } + return s; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ CM_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + int32_t lane = (threadIdx.x & 31); + int32_t offset = 0; + + // prefix sum + int32_t cosize = blockDim.x; + int32_t tot_deg; + int32_t phase = threadIdx.x; + int32_t off=32; + + for(int32_t d=2; d<=32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d - 1) deg += temp; + } + sm_deg[threadIdx.x] = deg; + + for(int32_t d=cosize>>(1+5); d>0; d>>=1){ + __syncthreads(); + if(phase>5); d<<=1){ + off >>=1; + __syncthreads(); + if(phase1; d>>=1) { + int32_t temp_big = __shfl_down_sync((uint32_t)-1, deg, d/2); + int32_t temp_small = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d/2 - 1) deg = temp_big; + else if(lane % d == d - 1) deg += temp_small; + } + sm_deg[threadIdx.x] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - threadIdx.x + blockDim.x; + if(tot_size < width) width = tot_size; + width -= thread_id - threadIdx.x; + + for(int32_t i=threadIdx.x; i= width) continue; + src_idx = sm_idx[offset + id]; + if (src_filter(src_idx) == false) + continue; + int32_t ei = sm_loc[offset + id] + i - sm_deg[offset + id]; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } +} +template +void __host__ 
CM_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template +void __device__ CM_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ CM_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + CM_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ CM_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + CM_load_balance_info(input_frontier, num_cta, cta_size); + CM_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ CM_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + CM_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + CM_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + + +// WM load balance functions +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ WM_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + // prefix sum + int32_t lane = (threadIdx.x&31); + int32_t offset = threadIdx.x - lane; + for(int32_t d=1; d<32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d); + if (lane >= d) deg += temp; + } + int32_t tot_deg = __shfl_sync((uint32_t)-1, deg, 31); + if(lane == 31) deg = 0; + sm_deg[offset + ((lane+1)&31)] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - lane + 32; + if(tot_size < width) width = tot_size; + width -= thread_id - lane; + + for(int32_t i=lane; i +void __host__ WM_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} +template +void __device__ WM_load_balance_info_device(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSize(frontier); + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + cta_size = CTA_SIZE; +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ 
WM_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + WM_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ WM_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + WM_load_balance_info(input_frontier, num_cta, cta_size); + WM_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ WM_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta, cta_size; + WM_load_balance_info_device(input_frontier, num_cta, cta_size); + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + WM_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); +} + +//TWCE load balance functions +#define MID_BIN (32) +#define LARGE_BIN (CTA_SIZE) + +template +void __device__ TWC_split_frontier (GraphT &graph, VertexFrontier &input_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + int32_t idx, deg; + if(thread_id < tot_size) { + idx = AccessorType::getElement(input_frontier, thread_id); + deg = graph.d_get_degree(idx); + if(deg < MID_BIN) { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[0]); + graph.twc_small_bin[k] = idx; + } else if(deg < LARGE_BIN) { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[1]); + graph.twc_mid_bin[k] = idx; + } else { + int32_t k = atomicAggInc(&graph.twc_bin_sizes[2]); + graph.twc_large_bin[k] = idx; + } + } +} +template +void __global__ TWC_split_frontier_kernel (GraphT graph, VertexFrontier input_frontier) { + TWC_split_frontier (graph, input_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_small_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[CTA_SIZE], sm_deg[CTA_SIZE], sm_loc[CTA_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = graph.twc_bin_sizes[0]; + + int32_t deg, index, src_idx; + if(thread_id < tot_size) { + index = graph.twc_small_bin[thread_id]; + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + sm_deg[threadIdx.x] = deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + deg = 0; + sm_deg[threadIdx.x] = deg; + } + + // prefix sum + int32_t lane = (threadIdx.x&31); + int32_t offset = threadIdx.x - lane; + for(int32_t d=1; d<32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d); + if (lane >= d) deg += temp; + } + int32_t tot_deg = __shfl_sync((uint32_t)-1, deg, 31); + if(lane == 31) deg = 0; + sm_deg[offset + ((lane+1)&31)] = deg; + __syncthreads(); + + // compute + int32_t width = thread_id - lane + 32; + if(tot_size < width) width = tot_size; + width -= thread_id - lane; + + for(int32_t i=lane; i load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_small_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_small_bin(graph, input_frontier, output_frontier, blockIdx.x, 
gridDim.x); + +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_mid_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = (threadIdx.x + blockDim.x * cta_id)/MID_BIN; + int32_t tot_size = graph.twc_bin_sizes[1]; + + if (vid >= tot_size) + return; + + int32_t src = graph.twc_mid_bin[vid]; + for (int32_t eid = graph.d_src_offsets[src]+(threadIdx.x%MID_BIN); eid < graph.d_src_offsets[src+1]; eid+=MID_BIN) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_mid_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_mid_bin(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_large_bin (GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = (threadIdx.x + blockDim.x * cta_id)/LARGE_BIN; + int32_t tot_size = graph.twc_bin_sizes[2]; + if (vid >= tot_size) + return; + int32_t src = graph.twc_large_bin[vid]; + for (int32_t eid = graph.d_src_offsets[src]+(threadIdx.x%LARGE_BIN); eid < graph.d_src_offsets[src+1]; eid+=LARGE_BIN) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ TWC_large_bin_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + TWC_large_bin(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ TWC_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + cudaMemset(graph.twc_bin_sizes, 0, sizeof(int32_t) * 3); + int num_threads = AccessorType::getSizeHost(input_frontier); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + int cta_size = CTA_SIZE; + TWC_split_frontier_kernel<<>>(graph, input_frontier); + int32_t twc_bin_sizes[3]; + cudaMemcpy(twc_bin_sizes, graph.twc_bin_sizes, 3 * sizeof(int32_t), cudaMemcpyDeviceToHost); + num_threads = twc_bin_sizes[0]; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + TWC_small_bin_kernel<<>>(graph, input_frontier, output_frontier); + num_threads = twc_bin_sizes[1] * MID_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + TWC_mid_bin_kernel<<>>(graph, input_frontier, output_frontier); + num_threads = twc_bin_sizes[2] * LARGE_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + TWC_large_bin_kernel<<>>(graph, input_frontier, output_frontier); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ TWC_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t thread_id = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_id < 3) { + graph.twc_bin_sizes[thread_id] = 0; + } + this_grid().sync(); + + int num_threads = AccessorType::getSize(input_frontier); + int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + int cta_size = CTA_SIZE; + + for 
(int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_split_frontier(graph, input_frontier, cta_id, num_cta); + __syncthreads(); + } + + this_grid().sync(); + + num_threads = graph.twc_bin_sizes[0]; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_small_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + num_threads = graph.twc_bin_sizes[1] * MID_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_mid_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + num_threads = graph.twc_bin_sizes[2] * LARGE_BIN; + num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + TWC_large_bin(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + + this_grid().sync(); +} + + } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 47ac9694..ada24e67 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -31,6 +31,7 @@ struct VertexFrontier { }; format_ready_type format_ready; + }; static VertexFrontier sentinel_frontier; diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 6b996c26..c94eeb22 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -37,10 +37,10 @@ def sssp_verified_test(self, input_file_name, use_delta=False): self.cpp_compile_test(input_file_name, []) if use_delta: #start point 0, delta 10, verified - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > verifier_input ") + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > " + self.verifier_input) else: - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > verifier_input ") - output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t verifier_input -r 0") + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > " + self.verifier_input) + output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t " + self.verifier_input + " -r 0") test_flag = False for line in output.rstrip().split("\n"): if line.rstrip().find("SUCCESSFUL") != -1: @@ -82,6 +82,7 @@ def setUpClass(cls): cls.cuda_filename = cls.scratch_directory + "/test_cpp.cu" cls.graphitc_py = GRAPHIT_BUILD_DIRECTORY + "/bin/graphitc.py" + cls.verifier_input = cls.scratch_directory + "/verifier_input" def cpp_compile_test(self, input_file_name, extra_cpp_args=[]): if input_file_name[0] == "/": @@ -141,6 +142,25 @@ def test_simple_graphit_exec(self): self.assertEqual(len(output), 2) self.assertEqual(output[0], "14") + def test_simple_graphit_sssp_basic_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_default_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWCE_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWCE_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWC_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", 
"schedules/sssp_TWC_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_CM_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_CM_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_WM_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_WM_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) if __name__ == '__main__': unittest.main() diff --git a/test/gpu_tests/test_input/inputs/sssp.gt b/test/gpu_tests/test_input/inputs/sssp.gt new file mode 100644 index 00000000..f3ee02e4 --- /dev/null +++ b/test/gpu_tests/test_input/inputs/sssp.gt @@ -0,0 +1,40 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const SP : vector{Vertex}(int) = 2147483647; %should be INT_MAX + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + SP[dst] min= (SP[src] + weight); +end + +func reset(v: Vertex) + SP[v] = 2147483647; +end + +func main() + vertices.apply(reset); + var n : int = edges.getVertices(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); %add source vertex + SP[start_vertex] = 0; + var rounds : int = 0; + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge, SP); + delete frontier; + frontier = output; + rounds = rounds + 1; + if rounds == n + break; + end + end + delete frontier; + for vid in 0:n + print SP[vid]; + end +end + + + diff --git a/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt new file mode 100644 index 00000000..c21c26da --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt new file mode 100644 index 00000000..e2c72966 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt new file mode 100644 index 00000000..16fb8b35 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt new file mode 100644 index 00000000..7b76a0fd --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(WM); + program->applyGPUSchedule("s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt new file mode 100644 index 00000000..ae99ff2b --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt @@ -0,0 +1,3 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); From 
bc91bb982b19cf598b52e57031503fa1e5fd05e3 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Thu, 24 Oct 2019 21:30:59 -0400 Subject: [PATCH 53/88] refactoring the code to make frontier a member of priority queue and setting up an init function for priority queue --- .../infra_gpu/gpu_priority_queue.h | 22 +++++++++++++ src/runtime_lib/infra_gpu/vertex_frontier.h | 6 +++- .../test_input/sssp_delta_stepping.cu | 32 +++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 6cc789d9..1349109c 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -5,6 +5,7 @@ #include #include "vertex_frontier.h" + namespace gpu_runtime { template @@ -15,6 +16,18 @@ namespace gpu_runtime { size_t getCurrentPriority(){ return current_priority_; } + + void init(PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ + host_priorities_ = host_priorities; + device_priorities_ = device_priorities; + current_priority_ = initial_priority; + delta_ = delta; + if (initial_node != -1){ + //if (frontier_ != {0}){ + gpu_runtime::builtin_addVertex(frontier_, initial_node); + //} + } + } void updatePriorityMin(PriorityT_ priority_change_){ @@ -30,6 +43,11 @@ namespace gpu_runtime { bool __device__ device_finishedNode(NodeID v){ + } + + gpu_runtime::VertexFrontier dequeueReadySet(){ + + } PriorityT_* host_priorities_ = nullptr; @@ -38,6 +56,10 @@ namespace gpu_runtime { PriorityT_ delta_ = 1; PriorityT_ current_priority_ = 0; PriorityT_ window_upper_ = 0; + + //Need to do = {0} to avoid dynamic initialization error + VertexFrontier frontier_ = {0}; + }; } diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index 47ac9694..d7d11e76 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -5,7 +5,10 @@ #include using namespace cooperative_groups; namespace gpu_runtime { -struct VertexFrontier { +class VertexFrontier { + + public: + int32_t max_num_elems; int32_t *d_num_elems_input; @@ -31,6 +34,7 @@ struct VertexFrontier { }; format_ready_type format_ready; + }; static VertexFrontier sentinel_frontier; diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index dfcb64a5..543c268a 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -5,7 +5,7 @@ #define USE_DEDUP 0 #define SORT_NODES 0 #include -#include +#include 2 #include //#define DEBUG @@ -19,6 +19,7 @@ gpu_runtime::GPUPriorityQueue host_gpq; gpu_runtime::GPUPriorityQueue __device__ device_gpq; + typedef struct { int32_t *SP; int32_t *output_size; @@ -166,13 +167,18 @@ int main(int argc, char *argv[]) { for (int outer = 0; outer < ITER_COUNT; outer++) { float iter_total = 0; //this sets it to Sparse - gpu_runtime::VertexFrontier frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - gpu_runtime::builtin_addVertex(frontier, start_vertex); + host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + + //frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + + //gpu_runtime::builtin_addVertex(host_gpq.frontier_, start_vertex); 
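// Note on this refactor: the frontier now lives inside the GPU priority queue as
// host_gpq.frontier_, so the start vertex is no longer added here. It is seeded by
// host_gpq.init(__host_SP, __device_SP, 0, delta, start_vertex) a few lines below,
// which also sets delta_ and current_priority_ and calls builtin_addVertex on
// frontier_ whenever a start vertex (!= -1) is supplied.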
gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); startTimer(); - host_gpq.delta_ = delta; - host_gpq.current_priority_ = 0 ; + //host_gpq.delta_ = delta; + //host_gpq.current_priority_ = 0 ; + + host_gpq.init(__host_SP, __device_SP, 0, delta, start_vertex); cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); @@ -190,21 +196,21 @@ int main(int argc, char *argv[]) { while(! host_gpq.finished()){ startTimer(); iters++; - gpu_runtime::vertex_set_prepare_sparse(frontier); + gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); //cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); //Might not be necessary, always synchronized at this point?? cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); - gpu_runtime::TWCE_load_balance_host(graph, frontier, frontier); + gpu_runtime::TWCE_load_balance_host(graph, host_gpq.frontier_, host_gpq.frontier_); gpu_runtime::cudaCheckLastError(); - gpu_runtime::swap_bytemaps(frontier); + gpu_runtime::swap_bytemaps(host_gpq.frontier_); // set the input to the prepare function - frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; - if (gpu_runtime::builtin_getVertexSetSize(frontier) == (0)) { + if (gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) == (0)) { //host_state.new_window_start[0] = INT_MAX; host_gpq.window_upper_ = host_gpq.current_priority_ + host_gpq.delta_; host_gpq.current_priority_ = INT_MAX; @@ -220,10 +226,10 @@ int main(int argc, char *argv[]) { //if(host_gpq.current_priority_ == INT_MAX){ // break; //} - update_nodes_special<<>>( graph, device_state, frontier); + update_nodes_special<<>>( graph, device_state, host_gpq.frontier_); gpu_runtime::cudaCheckLastError(); - gpu_runtime::swap_queues(frontier); - frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + gpu_runtime::swap_queues(host_gpq.frontier_); + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; } cudaDeviceSynchronize(); From 48f0877bfd490c234e10222c339ad0e1eb908993 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Thu, 24 Oct 2019 22:00:15 -0400 Subject: [PATCH 54/88] replace the priorities accessed in update_nodes_identify_min and update_nodes_special to use the priorities inside GPUPriorityQueue --- src/runtime_lib/infra_gpu/gpu_priority_queue.h | 4 +++- test/gpu_tests/test_input/sssp_delta_stepping.cu | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 1349109c..b193b6aa 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -45,7 +45,9 @@ namespace gpu_runtime { } - gpu_runtime::VertexFrontier dequeueReadySet(){ + + + gpu_runtime::VertexFrontier __device__ dequeueReadySet(){ } diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 543c268a..89b343f8 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -97,8 +97,8 @@ void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, al for (int i = 0; i < work_per_thread; i++) { int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { - if 
(SP[node_id] >= (device_gpq.window_upper_) && SP[node_id] != INT_MAX && SP[node_id] < my_minimum) { - my_minimum = SP[node_id]; + if (device_gpq.device_priorities_[node_id] >= (device_gpq.window_upper_) && device_gpq.device_priorities_[node_id] != INT_MAX && device_gpq.device_priorities_[node_id] < my_minimum) { + my_minimum = device_gpq.device_priorities_[node_id]; } } } @@ -119,7 +119,7 @@ void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_st int32_t node_id = thread_id + i * num_threads; if (node_id < graph.num_vertices) { //if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { - if(SP[node_id] >= device_gpq.current_priority_ && SP[node_id] < (device_gpq.current_priority_ + device_gpq.delta_)) { + if(device_gpq.device_priorities_[node_id] >= device_gpq.current_priority_ && SP[node_id] < (device_gpq.current_priority_ + device_gpq.delta_)) { gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); } } From c9826666bb5a8581660c46cda72e7e6c25e37157 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 25 Oct 2019 01:24:07 -0400 Subject: [PATCH 55/88] Added strict load balance and test case for strict with test cases for all load balance schemes with kernel fusion --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 3 + src/backend/codegen_gpu/codegen_gpu.cpp | 4 +- src/runtime_lib/infra_gpu/graph.h | 5 + src/runtime_lib/infra_gpu/load_balance.h | 297 +++++++++++++++++- test/gpu_tests/all_gpu_tests.py | 32 +- test/gpu_tests/test_input/inputs/sssp.gt | 2 +- .../sssp_CM_kernel_fusion_schedule.gt | 8 + .../test_input/schedules/sssp_CM_schedule.gt | 2 +- .../sssp_TWCE_kernel_fusion_schedule.gt | 8 + .../schedules/sssp_TWCE_schedule.gt | 2 +- .../sssp_TWC_kernel_fusion_schedule.gt | 8 + .../test_input/schedules/sssp_TWC_schedule.gt | 2 +- .../sssp_WM_kernel_fusion_schedule.gt | 8 + .../test_input/schedules/sssp_WM_schedule.gt | 2 +- .../schedules/sssp_default_schedule.gt | 2 +- .../sssp_strict_kernel_fusion_schedule.gt | 8 + .../schedules/sssp_strict_schedule.gt | 4 + ...ssp_vertex_based_kernel_fusion_schedule.gt | 7 + 18 files changed, 394 insertions(+), 10 deletions(-) create mode 100644 test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt create mode 100644 test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index c3fea7b8..5b2c0412 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -177,6 +177,9 @@ class KernelVariableExtractor: public mir::MIRVisitor { using mir::MIRVisitor::visit; std::vector hoisted_vars; std::vector hoisted_decls; + MIRContext *mir_context_; + KernelVariableExtractor(MIRContext* mir_context): mir_context_(mir_context) { + } void insertVar(mir::Var var_to_insert) { for (auto var: hoisted_vars) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp 
index 63e6d658..1eecb206 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -121,6 +121,8 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } void KernelVariableExtractor::visit(mir::VarExpr::Ptr var_expr) { + if (mir_context_->isLoweredConst(var_expr->var.getName())) + return; insertVar(var_expr->var); } void KernelVariableExtractor::visit(mir::VarDecl::Ptr var_decl) { @@ -133,7 +135,7 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { // Now we extract the list of variables that are used in the kernel that are not const // So we can hoist them - KernelVariableExtractor extractor; + KernelVariableExtractor extractor(mir_context_); while_stmt->accept(&extractor); while_stmt->hoisted_vars = extractor.hoisted_vars; diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index f60ed836..8dab7060 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -48,6 +48,11 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t *twc_large_bin; int32_t *twc_bin_sizes; + + // strict frontiers + int32_t *strict_sum; + int32_t *strict_cta_sum; + int32_t *strict_grid_sum; }; void consume(int32_t _) { diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index c6d9b56d..1140c2c8 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -647,7 +647,6 @@ void __device__ TWC_load_balance_device(GraphT &graph, VertexFro int num_threads = AccessorType::getSize(input_frontier); int num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - int cta_size = CTA_SIZE; for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { TWC_split_frontier(graph, input_frontier, cta_id, num_cta); @@ -683,7 +682,303 @@ void __device__ TWC_load_balance_device(GraphT &graph, VertexFro this_grid().sync(); } +// STRICT LOAD BALANCE FUNCTIONS + +#define NNZ_PER_BLOCK (CTA_SIZE) +#define STRICT_SM_SIZE (CTA_SIZE) +#define PREFIX_BLK (CTA_SIZE) +template +void __device__ strict_gather(GraphT &graph, VertexFrontier &frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(frontier); + int32_t idx, deg; + if(thread_id < tot_size) { + idx = AccessorType::getElement(frontier, thread_id); + graph.strict_sum[thread_id] = graph.d_get_degree(idx); + } } +template +void __global__ strict_gather_kernel(GraphT graph, VertexFrontier frontier) { + strict_gather(graph, frontier, blockIdx.x, gridDim.x); +} +void __device__ strict_get_partial_sum(int32_t *elt, int32_t *buf, int32_t f_size, int32_t nnz_per_blk, unsigned int cta_id, unsigned int num_cta) +{ + int32_t idx = cta_id*nnz_per_blk + threadIdx.x; + int32_t upper_idx = (cta_id+1)*nnz_per_blk; + if(upper_idx > f_size) upper_idx = f_size; + int32_t accum=0; + + __shared__ int32_t sm_accum[32]; + for(int32_t i=idx; i 0) base_offset = buf[cta_id]; + + int32_t idx = cta_id*nnz_per_blk + threadIdx.x; + int32_t upper_idx = (cta_id+1)*nnz_per_blk; + if(upper_idx > f_size) upper_idx = f_size; + + for(int32_t i=idx; i<(cta_id+1)*nnz_per_blk; i+=blockDim.x) { + int32_t deg = 0; + if(i < upper_idx) deg = elt[i]; + + for(int32_t d=2; d<=32; d<<=1) { + int32_t temp = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d - 1) deg += temp; + } + sm_deg[threadIdx.x] = deg; + + for(int32_t d=cosize>>(1+5); d>0; d>>=1){ + __syncthreads(); + 
if(phase>5); d<<=1){ + off >>=1; + __syncthreads(); + if(phase1; d>>=1) { + int32_t temp_big = __shfl_down_sync((uint32_t)-1, deg, d/2); + int32_t temp_small = __shfl_up_sync((uint32_t)-1, deg, d/2); + if (lane % d == d/2 - 1) deg = temp_big; + else if(lane % d == d - 1) deg += temp_small; + } + //sm_deg[threadIdx.x] = deg; + if(i < upper_idx) { + elt[i] = base_offset + deg; + } + __syncthreads(); + base_offset += tot_deg; + + } + if (prefix_mode == 1 && threadIdx.x == 0) { + glt[0] = base_offset; + } +} +void __global__ strict_local_prefix_sum_kernel(int32_t *elt, int32_t *buf, int32_t *glt, int32_t prefix_mode, int32_t f_size, int32_t nnz_per_blk) { + strict_local_prefix_sum(elt, buf, glt, prefix_mode, f_size, nnz_per_blk, blockIdx.x, gridDim.x); +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ strict_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + + __shared__ int32_t sm_idx[STRICT_SM_SIZE], sm_deg[STRICT_SM_SIZE], sm_loc[STRICT_SM_SIZE]; + int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + int32_t tot_size = AccessorType::getSize(input_frontier); + + int32_t deg, index, index_size, src_idx; + + // can be fused + bool last_tb = (cta_id == (graph.strict_grid_sum[0] + NNZ_PER_BLOCK-1)/NNZ_PER_BLOCK-1); + int32_t start_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*cta_id)-1; + int32_t end_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*(cta_id+1))-1; + + int32_t row_size = end_row - start_row + 1; + int32_t start_idx; + + if(row_size <= STRICT_SM_SIZE) { + if(threadIdx.x < row_size) { + index = AccessorType::getElement(input_frontier, start_row+threadIdx.x); + deg = graph.d_get_degree(index); + + sm_idx[threadIdx.x] = index; + int32_t tmp_deg = graph.strict_sum[start_row + threadIdx.x] - cta_id * NNZ_PER_BLOCK; + if(tmp_deg >= 0) { + sm_deg[threadIdx.x] = tmp_deg; + sm_loc[threadIdx.x] = graph.d_src_offsets[index]; + } else { + sm_deg[threadIdx.x] = 0; + sm_loc[threadIdx.x] = graph.d_src_offsets[index] - tmp_deg; + } + } else { + deg = 0; + sm_deg[threadIdx.x] = INT_MAX; + } + __syncthreads(); + + int32_t lane = (threadIdx.x&31); + int32_t offset = 0; + + int32_t tot_deg; + if(!last_tb) tot_deg = NNZ_PER_BLOCK; + else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + + int32_t phase = threadIdx.x; + int32_t off=32; + + int32_t width = row_size; + for(int32_t i=threadIdx.x; i= width) continue; + src_idx = sm_idx[offset + id]; + if (src_filter(src_idx) == false) + continue; + int32_t ei = sm_loc[offset + id] + i - sm_deg[offset + id]; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } + } else { + int32_t tot_deg; + if(!last_tb) tot_deg = NNZ_PER_BLOCK; + else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + + int32_t width = row_size; + int32_t offset = 0; + + for(int32_t i=cta_id*NNZ_PER_BLOCK+threadIdx.x; i= width) continue; + src_idx = AccessorType::getElement(input_frontier, start_row+id); + if (src_filter(src_idx) == false) + continue; + int32_t ei = graph.d_src_offsets[src_idx] + i - graph.strict_sum[start_row + id]; + int32_t dst_idx = graph.d_edge_dst[ei]; + load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); + } + + + } +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ 
strict_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + strict_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); +} + +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ strict_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int num_threads = AccessorType::getSizeHost(input_frontier); + int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + int cta_size = CTA_SIZE; + + strict_gather_kernel<<>>(graph, input_frontier); + + int32_t tot_blk = NUM_CTA; + int32_t low_blk = (num_threads + PREFIX_BLK - 1)/PREFIX_BLK; + if (low_blk < tot_blk) + tot_blk = low_blk; + + int32_t gran = PREFIX_BLK * tot_blk; + int32_t nnz_per_thread = (num_threads + gran - 1)/gran; + int32_t nnz_per_blk = (nnz_per_thread * PREFIX_BLK); + + strict_get_partial_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, num_threads, nnz_per_blk); + strict_local_prefix_sum_kernel<<<1, PREFIX_BLK>>>(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, tot_blk + 1); + strict_local_prefix_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, graph.strict_grid_sum, 0, num_threads, nnz_per_blk); + cudaMemcpy(&num_threads, graph.strict_grid_sum, sizeof(int32_t), cudaMemcpyDeviceToHost); + num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + cta_size = CTA_SIZE; + + strict_load_balance_kernel<<>>(graph, input_frontier, output_frontier); +} +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __device__ strict_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int num_threads = AccessorType::getSize(input_frontier); + int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + int cta_size = CTA_SIZE; + + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + strict_gather(graph, input_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); + + int32_t tot_blk = NUM_CTA; + int32_t low_blk = (num_threads + PREFIX_BLK - 1)/PREFIX_BLK; + if (low_blk < tot_blk) + tot_blk = low_blk; + int32_t gran = PREFIX_BLK * tot_blk; + int32_t nnz_per_thread = (num_threads + gran - 1)/gran; + int32_t nnz_per_blk = (nnz_per_thread * PREFIX_BLK); + + for (int32_t cta_id = blockIdx.x; cta_id < tot_blk; cta_id += gridDim.x) { + strict_get_partial_sum(graph.strict_sum, graph.strict_cta_sum, num_threads, nnz_per_blk, cta_id, tot_blk); + __syncthreads(); + } + this_grid().sync(); + if (blockIdx.x == 0) { + strict_local_prefix_sum(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, tot_blk + 1, blockIdx.x, 1); + } + this_grid().sync(); + for (int32_t cta_id = blockIdx.x; cta_id < tot_blk; cta_id += gridDim.x) { + strict_local_prefix_sum(graph.strict_sum, graph.strict_cta_sum, graph.strict_grid_sum, 0, num_threads, nnz_per_blk, cta_id, tot_blk); + __syncthreads(); + } + this_grid().sync(); + num_threads = graph.strict_grid_sum[0]; + num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; + cta_size = CTA_SIZE; + for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { + strict_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); + __syncthreads(); + } + this_grid().sync(); + +} + +} #endif diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index c94eeb22..021b085b 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -73,8 
+73,8 @@ def setUpClass(cls): compute_capability = output[0] num_of_sm = output[1] - cls.nvcc_command += " -DNUM_CTA=" + num_of_sm + " -DCTA_SIZE=1024 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability - cls.nvcc_command += " -std=c++11 -O3 -I " + GRAPHIT_SOURCE_DIRECTORY + "/src/runtime_lib/ -Xcompiler \"-w\" -Wno-deprecated-gpu-targets " + cls.nvcc_command += " -rdc=true -DNUM_CTA=" + str(int(num_of_sm)*2) + " -DCTA_SIZE=512 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + cls.nvcc_command += " -std=c++11 -O3 -I " + GRAPHIT_SOURCE_DIRECTORY + "/src/runtime_lib/ -Xcompiler \"-w\" -Wno-deprecated-gpu-targets --use_fast_math -Xptxas \" -dlcm=ca --maxrregcount=64\" " shutil.copytree(GRAPHIT_SOURCE_DIRECTORY + "/test/graphs", cls.scratch_directory + "/graphs") cls.graph_directory = cls.scratch_directory + "/graphs" @@ -162,6 +162,34 @@ def test_simple_graphit_sssp_WM_schedule(self): self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_WM_schedule.gt") self.sssp_verified_test(self.cuda_filename, False) + def test_simple_graphit_sssp_strict_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_strict_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_vertex_based_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_vertex_based_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWC_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWC_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_TWCE_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_TWCE_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_CM_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_CM_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_WM_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_WM_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + + def test_simple_graphit_sssp_strict_kernel_fusion_schedule(self): + self.graphit_generate_test("inputs/sssp.gt", "schedules/sssp_strict_kernel_fusion_schedule.gt") + self.sssp_verified_test(self.cuda_filename, False) + if __name__ == '__main__': unittest.main() #suite = unittest.TestSuite() diff --git a/test/gpu_tests/test_input/inputs/sssp.gt b/test/gpu_tests/test_input/inputs/sssp.gt index f3ee02e4..06cbe777 100644 --- a/test/gpu_tests/test_input/inputs/sssp.gt +++ b/test/gpu_tests/test_input/inputs/sssp.gt @@ -21,7 +21,7 @@ func main() frontier.addVertex(start_vertex); %add source vertex SP[start_vertex] = 0; var rounds : int = 0; - while (frontier.getVertexSetSize() != 0) + #s0# while (frontier.getVertexSetSize() != 0) #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge, SP); delete frontier; frontier = output; diff --git a/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt new file mode 100644 index 00000000..718654bf --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_CM_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + 
SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt index c21c26da..c6d6052e 100644 --- a/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt +++ b/test/gpu_tests/test_input/schedules/sssp_CM_schedule.gt @@ -1,4 +1,4 @@ schedule: SimpleGPUSchedule s1; s1.configLoadBalance(CM); - program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt new file mode 100644 index 00000000..7bfdaab5 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWCE_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt index e2c72966..d309bef4 100644 --- a/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt +++ b/test/gpu_tests/test_input/schedules/sssp_TWCE_schedule.gt @@ -1,4 +1,4 @@ schedule: SimpleGPUSchedule s1; s1.configLoadBalance(TWCE); - program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt new file mode 100644 index 00000000..59721e6f --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_TWC_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt index 16fb8b35..6c8f164f 100644 --- a/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt +++ b/test/gpu_tests/test_input/schedules/sssp_TWC_schedule.gt @@ -1,4 +1,4 @@ schedule: SimpleGPUSchedule s1; s1.configLoadBalance(TWC); - program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt new file mode 100644 index 00000000..74ffe161 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_WM_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(WM); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt index 7b76a0fd..9df6f14b 100644 --- a/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt +++ b/test/gpu_tests/test_input/schedules/sssp_WM_schedule.gt @@ -1,4 +1,4 @@ schedule: SimpleGPUSchedule s1; s1.configLoadBalance(WM); - program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s0:s1", s1); diff --git 
a/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt index ae99ff2b..225cce1b 100644 --- a/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt +++ b/test/gpu_tests/test_input/schedules/sssp_default_schedule.gt @@ -1,3 +1,3 @@ schedule: SimpleGPUSchedule s1; - program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt new file mode 100644 index 00000000..f9c4b730 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_strict_kernel_fusion_schedule.gt @@ -0,0 +1,8 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(STRICT); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt new file mode 100644 index 00000000..e458ca60 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_strict_schedule.gt @@ -0,0 +1,4 @@ +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(STRICT); + program->applyGPUSchedule("s0:s1", s1); diff --git a/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt b/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt new file mode 100644 index 00000000..67d773a5 --- /dev/null +++ b/test/gpu_tests/test_input/schedules/sssp_vertex_based_kernel_fusion_schedule.gt @@ -0,0 +1,7 @@ +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); From bae47f13500a65012125c1459db476a614c9996b Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 25 Oct 2019 11:33:20 -0400 Subject: [PATCH 56/88] Fixed small bug in strict --- src/backend/codegen_gpu/codegen_gpu.cpp | 4 +++ src/runtime_lib/infra_gpu/graph.h | 4 +++ src/runtime_lib/infra_gpu/load_balance.h | 39 +++++++++++++----------- test/gpu_tests/all_gpu_tests.py | 2 +- test/gpu_tests/test_input/sssp_lp.cu | 13 ++++---- 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 1eecb206..d1e70d8f 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -589,6 +589,8 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: load_balance_function = "gpu_runtime::CM_load_balance"; } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { load_balance_function = "gpu_runtime::WM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::STRICT) { + load_balance_function = "gpu_runtime::strict_load_balance"; } if (mir::isa(esae)) { @@ -715,6 +717,8 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, load_balance_function = "gpu_runtime::CM_load_balance"; } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::WM) { load_balance_function = "gpu_runtime::WM_load_balance"; + } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::STRICT) { + 
load_balance_function = "gpu_runtime::strict_load_balance"; } if (mir::isa(esae)) { printIndent(); diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 8dab7060..307528bd 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -155,6 +155,10 @@ static void load_graph(GraphT &graph, std::string filename, bool cudaMalloc(&graph.twc_large_bin, graph.num_vertices * 6 * sizeof(int32_t)); cudaMalloc(&graph.twc_bin_sizes, 3 * sizeof(int32_t)); + cudaMalloc(&graph.strict_sum, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); + cudaMalloc(&graph.strict_grid_sum, sizeof(int32_t)); + } template static int32_t builtin_getVertices(GraphT &graph) { diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 1140c2c8..46f9fd56 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -692,7 +692,7 @@ template void __device__ strict_gather(GraphT &graph, VertexFrontier &frontier, unsigned int cta_id, unsigned int num_cta) { int32_t thread_id = threadIdx.x + blockDim.x * cta_id; int32_t tot_size = AccessorType::getSize(frontier); - int32_t idx, deg; + int32_t idx; if(thread_id < tot_size) { idx = AccessorType::getElement(frontier, thread_id); graph.strict_sum[thread_id] = graph.d_get_degree(idx); @@ -736,8 +736,9 @@ void __device__ strict_get_partial_sum(int32_t *elt, int32_t *buf, int32_t f_siz accum += __shfl_down_sync((uint32_t)-1, accum, 2); accum += __shfl_down_sync((uint32_t)-1, accum, 1); } + __syncthreads(); if(threadIdx.x == 0) { - buf[blockIdx.x] = accum; + buf[cta_id] = accum; } } void __global__ strict_get_partial_sum_kernel(int32_t *elt, int32_t *buf, int32_t f_size, int32_t nnz_per_blk) { @@ -762,7 +763,7 @@ void __device__ strict_local_prefix_sum(int32_t *elt, int32_t *buf, int32_t *glt int32_t upper_idx = (cta_id+1)*nnz_per_blk; if(upper_idx > f_size) upper_idx = f_size; - for(int32_t i=idx; i<(cta_id+1)*nnz_per_blk; i+=blockDim.x) { + for(int32_t i=idx; i<(cta_id+1)*nnz_per_blk; i += blockDim.x) { int32_t deg = 0; if(i < upper_idx) deg = elt[i]; @@ -817,6 +818,7 @@ void __device__ strict_local_prefix_sum(int32_t *elt, int32_t *buf, int32_t *glt base_offset += tot_deg; } + __syncthreads(); if (prefix_mode == 1 && threadIdx.x == 0) { glt[0] = base_offset; } @@ -828,10 +830,11 @@ template loa void __device__ strict_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { __shared__ int32_t sm_idx[STRICT_SM_SIZE], sm_deg[STRICT_SM_SIZE], sm_loc[STRICT_SM_SIZE]; - int32_t thread_id = threadIdx.x + blockDim.x * cta_id; + //int32_t thread_id = threadIdx.x + blockDim.x * cta_id; int32_t tot_size = AccessorType::getSize(input_frontier); - int32_t deg, index, index_size, src_idx; + int32_t index, src_idx; + //int32_t deg; // can be fused bool last_tb = (cta_id == (graph.strict_grid_sum[0] + NNZ_PER_BLOCK-1)/NNZ_PER_BLOCK-1); @@ -839,12 +842,12 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie int32_t end_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*(cta_id+1))-1; int32_t row_size = end_row - start_row + 1; - int32_t start_idx; + //int32_t start_idx; if(row_size <= STRICT_SM_SIZE) { if(threadIdx.x < row_size) { index = AccessorType::getElement(input_frontier, start_row+threadIdx.x); - deg = graph.d_get_degree(index); + //deg = graph.d_get_degree(index); 
sm_idx[threadIdx.x] = index; int32_t tmp_deg = graph.strict_sum[start_row + threadIdx.x] - cta_id * NNZ_PER_BLOCK; @@ -856,20 +859,20 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie sm_loc[threadIdx.x] = graph.d_src_offsets[index] - tmp_deg; } } else { - deg = 0; + //deg = 0; sm_deg[threadIdx.x] = INT_MAX; } __syncthreads(); - int32_t lane = (threadIdx.x&31); + //int32_t lane = (threadIdx.x&31); int32_t offset = 0; int32_t tot_deg; if(!last_tb) tot_deg = NNZ_PER_BLOCK; else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; - int32_t phase = threadIdx.x; - int32_t off=32; + //int32_t phase = threadIdx.x; + //int32_t off=32; int32_t width = row_size; for(int32_t i=threadIdx.x; i &graph, VertexFrontie else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; int32_t width = row_size; - int32_t offset = 0; + //int32_t offset = 0; for(int32_t i=cta_id*NNZ_PER_BLOCK+threadIdx.x; i &graph, VertexFrontie } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __global__ strict_load_balance_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { - strict_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); + strict_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> @@ -914,7 +917,6 @@ void __host__ strict_load_balance_host(GraphT &graph, VertexFron int num_threads = AccessorType::getSizeHost(input_frontier); int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; int cta_size = CTA_SIZE; - strict_gather_kernel<<>>(graph, input_frontier); int32_t tot_blk = NUM_CTA; @@ -926,10 +928,13 @@ void __host__ strict_load_balance_host(GraphT &graph, VertexFron int32_t nnz_per_thread = (num_threads + gran - 1)/gran; int32_t nnz_per_blk = (nnz_per_thread * PREFIX_BLK); + strict_get_partial_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, num_threads, nnz_per_blk); - strict_local_prefix_sum_kernel<<<1, PREFIX_BLK>>>(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, tot_blk + 1); + + strict_local_prefix_sum_kernel<<<1, PREFIX_BLK>>>(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, (tot_blk + PREFIX_BLK)/PREFIX_BLK * PREFIX_BLK); strict_local_prefix_sum_kernel<<>>(graph.strict_sum, graph.strict_cta_sum, graph.strict_grid_sum, 0, num_threads, nnz_per_blk); cudaMemcpy(&num_threads, graph.strict_grid_sum, sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaCheckLastError(); num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; cta_size = CTA_SIZE; @@ -939,7 +944,6 @@ template loa void __device__ strict_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { int num_threads = AccessorType::getSize(input_frontier); int num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; - int cta_size = CTA_SIZE; for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { strict_gather(graph, input_frontier, cta_id, num_cta); @@ -961,7 +965,7 @@ void __device__ strict_load_balance_device(GraphT &graph, Vertex } this_grid().sync(); if (blockIdx.x == 0) { - strict_local_prefix_sum(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, tot_blk + 1, blockIdx.x, 1); + strict_local_prefix_sum(graph.strict_cta_sum, graph.strict_cta_sum, graph.strict_grid_sum, 1, tot_blk + 1, (tot_blk + PREFIX_BLK)/PREFIX_BLK * PREFIX_BLK, blockIdx.x, 1); } this_grid().sync(); for 
(int32_t cta_id = blockIdx.x; cta_id < tot_blk; cta_id += gridDim.x) { @@ -971,7 +975,6 @@ void __device__ strict_load_balance_device(GraphT &graph, Vertex this_grid().sync(); num_threads = graph.strict_grid_sum[0]; num_cta = (num_threads + CTA_SIZE - 1)/CTA_SIZE; - cta_size = CTA_SIZE; for (int32_t cta_id = blockIdx.x; cta_id < num_cta; cta_id += gridDim.x) { strict_load_balance(graph, input_frontier, output_frontier, cta_id, num_cta); __syncthreads(); diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 021b085b..43dd5d0b 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -39,7 +39,7 @@ def sssp_verified_test(self, input_file_name, use_delta=False): #start point 0, delta 10, verified self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 10 v > " + self.verifier_input) else: - self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel v > " + self.verifier_input) + self.get_command_output(self.executable_name + " " + self.graph_directory + "/4.wel 0 v > " + self.verifier_input) output = self.get_command_output(self.verifier_directory + "/sssp_verifier -f " + self.graph_directory + "/4.wel -t " + self.verifier_input + " -r 0") test_flag = False for line in output.rstrip().split("\n"): diff --git a/test/gpu_tests/test_input/sssp_lp.cu b/test/gpu_tests/test_input/sssp_lp.cu index 89471969..37688991 100644 --- a/test/gpu_tests/test_input/sssp_lp.cu +++ b/test/gpu_tests/test_input/sssp_lp.cu @@ -58,7 +58,7 @@ void cudaCheckLastError(void) { #define WARP_SIZE (32) #define STAGE_1_SIZE (8) -void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state) { +void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state, int32_t start_vertex) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int total_work = graph.num_vertices; @@ -71,9 +71,9 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state devic } } if (thread_id == 0) { - device_state.SP[0] = 0; + device_state.SP[start_vertex] = 0; //starting point is set to 0 - device_state.frontier1[0] = 0; + device_state.frontier1[0] = start_vertex; *device_state.frontier1_size = 1; *device_state.frontier2_size = 0; } @@ -289,6 +289,7 @@ int main(int argc, char *argv[]) { cudaThreadSetCacheConfig(cudaFuncCachePreferShared); gpu_runtime::GraphT graph; gpu_runtime::load_graph(graph, argv[1], false); + int32_t start_vertex = atoi(argv[2]); algo_state host_state, device_state; @@ -302,7 +303,7 @@ int main(int argc, char *argv[]) { startTimer(); startTimer(); - init_kernel<<>>(graph, device_state); + init_kernel<<>>(graph, device_state, start_vertex); int iters = 0; cudaDeviceSynchronize(); float t = stopTimer(); @@ -338,8 +339,8 @@ int main(int argc, char *argv[]) { } //printf("Total time = %f\n", total_time); - if (argc > 2) - if (argv[2][0] == 'v'){ + if (argc > 3) + if (argv[3][0] == 'v'){ //FILE *output = fopen("output.txt", "w"); cudaMemcpy(host_state.SP, device_state.SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); for (int i = 0; i < graph.num_vertices; i++) From 4f7d15aa2df9855b2486ffb971253fc67d4de22d Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Fri, 25 Oct 2019 14:35:57 -0400 Subject: [PATCH 57/88] switching to launch kernel functions in the dequeue method with pointers to device_gpq --- .../infra_gpu/gpu_priority_queue.h | 38 ++++++++++++++++--- .../test_input/sssp_delta_stepping.cu | 35 
++++++++++++----- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index b193b6aa..bef211db 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -7,6 +7,12 @@ namespace gpu_runtime { + + template + class GPUPriorityQueue; + + static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); + template class GPUPriorityQueue { @@ -46,10 +52,9 @@ namespace gpu_runtime { } - - gpu_runtime::VertexFrontier __device__ dequeueReadySet(){ - - + gpu_runtime::VertexFrontier __host__ dequeueReadySet(GPUPriorityQueue * device_gpq){ + update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); + return; } PriorityT_* host_priorities_ = nullptr; @@ -61,9 +66,32 @@ namespace gpu_runtime { //Need to do = {0} to avoid dynamic initialization error VertexFrontier frontier_ = {0}; - }; + + + static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) + { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + int32_t my_minimum = INT_MAX; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if (gpq->device_priorities_[node_id] >= (gpq->window_upper_) && gpq->device_priorities_[node_id] != INT_MAX && gpq->device_priorities_[node_id] < my_minimum) { + my_minimum = gpq->device_priorities_[node_id]; + } + } + } + + if (my_minimum < gpq->current_priority_){ + atomicMin(&(gpq->current_priority_), my_minimum); + } + } + + } diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 89b343f8..f1c5e7d5 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -1,3 +1,14 @@ + + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + + #include "gpu_intrinsics.h" #include @@ -8,7 +19,7 @@ #include 2 #include -//#define DEBUG +#define DEBUG #ifdef DEBUG #define ITER_COUNT (5) @@ -40,13 +51,6 @@ int32_t *__device_SP; //int32_t __device__ window_upper; -#define VIRTUAL_WARP_SIZE (32) -#define NUM_THREADS (1024) -#define NUM_BLOCKS (80) -#define CTA_SIZE (1024) -#define WARP_SIZE (32) -#define STAGE_1_SIZE (8) - void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state, int start_v) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; @@ -214,15 +218,26 @@ int main(int argc, char *argv[]) { //host_state.new_window_start[0] = INT_MAX; host_gpq.window_upper_ = host_gpq.current_priority_ + host_gpq.delta_; host_gpq.current_priority_ = INT_MAX; - + + //std::cout << "test0" << std::endl; cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); - update_nodes_identify_min<<>>(graph, device_state); + //update_nodes_identify_min<<>>(graph, device_state); + //device_gpq.update_nodes_identify_min(graph.num_vertices); + //std::cout << "test1" << std::endl; + + + gpu_runtime::GPUPriorityQueue * tmp_gpq; + cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); + host_gpq.dequeueReadySet(tmp_gpq); + gpu_runtime::cudaCheckLastError(); 
cudaMemcpyFromSymbol(&host_gpq, device_gpq, sizeof(host_gpq), 0,cudaMemcpyDeviceToHost); gpu_runtime::cudaCheckLastError(); + //std::cout << "host_gpq.current_priority_: " << host_gpq.current_priority_ << std::endl; + //if(host_gpq.current_priority_ == INT_MAX){ // break; //} From 996434ab803be1bd857999495e23c932030d27f6 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Fri, 25 Oct 2019 16:42:16 -0400 Subject: [PATCH 58/88] adding a default NUM_BLOCKS for priority queue --- src/runtime_lib/infra_gpu/gpu_priority_queue.h | 3 +++ test/gpu_tests/test_input/runtime_lib_tests.cu | 1 + test/gpu_tests/test_input/sssp_delta_stepping.cu | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index bef211db..24d6ebec 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -5,6 +5,9 @@ #include #include "vertex_frontier.h" +#ifndef NUM_BLOCKS + #define NUM_BLOCKS 80 +#endif namespace gpu_runtime { diff --git a/test/gpu_tests/test_input/runtime_lib_tests.cu b/test/gpu_tests/test_input/runtime_lib_tests.cu index 9e43d808..c6f0d893 100644 --- a/test/gpu_tests/test_input/runtime_lib_tests.cu +++ b/test/gpu_tests/test_input/runtime_lib_tests.cu @@ -1,4 +1,5 @@ #include +#define NUM_BLOCKS (80) #include "gpu_intrinsics.h" std::string graph_directory; diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index f1c5e7d5..c3572985 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -19,7 +19,7 @@ #include 2 #include -#define DEBUG +//#define DEBUG #ifdef DEBUG #define ITER_COUNT (5) From c975b9d4b7708f7dd86c06699994fd181ded5b9e Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 25 Oct 2019 16:48:13 -0400 Subject: [PATCH 59/88] Added graph blocking operation and improved performance of TWCE --- src/runtime_lib/infra_gpu/graph.h | 47 ++++++++++++++++++++++++ src/runtime_lib/infra_gpu/load_balance.h | 25 +++++++------ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 307528bd..3cb461bf 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -72,6 +72,52 @@ static bool string_ends_with(const char* str, const char* sub_str) { return true; return false; } + +static int32_t identify_block_id (int32_t vid, int32_t blocking_size) { + return vid / blocking_size; +} +template +static void block_graph_edges(GraphT &input_graph, GraphT &output_graph, int32_t blocking_size) { + output_graph = input_graph; + output_graph.h_src_offsets = nullptr; + output_graph.d_src_offsets = nullptr; + + output_graph.h_edge_src = new int32_t[input_graph.num_edges]; + output_graph.h_edge_dst = new int32_t[input_graph.num_edges]; + output_graph.h_edge_weights = new EdgeWeightType[input_graph.num_edges]; + + int32_t num_blocks = (input_graph.num_vertices + blocking_size - 1)/blocking_size; + + int32_t *block_sizes = new int32_t[num_blocks+1]; + + for (int32_t eid = 0; eid < input_graph.num_edges; eid++) { + int32_t dst = input_graph.d_edge_dst[eid]; + int32_t block_id = identify_block_id(dst, blocking_size); + block_sizes[block_id] += 1; + } + block_sizes[0] = 0; + for (int32_t eid = 0; eid < input_graph.num_edges; eid++) { + int32_t dst = input_graph.d_edge_dst[eid]; + int32_t block_id = 
identify_block_id(dst, blocking_size); + int32_t new_eid = block_sizes[block_id]; + block_sizes[block_id]++; + output_graph.h_edge_src[new_eid] = input_graph.d_edge_src[eid]; + output_graph.h_edge_dst[new_eid] = input_graph.d_edge_dst[eid]; + output_graph.h_edge_weights[new_eid] = input_graph.d_edge_weights[eid]; + } + + delete[] block_sizes; + cudaMalloc(&output_graph.d_edge_src, sizeof(int32_t) * output_graph.num_edges); + cudaMalloc(&output_graph.d_edge_dst, sizeof(int32_t) * output_graph.num_edges); + cudaMalloc(&output_graph.d_edge_weight, sizeof(EdgeWeightType) * output_graph.num_edges); + + + cudaMemcpy(output_graph.d_edge_src, output_graph.h_edge_src, sizeof(int32_t) * output_graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_dst, output_graph.h_edge_dst, sizeof(int32_t) * output_graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_weight, output_graph.h_edge_weight, sizeof(EdgeWeightType) * output_graph.num_edges, cudaMemcpyHostToDevice); + +} + template static void load_graph(GraphT &graph, std::string filename, bool to_sort = false) { int flen = strlen(filename.c_str()); @@ -160,6 +206,7 @@ static void load_graph(GraphT &graph, std::string filename, bool cudaMalloc(&graph.strict_grid_sum, sizeof(int32_t)); } + template static int32_t builtin_getVertices(GraphT &graph) { return graph.num_vertices; diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 46f9fd56..7adb8e52 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -151,35 +151,38 @@ static void __device__ TWCE_load_balance(GraphT &graph, VertexFr if (local_vertex_idx < total_vertices) { local_vertex = AccessorType::getElement(input_frontier, local_vertex_idx); // Step 1 seggregate vertices into shared buffers - if (threadIdx.x % (STAGE_1_SIZE) == 0) { - degree = graph.d_get_degree(local_vertex); - src_offset = graph.d_src_offsets[local_vertex]; - int32_t s3_size = degree/CTA_SIZE; - degree = degree - s3_size * CTA_SIZE; - if (s3_size > 0) { + degree = graph.d_get_degree(local_vertex); + src_offset = graph.d_src_offsets[local_vertex]; + int32_t s3_size = degree/CTA_SIZE; + degree = degree - s3_size * CTA_SIZE; + if (s3_size > 0) { + if (threadIdx.x % (STAGE_1_SIZE) == 0) { int32_t pos = atomicAggInc(&stage_queue_sizes[2]); stage3_queue[pos] = local_vertex; stage3_size[pos] = s3_size * CTA_SIZE; stage3_offset[pos] = src_offset; } + } - int32_t s2_size = degree/WARP_SIZE; - degree = degree - WARP_SIZE * s2_size; - if (s2_size > 0) { + int32_t s2_size = degree/WARP_SIZE; + degree = degree - WARP_SIZE * s2_size; + if (s2_size > 0) { + if (threadIdx.x % (STAGE_1_SIZE) == 0) { int32_t pos = atomicAggInc(&stage_queue_sizes[1]); stage2_queue[pos] = local_vertex; stage2_offset[pos] = s3_size * CTA_SIZE + src_offset; stage2_size[pos] = s2_size * WARP_SIZE; } - s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + src_offset; } + s1_offset = s3_size * CTA_SIZE + s2_size * WARP_SIZE + src_offset; } else local_vertex = -1; __syncthreads(); +/* degree = __shfl_sync((uint32_t)-1, degree, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); local_vertex = __shfl_sync((uint32_t)-1, local_vertex, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - +*/ if (local_vertex_idx < total_vertices) { // STAGE 1 for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += 
STAGE_1_SIZE) { From b4bfa597f208791235433c2cfb400646874c5c40 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 25 Oct 2019 16:49:11 -0400 Subject: [PATCH 60/88] Small change in TWCE --- src/runtime_lib/infra_gpu/load_balance.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 7adb8e52..59ea5174 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -131,10 +131,8 @@ static void __device__ TWCE_load_balance(GraphT &graph, VertexFr __shared__ int32_t stage3_queue[CTA_SIZE]; __shared__ int32_t stage_queue_sizes[3]; - if (threadIdx.x == 0) { - stage_queue_sizes[0] = 0; - stage_queue_sizes[1] = 0; - stage_queue_sizes[2] = 0; + if (threadIdx.x < 3) { + stage_queue_sizes[threadIdx.x] = 0; } __syncthreads(); __shared__ int32_t stage2_offset[CTA_SIZE]; From 122401eeaf7b40e2e177c49a5b75206c434188c2 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Mon, 28 Oct 2019 16:20:51 -0400 Subject: [PATCH 61/88] moving the dequeue operation to the start of the while loop instead of the end --- .../test_input/sssp_delta_stepping.cu | 56 ++++++++++++------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index c3572985..0078cdf2 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -172,16 +172,10 @@ int main(int argc, char *argv[]) { float iter_total = 0; //this sets it to Sparse host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - - //frontier = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - //gpu_runtime::builtin_addVertex(host_gpq.frontier_, start_vertex); gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); startTimer(); - //host_gpq.delta_ = delta; - //host_gpq.current_priority_ = 0 ; - host_gpq.init(__host_SP, __device_SP, 0, delta, start_vertex); cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); @@ -196,25 +190,18 @@ int main(int argc, char *argv[]) { //printf("Init time = %f\n", t); iter_total+=t; + //std::cout << "frontier size: " << gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) << std::endl; + //while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ while(! host_gpq.finished()){ startTimer(); iters++; - gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); - //cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); - //Might not be necessary, always synchronized at this point?? 
- cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); - gpu_runtime::cudaCheckLastError(); - //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); - gpu_runtime::TWCE_load_balance_host(graph, host_gpq.frontier_, host_gpq.frontier_); - gpu_runtime::cudaCheckLastError(); - - gpu_runtime::swap_bytemaps(host_gpq.frontier_); - // set the input to the prepare function - host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; - + //std::cout << "iter: " << iters << std::endl; + if (gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) == (0)) { + + //std::cout << "inside dequeue routine" << std::endl; //host_state.new_window_start[0] = INT_MAX; host_gpq.window_upper_ = host_gpq.current_priority_ + host_gpq.delta_; host_gpq.current_priority_ = INT_MAX; @@ -244,9 +231,38 @@ int main(int argc, char *argv[]) { update_nodes_special<<>>( graph, device_state, host_gpq.frontier_); gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_queues(host_gpq.frontier_); - host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + + }//end of checking if host_gpq frontier is empty or not + + if (host_gpq.finished()){ + break; } + + gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); + //cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); + //Might not be necessary, always synchronized at this point?? + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); + gpu_runtime::TWCE_load_balance_host(graph, host_gpq.frontier_, host_gpq.frontier_); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::swap_bytemaps(host_gpq.frontier_); + // set the input to the prepare function + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + + + + + + + + + + cudaDeviceSynchronize(); t = stopTimer(); From 8313c6dda6a6ad290014691853aa54b7eec8a4c5 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 11:49:41 -0400 Subject: [PATCH 62/88] refactoring the code to use dequeueReadySet --- .../infra_gpu/gpu_priority_queue.h | 55 +++++++++++++++++-- .../test_input/sssp_delta_stepping.cu | 9 ++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 24d6ebec..a32d7121 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -9,13 +9,20 @@ #define NUM_BLOCKS 80 #endif +#ifndef CTA_SIZE + #define CTA_SIZE 1024 +#endif + + namespace gpu_runtime { template class GPUPriorityQueue; static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); - + + + static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); template class GPUPriorityQueue { @@ -55,9 +62,26 @@ namespace gpu_runtime { } - gpu_runtime::VertexFrontier __host__ dequeueReadySet(GPUPriorityQueue * device_gpq){ - update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); - return; + void dequeueReadySet(GPUPriorityQueue * device_gpq){ + if (gpu_runtime::builtin_getVertexSetSize(frontier_) == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + + cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); + 
gpu_runtime::cudaCheckLastError(); + + update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); + gpu_runtime::cudaCheckLastError(); + + cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); + gpu_runtime::cudaCheckLastError(); + + //this line needs to be fixed + update_nodes_special<<>>(device_gpq, frontier_.max_num_elems, frontier_); + gpu_runtime::cudaCheckLastError(); + gpu_runtime::swap_queues(frontier_); + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + } } PriorityT_* host_priorities_ = nullptr; @@ -92,7 +116,30 @@ namespace gpu_runtime { if (my_minimum < gpq->current_priority_){ atomicMin(&(gpq->current_priority_), my_minimum); } + }//end of update_nodes_identify_min + + + + static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ + + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + //int warp_id = thread_id / 32; + + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if(gpq->device_priorities_[node_id] >= gpq->current_priority_ && gpq->device_priorities_[node_id] < (gpq->current_priority_ + gpq->delta_)) { + gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); + } + } + } } + + + } diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 0078cdf2..9ee0c986 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -19,7 +19,7 @@ #include 2 #include -//#define DEBUG +#define DEBUG #ifdef DEBUG #define ITER_COUNT (5) @@ -199,6 +199,7 @@ int main(int argc, char *argv[]) { //std::cout << "iter: " << iters << std::endl; + /* if (gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) == (0)) { //std::cout << "inside dequeue routine" << std::endl; @@ -234,6 +235,12 @@ int main(int argc, char *argv[]) { host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; }//end of checking if host_gpq frontier is empty or not + */ + + gpu_runtime::GPUPriorityQueue * tmp_gpq; + cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); + host_gpq.dequeueReadySet(tmp_gpq); + if (host_gpq.finished()){ break; From 5dd47167cced93a3ef225e394480261aca3028df Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 12:36:07 -0400 Subject: [PATCH 63/88] cleaning up the refactored code --- .../test_input/sssp_delta_stepping.cu | 157 +----------------- 1 file changed, 7 insertions(+), 150 deletions(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 9ee0c986..de20a789 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -16,8 +16,6 @@ #define USE_DEDUP 0 #define SORT_NODES 0 #include -#include 2 -#include #define DEBUG @@ -31,53 +29,28 @@ gpu_runtime::GPUPriorityQueue host_gpq; gpu_runtime::GPUPriorityQueue __device__ device_gpq; -typedef struct { - int32_t *SP; - int32_t *output_size; - int32_t num_blocks; - int32_t *node_borders; - int32_t *edge_borders; - int32_t *old_indices; - int32_t window_lower; - int32_t window_upper; - int32_t *new_window_start; -}algo_state; - int32_t __device__ *SP; int32_t 
*__host_SP; int32_t *__device_SP; -//int32_t __device__ window_lower; -//int32_t __device__ window_upper; - -void __global__ init_kernel(gpu_runtime::GraphT graph, algo_state device_state, int start_v) { +void __global__ init_kernel(gpu_runtime::GraphT graph, int start_v) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int total_work = graph.num_vertices; int work_per_thread = (total_work + num_threads - 1)/num_threads; - for (int i = 0; i < work_per_thread; i++) { - int id = num_threads * i + thread_id; - if (id < total_work) { - device_state.SP[id] = INT_MAX; - } - } if (thread_id == 0) { //reset with the new data structure SP[start_v] = 0; - device_state.SP[start_v] = 0; } } bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { - bool output2; + bool output2; bool SP_trackving_var_1 = 0; SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; - - //if (SP[dst] >= window_upper) return false; if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; - return output2; } @@ -86,69 +59,14 @@ void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, i // Body of the actual operator code EdgeWeightType weight = graph.d_edge_weight[edge_id]; if (updateEdge(src, dst, weight)){ - //gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst); gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); } } -void __global__ update_nodes_identify_min(gpu_runtime::GraphT graph, algo_state device_state) { - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - - int total_work = graph.num_vertices; - int work_per_thread = (total_work + num_threads - 1)/num_threads; - int32_t my_minimum = INT_MAX; - for (int i = 0; i < work_per_thread; i++) { - int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_vertices) { - if (device_gpq.device_priorities_[node_id] >= (device_gpq.window_upper_) && device_gpq.device_priorities_[node_id] != INT_MAX && device_gpq.device_priorities_[node_id] < my_minimum) { - my_minimum = device_gpq.device_priorities_[node_id]; - } - } - } - //if (my_minimum < device_state.new_window_start[0]) { - if (my_minimum < device_gpq.current_priority_){ - //atomicMin(device_state.new_window_start, my_minimum); - atomicMin(&(device_gpq.current_priority_), my_minimum); - } -} -void __global__ update_nodes_special(gpu_runtime::GraphT graph, algo_state device_state, gpu_runtime::VertexFrontier output_frontier) { - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - //int warp_id = thread_id / 32; - - int total_work = graph.num_vertices; - int work_per_thread = (total_work + num_threads - 1)/num_threads; - for (int i = 0; i < work_per_thread; i++) { - int32_t node_id = thread_id + i * num_threads; - if (node_id < graph.num_vertices) { - //if(SP[node_id] >= device_state.window_lower && SP[node_id] < device_state.window_upper) { - if(device_gpq.device_priorities_[node_id] >= device_gpq.current_priority_ && SP[node_id] < (device_gpq.current_priority_ + device_gpq.delta_)) { - gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); - } - } - } -} -void allocate_state(algo_state &host_state, algo_state &device_state, gpu_runtime::GraphT &graph) { - host_state.SP = new 
int[graph.num_vertices]; - host_state.output_size = new int32_t[1]; - host_state.new_window_start = new int32_t[1]; - cudaMalloc(&device_state.SP, sizeof(int32_t)*graph.num_vertices); - cudaMalloc(&device_state.output_size, sizeof(int32_t)); - cudaMalloc(&device_state.new_window_start, sizeof(int32_t)); -} - -void swap_pointers(int32_t **a, int32_t **b) { - int32_t* t = *a; - *a = *b; - *b = t; -} - void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { SP[v] = 2147483647; } - int main(int argc, char *argv[]) { cudaSetDevice(0); cudaThreadSetCacheConfig(cudaFuncCachePreferShared); @@ -159,14 +77,8 @@ int main(int argc, char *argv[]) { cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); - __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; - - algo_state host_state, device_state; - allocate_state(host_state, device_state, graph); - cudaDeviceSynchronize(); - float total_time = 0; for (int outer = 0; outer < ITER_COUNT; outer++) { float iter_total = 0; @@ -181,7 +93,7 @@ int main(int argc, char *argv[]) { cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); - init_kernel<<>>(graph, device_state, start_vertex); + init_kernel<<>>(graph, start_vertex); gpu_runtime::cudaCheckLastError(); int iters = 0; @@ -197,79 +109,24 @@ int main(int argc, char *argv[]) { startTimer(); iters++; - //std::cout << "iter: " << iters << std::endl; - - /* - if (gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) == (0)) { - - //std::cout << "inside dequeue routine" << std::endl; - //host_state.new_window_start[0] = INT_MAX; - host_gpq.window_upper_ = host_gpq.current_priority_ + host_gpq.delta_; - host_gpq.current_priority_ = INT_MAX; - - //std::cout << "test0" << std::endl; - cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); - gpu_runtime::cudaCheckLastError(); - - //update_nodes_identify_min<<>>(graph, device_state); - //device_gpq.update_nodes_identify_min(graph.num_vertices); - //std::cout << "test1" << std::endl; - - - gpu_runtime::GPUPriorityQueue * tmp_gpq; - cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); - host_gpq.dequeueReadySet(tmp_gpq); - - gpu_runtime::cudaCheckLastError(); - cudaMemcpyFromSymbol(&host_gpq, device_gpq, sizeof(host_gpq), 0,cudaMemcpyDeviceToHost); - gpu_runtime::cudaCheckLastError(); - - //std::cout << "host_gpq.current_priority_: " << host_gpq.current_priority_ << std::endl; - - //if(host_gpq.current_priority_ == INT_MAX){ - // break; - //} - update_nodes_special<<>>( graph, device_state, host_gpq.frontier_); - gpu_runtime::cudaCheckLastError(); - gpu_runtime::swap_queues(host_gpq.frontier_); - host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; - - }//end of checking if host_gpq frontier is empty or not - */ - gpu_runtime::GPUPriorityQueue * tmp_gpq; cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); host_gpq.dequeueReadySet(tmp_gpq); - if (host_gpq.finished()){ break; } - gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); - //cudaMemcpyToSymbol(window_upper, &device_state.window_upper, sizeof(int32_t), 0); - //Might not be necessary, always synchronized at this point?? 
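The dequeueReadySet call that the cleaned-up loop below relies on was introduced two patches earlier in gpu_priority_queue.h; as a reading aid, it roughly performs the following steps when the current frontier is empty (condensed summary, not literal patch content):

	// 1. Advance the window: window_upper_ = current_priority_ + delta_, reset
	//    current_priority_ to INT_MAX, and copy the host-side queue state to the device.
	// 2. update_nodes_identify_min: each thread scans its share of vertices and
	//    atomicMin's the smallest priority that is >= window_upper_ (and != INT_MAX)
	//    into current_priority_.
	// 3. update_nodes_special: every vertex whose priority lies in
	//    [current_priority_, current_priority_ + delta_) is pushed into the sparse
	//    output queue; the queues are then swapped and the frontier marked SPARSE.
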
cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); - //gpu_runtime::vertex_based_load_balance_host(graph, frontier, frontier); gpu_runtime::TWCE_load_balance_host(graph, host_gpq.frontier_, host_gpq.frontier_); gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_bytemaps(host_gpq.frontier_); // set the input to the prepare function - host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; - - - - - - - - - - + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; cudaDeviceSynchronize(); t = stopTimer(); @@ -297,16 +154,16 @@ int main(int argc, char *argv[]) { if (argc > 3) if (argv[4][0] == 'v'){ //FILE *output = fopen("output.txt", "w"); - cudaMemcpy(host_state.SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + cudaMemcpy(__host_SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); #ifdef DEBUG FILE *output = fopen("output.txt", "w"); #endif for (int i = 0; i < graph.num_vertices; i++){ #ifdef DEBUG - fprintf(output, "%d, %d\n", i, host_state.SP[i]); + fprintf(output, "%d, %d\n", i, __host_SP[i]); #else - printf("%d\n", host_state.SP[i]); + printf("%d\n", __host_SP[i]); #endif } } From 35a3571339245c114a9c651db9478055ee043ad7 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 12:41:46 -0400 Subject: [PATCH 64/88] comment out the debug flag --- test/gpu_tests/test_input/sssp_delta_stepping.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index de20a789..39d68ea0 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -17,7 +17,7 @@ #define SORT_NODES 0 #include -#define DEBUG +//#define DEBUG #ifdef DEBUG #define ITER_COUNT (5) From f06c645ba7c8610c3535bb054105ed3224bb3ee1 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 29 Oct 2019 13:30:22 -0400 Subject: [PATCH 65/88] Got CC working. 
Moved the output frontier enqueing logic inside the UDF --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 3 +- .../midend/gpu_change_tracking_lower.h | 39 +++++ include/graphit/midend/mir.h | 3 + src/backend/codegen_gpu/codegen_gpu.cpp | 154 ++++++++++-------- src/midend/gpu_change_tracking_lower.cpp | 90 ++++++++++ src/midend/mir.cpp | 2 + src/midend/mir_lower.cpp | 8 +- src/runtime_lib/gpu_intrinsics.h | 9 +- src/runtime_lib/infra_gpu/load_balance.h | 5 - src/runtime_lib/infra_gpu/vertex_frontier.h | 40 ++++- 10 files changed, 265 insertions(+), 88 deletions(-) create mode 100644 include/graphit/midend/gpu_change_tracking_lower.h create mode 100644 src/midend/gpu_change_tracking_lower.cpp diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 5b2c0412..cfd7e0ae 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -32,7 +32,7 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { }; -class CodeGenGPU : public mir::MIRVisitor{ +class CodeGenGPU: public mir::MIRVisitor{ public: CodeGenGPU(std::ostream &input_oss, MIRContext *mir_context, std::string module_name_, std::string module_path): oss(input_oss), mir_context_(mir_context), module_name(module_name_) { @@ -41,7 +41,6 @@ class CodeGenGPU : public mir::MIRVisitor{ } int genGPU(); - protected: void indent() { ++indentLevel; } diff --git a/include/graphit/midend/gpu_change_tracking_lower.h b/include/graphit/midend/gpu_change_tracking_lower.h new file mode 100644 index 00000000..e03f19a9 --- /dev/null +++ b/include/graphit/midend/gpu_change_tracking_lower.h @@ -0,0 +1,39 @@ +#ifndef GPU_CHANGE_TRACKING_LOWER_H +#define GPU_CHANGE_TRACKING_LOWER_H + +#include +#include +#include +namespace graphit { +class GPUChangeTrackingLower { +public: + MIRContext *mir_context_; + Schedule *schedule_; + GPUChangeTrackingLower(MIRContext *mir_context, Schedule *schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower (void); + struct UdfArgChangeVisitor: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + MIRContext *mir_context_; + UdfArgChangeVisitor(MIRContext *mir_context): mir_context_(mir_context) { + } + void updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr); + virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + }; + + struct ReductionOpChangeVisitor: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + MIRContext *mir_context_; + mir::EdgeSetApplyExpr::Ptr current_edge_set_apply_expr; + std::string udf_tracking_var; + ReductionOpChangeVisitor(MIRContext *mir_context, std::string tracking_var, mir::EdgeSetApplyExpr::Ptr edge_set_apply_expr): mir_context_(mir_context), udf_tracking_var(tracking_var), current_edge_set_apply_expr(edge_set_apply_expr) { + } + virtual void visit(mir::StmtBlock::Ptr) override; + + + }; +}; +} + +#endif diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index be08d86a..f457f3b9 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -473,6 +473,7 @@ namespace graphit { ReductionOp reduce_op_; std::string tracking_var_name_ = ""; bool is_atomic_ = false; + std::shared_ptr calling_edge_set_apply_expr = nullptr; typedef std::shared_ptr Ptr; @@ -495,6 +496,7 @@ namespace graphit { struct CompareAndSwapStmt : public AssignStmt { Expr::Ptr compare_val_expr; std::string tracking_var_; + std::shared_ptr 
calling_edge_set_apply_expr = nullptr; typedef std::shared_ptr Ptr; @@ -617,6 +619,7 @@ namespace graphit { //TODO: replace this with a statement StmtBlock::Ptr body; + typedef std::shared_ptr Ptr; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index d1e70d8f..f8c19596 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -86,9 +86,13 @@ void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { auto vector_type = mir::to(var_decl->type); assert(vector_type != nullptr); + + mir::Expr::Ptr size_expr = nullptr; + if (vector_type->element_type != nullptr) { + size_expr = mir_context_->getElementCount(vector_type->element_type); + assert(size_expr != nullptr); + } - auto size_expr = mir_context_->getElementCount(vector_type->element_type); - assert(size_expr != nullptr); if (var_decl->initVal != nullptr && mir::isa(var_decl->initVal)) { printIndent(); @@ -98,7 +102,10 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } else { printIndent(); oss << "cudaMalloc(&__device_" << var_decl->name << ", "; - size_expr->accept(this); + if (size_expr != nullptr) + size_expr->accept(this); + else + oss << vector_type->range_indexset; oss << " * sizeof("; vector_type->vector_element_type->accept(this); oss << "));" << std::endl; @@ -115,7 +122,10 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { oss << "__host_" << var_decl->name << " = new "; vector_type->vector_element_type->accept(this); oss << "["; - size_expr->accept(this); + if (size_expr != nullptr) + size_expr->accept(this); + else + oss << vector_type->range_indexset; oss << "];" << std::endl; @@ -250,46 +260,23 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { dedent(); } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); - if (apply_expr->requires_output) { - if (input_function->args.size() == 3) { - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(src, dst, weight)) {" << std::endl; - } else { - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(src, dst)) {" << std::endl; - } - indent(); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { printIndent(); - if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, dst);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, dst);" << std::endl; - dedent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; printIndent(); - oss << "}" << std::endl; + oss << apply_expr->input_function_name << "(src, dst, weight"; } else { - if (input_function->args.size() == 3) { - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - oss << apply_expr->input_function_name << "(src, dst, weight);" << std::endl; - } else { - printIndent(); - oss << apply_expr->input_function_name << "(src, dst);" << std::endl; - } + printIndent(); + oss << apply_expr->input_function_name << "(src, dst"; } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; apply_expr->device_function = load_balancing_arg; - // We are not generating the kernel now because we are directly using the host wrappers from the library -/* - genEdgeSetGlobalKernel(apply_expr); -*/ } @@ -327,48 +314,24 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); - if (apply_expr->requires_output) { - if (input_function->args.size() == 3) { - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(dst, src, weight)) {" << std::endl; - } else { - printIndent(); - oss << "if (" << apply_expr->input_function_name << "(dst, src)) {" << std::endl; - } - - indent(); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { printIndent(); - if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, src);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, src);" << std::endl; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(output_frontier.d_bit_map_output, output_frontier.d_num_elems_output, src);" << std::endl; - dedent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; printIndent(); - oss << "}" << std::endl; + oss << apply_expr->input_function_name << "(dst, src, weight"; } else { - if (input_function->args.size() == 3) { - printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; - printIndent(); - oss << apply_expr->input_function_name << "(dst, src, weight);" << std::endl; - } else { - printIndent(); - oss << apply_expr->input_function_name << "(dst, src);" << std::endl; - } + printIndent(); + oss << apply_expr->input_function_name << "(dst, src"; } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; dedent(); printIndent(); oss << "}" << std::endl; apply_expr->device_function = load_balancing_arg; - // We are not generating the kernel now because we are directly using the host wrappers from the library -/* - genEdgeSetGlobalKernel(apply_expr); -*/ } void CodeGenGPU::genIncludeStmts(void) { @@ -985,16 +948,58 @@ void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { oss << ");" << std::endl; break; } + + if (reduce_stmt->tracking_var_name_ != "") { + mir::EdgeSetApplyExpr::Ptr apply_expr = reduce_stmt->calling_edge_set_apply_expr; + printIndent(); + oss << "if (" << reduce_stmt->tracking_var_name_ << ") {" << std::endl; + indent(); + printIndent(); + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << "gpu_runtime::enqueueVertexSparseQueue(__output_frontier.d_sparse_queue_output, __output_frontier.d_num_elems_output, "; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(__output_frontier.d_byte_map_output, __output_frontier.d_num_elems_output, "; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(__output_frontier.d_bit_map_output, __output_frontier.d_num_elems_output, "; + mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); + tre->index->accept(this); + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } + } void CodeGenGPU::visit(mir::CompareAndSwapStmt::Ptr cas_stmt) { printIndent(); - oss << cas_stmt->tracking_var_ << " = gpu_runtime::CAS(&"; + if (cas_stmt->tracking_var_ != "") + oss << cas_stmt->tracking_var_ << " = "; + oss << "gpu_runtime::CAS(&"; cas_stmt->lhs->accept(this); oss << ", "; 
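Net effect of the changes above on the generated CUDA: the edge-traversal kernel now just forwards to the UDF (appending output_frontier when an output frontier is required), and the enqueue happens inside the UDF, guarded by the tracking variable that the lowering pass introduces. A rough sketch of what a lowered weighted relaxation UDF could look like under a FRONTIER_FUSED schedule (illustrative only; SP is assumed to be the device distance array as in the SSSP test, and result_var0 follows the result_var naming used by the lowering pass):

	static bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight,
			gpu_runtime::VertexFrontier __output_frontier) {
		bool result_var0 = false;
		// atomic min update of the destination distance; true if it improved
		result_var0 = gpu_runtime::writeMin(&SP[dst], SP[src] + weight);
		if (result_var0) {
			gpu_runtime::enqueueVertexSparseQueue(__output_frontier.d_sparse_queue_output,
					__output_frontier.d_num_elems_output, dst);
		}
		return result_var0;
	}
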
cas_stmt->compare_val_expr->accept(this); oss << ", "; cas_stmt->expr->accept(this); oss << ");" << std::endl; + if (cas_stmt->tracking_var_ != "") { + mir::EdgeSetApplyExpr::Ptr apply_expr = cas_stmt->calling_edge_set_apply_expr; + printIndent(); + oss << "if (" << cas_stmt->tracking_var_ << ") {" << std::endl; + indent(); + printIndent(); + if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) + oss << "gpu_runtime::enqueueVertexSparseQueue(__output_frontier.d_sparse_queue_output, __output_frontier.d_num_elems_output, "; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) + oss << "gpu_runtime::enqueueVertexBytemap(__output_frontier.d_byte_map_output, __output_frontier.d_num_elems_output, "; + else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) + oss << "gpu_runtime::enqueueVertexBitmap(__output_frontier.d_bit_map_output, __output_frontier.d_num_elems_output, "; + mir::TensorReadExpr::Ptr tre = mir::to(cas_stmt->lhs); + tre->index->accept(this); + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + } } void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { @@ -1236,6 +1241,11 @@ void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { mir::Expr::Ptr size_expr = mir_context_->getElementCount(vsae->element_type); oss << "gpu_runtime::create_new_vertex_set("; size_expr->accept(this); + oss << ", "; + if (vsae->size_expr == nullptr) + oss << "0"; + else + vsae->size_expr->accept(this); oss << ")"; } void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { @@ -1247,7 +1257,7 @@ void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare oss << ", __device_" << var_name << " + "; tare->index->accept(this); oss << ", sizeof("; - mir::to(target.getType())->element_type->accept(this); + mir::to(target.getType())->vector_element_type->accept(this); oss << "), cudaMemcpyDeviceToHost);" << std::endl; } @@ -1260,7 +1270,7 @@ void CodeGenGPUHost::generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare oss << ", __host_" << var_name << " + "; tare->index->accept(this); oss << ", sizeof("; - mir::to(target.getType())->element_type->accept(this); + mir::to(target.getType())->vector_element_type->accept(this); oss << "), cudaMemcpyHostToDevice);" << std::endl; } void CodeGenGPUHost::visit(mir::StmtBlock::Ptr stmt_block) { diff --git a/src/midend/gpu_change_tracking_lower.cpp b/src/midend/gpu_change_tracking_lower.cpp new file mode 100644 index 00000000..894b85ab --- /dev/null +++ b/src/midend/gpu_change_tracking_lower.cpp @@ -0,0 +1,90 @@ +#include + +namespace graphit { +void GPUChangeTrackingLower::lower(void) { + UdfArgChangeVisitor visitor(mir_context_); + for (auto func: mir_context_->getFunctionList()) { + func->accept(&visitor); + } +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr esae) { + if (esae->requires_output == false) + return; + //assert(func_decl->udf_tracking_var == "" && "Currently, each UDF can only be used by one EdgeSetApply"); + //func_decl->udf_tracking_var = esae->tracking_field; + //func_decl->calling_edge_set_apply_expr = esae; + + mir::VarExpr::Ptr var_expr = mir::to(esae->target); + mir::EdgeSetType::Ptr edge_set_type = mir::to(var_expr->var.getType()); + mir::ElementType::Ptr 
element_type = (*(edge_set_type->vertex_element_type_list))[0]; + mir::VertexSetType::Ptr vertex_set_type = std::make_shared(); + vertex_set_type->element = element_type; + + mir::Var new_arg("__output_frontier", vertex_set_type); + func_decl->args.push_back(new_arg); + + // Now modify all the reduce stmts inside + ReductionOpChangeVisitor visitor(mir_context_, esae->tracking_field, esae); + func_decl->accept(&visitor); +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function_name); + updateUdf(func_decl, pesae); +} +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function_name); + updateUdf(func_decl, pesae); +} + +void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr stmt_block) { + std::vector new_stmts; + for (auto stmt: *(stmt_block->stmts)) { + stmt->accept(this); + if (mir::isa(stmt)) { + mir::ReduceStmt::Ptr reduce_stmt = mir::to(stmt); + if (mir::isa(reduce_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); + reduce_stmt->tracking_var_name_ = result_var_name; + reduce_stmt->calling_edge_set_apply_expr = current_edge_set_apply_expr; + + mir::ScalarType::Ptr scalar_type = std::make_shared(); + scalar_type->type = mir::ScalarType::Type::BOOL; + mir::BoolLiteral::Ptr bool_literal = std::make_shared(); + bool_literal->val = false; + mir::VarDecl::Ptr decl_stmt = std::make_shared(); + decl_stmt->name = result_var_name; + decl_stmt->type = scalar_type; + decl_stmt->initVal = bool_literal; + new_stmts.push_back(decl_stmt); + } + } + } else if (mir::isa(stmt)) { + mir::CompareAndSwapStmt::Ptr cas_stmt = mir::to(stmt); + if (mir::isa(cas_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(cas_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); + cas_stmt->tracking_var_ = result_var_name; + cas_stmt->calling_edge_set_apply_expr = current_edge_set_apply_expr; + + mir::ScalarType::Ptr scalar_type = std::make_shared(); + scalar_type->type = mir::ScalarType::Type::BOOL; + mir::BoolLiteral::Ptr bool_literal = std::make_shared(); + bool_literal->val = false; + mir::VarDecl::Ptr decl_stmt = std::make_shared(); + decl_stmt->name = result_var_name; + decl_stmt->type = scalar_type; + decl_stmt->initVal = bool_literal; + new_stmts.push_back(decl_stmt); + } + + } + } + new_stmts.push_back(stmt); + } + *(stmt_block->stmts) = new_stmts; +} + +} diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index a17aafbe..779d432d 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -711,6 +711,7 @@ namespace graphit { reduce_op_ = stmt->reduce_op_; tracking_var_name_ = stmt->tracking_var_name_; is_atomic_ = stmt->is_atomic_; + calling_edge_set_apply_expr = stmt->calling_edge_set_apply_expr; } @@ -724,6 +725,7 @@ namespace graphit { auto stmt = to(node); compare_val_expr = stmt->compare_val_expr->clone(); tracking_var_ = stmt->tracking_var_; + calling_edge_set_apply_expr = stmt->calling_edge_set_apply_expr; } diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index 1e97ebbc..34f9b94f 
100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -69,7 +70,12 @@ namespace graphit { // This pass generates code for tracking if a field has been modified // during the execution of the edgeset apply functions. // It return values for implicit tracking of changes to certain field - ChangeTrackingLower(mir_context, schedule).lower(); + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + // No change tracking lower for GPUs + GPUChangeTrackingLower(mir_context, schedule).lower(); + } else { + ChangeTrackingLower(mir_context, schedule).lower(); + } // This pass extracts the merge field and reduce operator. If numa_aware is set to true in // the schedule for the corresponding label, it also adds NUMA optimization diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index ae16c9aa..0d3d8c17 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -16,12 +16,17 @@ namespace gpu_runtime { template -static void deleteObject(T &t) { +void deleteObject(T &t) { // Currently deleteObject is empty } +template <> +void deleteObject(VertexFrontier &t) { + delete_vertex_frontier(t); +} + template -static __device__ void device_deleteObject(T &t) { +__device__ void device_deleteObject(T &t) { // Currently deleteObject is empty } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 59ea5174..ee699371 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -176,11 +176,6 @@ static void __device__ TWCE_load_balance(GraphT &graph, VertexFr } else local_vertex = -1; __syncthreads(); -/* - degree = __shfl_sync((uint32_t)-1, degree, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - s1_offset = __shfl_sync((uint32_t)-1, s1_offset, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); - local_vertex = __shfl_sync((uint32_t)-1, local_vertex, (lane_id / STAGE_1_SIZE) * STAGE_1_SIZE, 32); -*/ if (local_vertex_idx < total_vertices) { // STAGE 1 for (int32_t neigh_id = s1_offset + (lane_id % STAGE_1_SIZE); neigh_id < degree + s1_offset; neigh_id += STAGE_1_SIZE) { diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index d7d11e76..ffa7ea4f 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -37,6 +37,19 @@ class VertexFrontier { }; + +void delete_vertex_frontier(VertexFrontier &frontier) { + cudaFree(frontier.d_sparse_queue_input); + cudaFree(frontier.d_sparse_queue_output); + cudaFree(frontier.d_num_elems_input); + cudaFree(frontier.d_num_elems_output); + cudaFree(frontier.d_byte_map_input); + cudaFree(frontier.d_byte_map_output); + cudaFree(frontier.d_bit_map_input); + cudaFree(frontier.d_bit_map_output); + cudaFree(frontier.d_dedup_counters); + return; +} static VertexFrontier sentinel_frontier; static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { @@ -72,15 +85,29 @@ class AccessorAll { return frontier.max_num_elems; } }; -static VertexFrontier create_new_vertex_set(int32_t num_vertices) { + +void __global__ initialize_frontier_all(VertexFrontier frontier) { + for (int32_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < frontier.max_num_elems; idx += blockDim.x * gridDim.x) + frontier.d_sparse_queue_input[idx] = idx; + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + frontier.d_num_elems_input[0] = frontier.max_num_elems; + } 
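	// Usage note (illustrative, not part of this patch): with the optional second
	// argument added to create_new_vertex_set just below, callers can request either
	// an empty frontier or one pre-populated with every vertex, e.g.
	//   VertexFrontier empty = create_new_vertex_set(n);       // d_num_elems_input stays 0
	//   VertexFrontier full  = create_new_vertex_set(n, n);    // runs initialize_frontier_all
	// Only init_elems == num_vertices takes the kernel path; any other value currently
	// falls back to the empty initialization.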
+} +static VertexFrontier create_new_vertex_set(int32_t num_vertices, int32_t init_elems = 0) { VertexFrontier frontier; + frontier.max_num_elems = num_vertices; cudaMalloc(&frontier.d_num_elems_input, sizeof(int32_t)); cudaMalloc(&frontier.d_num_elems_output, sizeof(int32_t)); - cudaMemset(frontier.d_num_elems_input, 0, sizeof(int32_t)); - cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); - cudaMalloc(&frontier.d_sparse_queue_input, sizeof(int32_t) * num_vertices * 6); cudaMalloc(&frontier.d_sparse_queue_output, sizeof(int32_t) * num_vertices * 6); + + if (num_vertices == init_elems) { + initialize_frontier_all<<>>(frontier); + } else { + cudaMemset(frontier.d_num_elems_input, 0, sizeof(int32_t)); + } + cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); + cudaMalloc(&frontier.d_byte_map_input, sizeof(unsigned char) * num_vertices); cudaMalloc(&frontier.d_byte_map_output, sizeof(unsigned char) * num_vertices); @@ -96,7 +123,6 @@ static VertexFrontier create_new_vertex_set(int32_t num_vertices) { cudaMemset(frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); cudaCheckLastError(); - frontier.max_num_elems = num_vertices; frontier.curr_dedup_counter = 0; cudaMalloc(&frontier.d_dedup_counters, sizeof(int32_t) * num_vertices); @@ -255,7 +281,7 @@ static void __device__ vertex_set_create_reverse_sparse_queue(VertexFrontier &fr } } template -static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier &frontier) { +static void __global__ vertex_set_create_reverse_sparse_queue_kernel(VertexFrontier frontier) { vertex_set_create_reverse_sparse_queue(frontier); } @@ -271,6 +297,8 @@ static void __device__ vertex_set_create_reverse_sparse_queue_device(VertexFront this_grid().sync(); swap_queues_device(frontier); } +static void foo_bar(void) { +} } From a10baa00c586c5a6ce1987df8c2bda01e7530b4d Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 14:43:42 -0400 Subject: [PATCH 66/88] refactoring the finished function to dequeue when the current frontier is empty --- .../infra_gpu/gpu_priority_queue.h | 37 ++++++++++++++++--- .../test_input/sssp_delta_stepping.cu | 18 ++++----- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index a32d7121..eba406d7 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -38,10 +38,9 @@ namespace gpu_runtime { device_priorities_ = device_priorities; current_priority_ = initial_priority; delta_ = delta; + ready_set_dequeued = false; if (initial_node != -1){ - //if (frontier_ != {0}){ gpu_runtime::builtin_addVertex(frontier_, initial_node); - //} } } @@ -49,8 +48,18 @@ namespace gpu_runtime { } - bool finished() { - return current_priority_ == INT_MAX; + bool finished(GPUPriorityQueue * device_gpq) { + if (current_priority_ == INT_MAX){ + return true; + } + + if (!ready_set_dequeued && gpu_runtime::builtin_getVertexSetSize(frontier_) == 0){ + dequeueReadySet(device_gpq); + ready_set_dequeued = true; + return current_priority_ == INT_MAX; + } + + return false; } bool host_finishedNode(NodeID v){ @@ -62,7 +71,16 @@ namespace gpu_runtime { } - void dequeueReadySet(GPUPriorityQueue * device_gpq){ + VertexFrontier dequeueReadySet(GPUPriorityQueue * device_gpq){ + // if this is already dequeued in the previous finish() operator + // then don't do the dequeu operation again + if (ready_set_dequeued){ + //Now that we 
dequeued it, the next ready set is no longer dequeued + ready_set_dequeued = false; + return frontier_; + } + + //perform the dequeue operation only if the current frontier is empty if (gpu_runtime::builtin_getVertexSetSize(frontier_) == 0) { window_upper_ = current_priority_ + delta_; current_priority_ = INT_MAX; @@ -81,7 +99,14 @@ namespace gpu_runtime { gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_queues(frontier_); frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + + //Now that we dequeued it, the next ready set is no longer dequeued + ready_set_dequeued = false; + return frontier_; } + + //if it is empty, just return the empty frontier + return frontier_; } PriorityT_* host_priorities_ = nullptr; @@ -93,7 +118,7 @@ namespace gpu_runtime { //Need to do = {0} to avoid dynamic initialization error VertexFrontier frontier_ = {0}; - + bool ready_set_dequeued = false; }; diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 39d68ea0..f7549aed 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -102,20 +102,18 @@ int main(int argc, char *argv[]) { //printf("Init time = %f\n", t); iter_total+=t; - //std::cout << "frontier size: " << gpu_runtime::builtin_getVertexSetSize(host_gpq.frontier_) << std::endl; + gpu_runtime::GPUPriorityQueue * tmp_gpq; + cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); - //while(gpu_runtime::builtin_getVertexSetSize(frontier) != (0)){ - while(! host_gpq.finished()){ + while(! host_gpq.finished(tmp_gpq)){ startTimer(); iters++; - - gpu_runtime::GPUPriorityQueue * tmp_gpq; - cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); - host_gpq.dequeueReadySet(tmp_gpq); - if (host_gpq.finished()){ - break; - } + gpu_runtime::VertexFrontier frontier = host_gpq.dequeueReadySet(tmp_gpq); + + //if (host_gpq.finished()){ + // break; + //} gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); From 23b8c83967fc110eb4a834064991cffaf6766a67 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 15:56:42 -0400 Subject: [PATCH 67/88] using reference for the frontier --- .../infra_gpu/gpu_priority_queue.h | 5 +++-- .../test_input/sssp_delta_stepping.cu | 21 ++++++++----------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index eba406d7..22f1ad0f 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -33,12 +33,13 @@ namespace gpu_runtime { return current_priority_; } - void init(PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ + void init(GraphT graph, PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ host_priorities_ = host_priorities; device_priorities_ = device_priorities; current_priority_ = initial_priority; delta_ = delta; ready_set_dequeued = false; + frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); if (initial_node != -1){ gpu_runtime::builtin_addVertex(frontier_, initial_node); } @@ -71,7 +72,7 @@ namespace gpu_runtime { } - VertexFrontier dequeueReadySet(GPUPriorityQueue * device_gpq){ + VertexFrontier& dequeueReadySet(GPUPriorityQueue * 
device_gpq){ // if this is already dequeued in the previous finish() operator // then don't do the dequeu operation again if (ready_set_dequeued){ diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index f7549aed..866178bf 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -83,12 +83,12 @@ int main(int argc, char *argv[]) { for (int outer = 0; outer < ITER_COUNT; outer++) { float iter_total = 0; //this sets it to Sparse - host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + //host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); startTimer(); - host_gpq.init(__host_SP, __device_SP, 0, delta, start_vertex); + host_gpq.init(graph, __host_SP, __device_SP, 0, delta, start_vertex); cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); @@ -109,22 +109,19 @@ int main(int argc, char *argv[]) { startTimer(); iters++; - gpu_runtime::VertexFrontier frontier = host_gpq.dequeueReadySet(tmp_gpq); + gpu_runtime::VertexFrontier& frontier = host_gpq.dequeueReadySet(tmp_gpq); - //if (host_gpq.finished()){ - // break; - //} - - gpu_runtime::vertex_set_prepare_sparse(host_gpq.frontier_); + gpu_runtime::vertex_set_prepare_sparse(frontier); cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); gpu_runtime::cudaCheckLastError(); - gpu_runtime::TWCE_load_balance_host(graph, host_gpq.frontier_, host_gpq.frontier_); + gpu_runtime::TWCE_load_balance_host(graph, frontier, frontier); gpu_runtime::cudaCheckLastError(); - - gpu_runtime::swap_bytemaps(host_gpq.frontier_); + + gpu_runtime::swap_bytemaps(frontier); // set the input to the prepare function - host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + frontier.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + cudaDeviceSynchronize(); t = stopTimer(); From 156a3f3e5867377dad5c34ff59ceb6a9c637e294 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 29 Oct 2019 16:21:59 -0400 Subject: [PATCH 68/88] Added the EnqueueVertex operator and fixed the implementation for BFS --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 2 + .../midend/gpu_change_tracking_lower.h | 3 +- include/graphit/midend/mir.h | 15 ++- include/graphit/midend/mir_rewriter.h | 1 + include/graphit/midend/mir_visitor.h | 2 + src/backend/codegen_gpu/codegen_gpu.cpp | 61 +++++------- src/midend/gpu_change_tracking_lower.cpp | 93 +++++++++++++++++-- src/midend/mir.cpp | 14 ++- src/midend/mir_rewriter.cpp | 5 + src/midend/mir_visitor.cpp | 4 + src/runtime_lib/infra_gpu/support.h | 2 + 11 files changed, 152 insertions(+), 50 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index cfd7e0ae..a799ff6f 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -130,6 +130,8 @@ class CodeGenGPU: public mir::MIRVisitor{ virtual void visit(mir::VertexSetDedupExpr::Ptr) override; virtual void visit(mir::HybridGPUStmt::Ptr) override; + virtual void visit(mir::EnqueueVertex::Ptr) override; + }; class CodeGenGPUHost: public CodeGenGPU { diff --git a/include/graphit/midend/gpu_change_tracking_lower.h b/include/graphit/midend/gpu_change_tracking_lower.h index e03f19a9..42c122fd 100644 --- 
a/include/graphit/midend/gpu_change_tracking_lower.h +++ b/include/graphit/midend/gpu_change_tracking_lower.h @@ -27,7 +27,8 @@ class GPUChangeTrackingLower { MIRContext *mir_context_; mir::EdgeSetApplyExpr::Ptr current_edge_set_apply_expr; std::string udf_tracking_var; - ReductionOpChangeVisitor(MIRContext *mir_context, std::string tracking_var, mir::EdgeSetApplyExpr::Ptr edge_set_apply_expr): mir_context_(mir_context), udf_tracking_var(tracking_var), current_edge_set_apply_expr(edge_set_apply_expr) { + mir::Type::Ptr frontier_type; + ReductionOpChangeVisitor(MIRContext *mir_context, std::string tracking_var, mir::EdgeSetApplyExpr::Ptr edge_set_apply_expr, mir::Type::Ptr type): mir_context_(mir_context), udf_tracking_var(tracking_var), current_edge_set_apply_expr(edge_set_apply_expr), frontier_type(type) { } virtual void visit(mir::StmtBlock::Ptr) override; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index f457f3b9..bd77dcff 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -473,7 +473,6 @@ namespace graphit { ReductionOp reduce_op_; std::string tracking_var_name_ = ""; bool is_atomic_ = false; - std::shared_ptr calling_edge_set_apply_expr = nullptr; typedef std::shared_ptr Ptr; @@ -496,7 +495,6 @@ namespace graphit { struct CompareAndSwapStmt : public AssignStmt { Expr::Ptr compare_val_expr; std::string tracking_var_; - std::shared_ptr calling_edge_set_apply_expr = nullptr; typedef std::shared_ptr Ptr; @@ -1567,6 +1565,19 @@ namespace graphit { virtual void copy(MIRNode::Ptr); virtual MIRNode::Ptr cloneNode(); }; + struct EnqueueVertex: Stmt { + Expr::Ptr vertex_id; + Expr::Ptr vertex_frontier; + enum class Type {SPARSE, BOOLMAP, BITMAP}; + Type type; + typedef std::shared_ptr Ptr; + virtual void accept(MIRVisitor *visitor) { + visitor->visit(self()); + } + protected: + virtual void copy(MIRNode::Ptr); + virtual MIRNode::Ptr cloneNode(); + }; } } diff --git a/include/graphit/midend/mir_rewriter.h b/include/graphit/midend/mir_rewriter.h index b915bfe5..48652d88 100644 --- a/include/graphit/midend/mir_rewriter.h +++ b/include/graphit/midend/mir_rewriter.h @@ -157,6 +157,7 @@ namespace graphit { // GPU Additions virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); template std::shared_ptr rewrite(std::shared_ptr ptr) { diff --git a/include/graphit/midend/mir_visitor.h b/include/graphit/midend/mir_visitor.h index 557fa1bc..5e1d8e34 100644 --- a/include/graphit/midend/mir_visitor.h +++ b/include/graphit/midend/mir_visitor.h @@ -115,6 +115,7 @@ namespace graphit { // GPU Additions struct VertexSetDedupExpr; struct HybridGPUStmt; + struct EnqueueVertex; struct MIRVisitor { @@ -264,6 +265,7 @@ namespace graphit { // GPU Additions virtual void visit(std::shared_ptr); virtual void visit(std::shared_ptr); + virtual void visit(std::shared_ptr); protected: diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index f8c19596..86ada518 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -949,27 +949,31 @@ void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { break; } - if (reduce_stmt->tracking_var_name_ != "") { - mir::EdgeSetApplyExpr::Ptr apply_expr = reduce_stmt->calling_edge_set_apply_expr; - printIndent(); - oss << "if (" << reduce_stmt->tracking_var_name_ << ") {" << std::endl; - indent(); - printIndent(); - if (apply_expr->applied_schedule.frontier_creation == 
fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(__output_frontier.d_sparse_queue_output, __output_frontier.d_num_elems_output, "; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(__output_frontier.d_byte_map_output, __output_frontier.d_num_elems_output, "; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(__output_frontier.d_bit_map_output, __output_frontier.d_num_elems_output, "; - mir::TensorReadExpr::Ptr tre = mir::to(reduce_stmt->lhs); - tre->index->accept(this); - oss << ");" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - } +} +void CodeGenGPU::visit(mir::EnqueueVertex::Ptr enqueue_vertex) { + printIndent(); + if (enqueue_vertex->type == mir::EnqueueVertex::Type::SPARSE) { + oss << "gpu_runtime::enqueueVertexSparseQueue("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_sparse_queue_output"; + } else if (enqueue_vertex->type == mir::EnqueueVertex::Type::BOOLMAP) { + oss << "gpu_runtime::enqueueVertexBytemap("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_byte_map_output"; + } else if (enqueue_vertex->type == mir::EnqueueVertex::Type::BITMAP) { + oss << "gpu_runtime::enqueueVertexBitmap("; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_bit_map_output"; + } + oss << ", "; + enqueue_vertex->vertex_frontier->accept(this); + oss << ".d_num_elems_output, "; + enqueue_vertex->vertex_id->accept(this); + oss << ");" << std::endl; + } + void CodeGenGPU::visit(mir::CompareAndSwapStmt::Ptr cas_stmt) { printIndent(); if (cas_stmt->tracking_var_ != "") @@ -981,25 +985,6 @@ void CodeGenGPU::visit(mir::CompareAndSwapStmt::Ptr cas_stmt) { oss << ", "; cas_stmt->expr->accept(this); oss << ");" << std::endl; - if (cas_stmt->tracking_var_ != "") { - mir::EdgeSetApplyExpr::Ptr apply_expr = cas_stmt->calling_edge_set_apply_expr; - printIndent(); - oss << "if (" << cas_stmt->tracking_var_ << ") {" << std::endl; - indent(); - printIndent(); - if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) - oss << "gpu_runtime::enqueueVertexSparseQueue(__output_frontier.d_sparse_queue_output, __output_frontier.d_num_elems_output, "; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) - oss << "gpu_runtime::enqueueVertexBytemap(__output_frontier.d_byte_map_output, __output_frontier.d_num_elems_output, "; - else if (apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) - oss << "gpu_runtime::enqueueVertexBitmap(__output_frontier.d_bit_map_output, __output_frontier.d_num_elems_output, "; - mir::TensorReadExpr::Ptr tre = mir::to(cas_stmt->lhs); - tre->index->accept(this); - oss << ");" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - } } void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { diff --git a/src/midend/gpu_change_tracking_lower.cpp b/src/midend/gpu_change_tracking_lower.cpp index 894b85ab..d4b8bc3c 100644 --- a/src/midend/gpu_change_tracking_lower.cpp +++ b/src/midend/gpu_change_tracking_lower.cpp @@ -10,9 +10,6 @@ void 
GPUChangeTrackingLower::lower(void) { void GPUChangeTrackingLower::UdfArgChangeVisitor::updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr esae) { if (esae->requires_output == false) return; - //assert(func_decl->udf_tracking_var == "" && "Currently, each UDF can only be used by one EdgeSetApply"); - //func_decl->udf_tracking_var = esae->tracking_field; - //func_decl->calling_edge_set_apply_expr = esae; mir::VarExpr::Ptr var_expr = mir::to(esae->target); mir::EdgeSetType::Ptr edge_set_type = mir::to(var_expr->var.getType()); @@ -24,7 +21,7 @@ void GPUChangeTrackingLower::UdfArgChangeVisitor::updateUdf(mir::FuncDecl::Ptr f func_decl->args.push_back(new_arg); // Now modify all the reduce stmts inside - ReductionOpChangeVisitor visitor(mir_context_, esae->tracking_field, esae); + ReductionOpChangeVisitor visitor(mir_context_, esae->tracking_field, esae, vertex_set_type); func_decl->accept(&visitor); } void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PushEdgeSetApplyExpr::Ptr pesae) { @@ -40,6 +37,7 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr std::vector new_stmts; for (auto stmt: *(stmt_block->stmts)) { stmt->accept(this); + bool stmt_added = false; if (mir::isa(stmt)) { mir::ReduceStmt::Ptr reduce_stmt = mir::to(stmt); if (mir::isa(reduce_stmt->lhs)) { @@ -47,7 +45,6 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); reduce_stmt->tracking_var_name_ = result_var_name; - reduce_stmt->calling_edge_set_apply_expr = current_edge_set_apply_expr; mir::ScalarType::Ptr scalar_type = std::make_shared(); scalar_type->type = mir::ScalarType::Type::BOOL; @@ -58,6 +55,35 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr decl_stmt->type = scalar_type; decl_stmt->initVal = bool_literal; new_stmts.push_back(decl_stmt); + new_stmts.push_back(stmt); + + // Now construct the conditional enqueue + mir::Var tracking_var(result_var_name, scalar_type); + mir::VarExpr::Ptr condition_expr = std::make_shared(); + condition_expr->var = tracking_var; + mir::IfStmt::Ptr if_stmt = std::make_shared(); + if_stmt->cond = condition_expr; + + mir::StmtBlock::Ptr stmt_block = std::make_shared(); + if_stmt->ifBody = stmt_block; + + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + stmt_block->insertStmtEnd(enqueue_vertex); + if_stmt->elseBody = nullptr; + new_stmts.push_back(if_stmt); + stmt_added = true; } } } else if 
(mir::isa(stmt)) { @@ -67,7 +93,6 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { std::string result_var_name = "result_var" + mir_context_->getUniqueNameCounterString(); cas_stmt->tracking_var_ = result_var_name; - cas_stmt->calling_edge_set_apply_expr = current_edge_set_apply_expr; mir::ScalarType::Ptr scalar_type = std::make_shared(); scalar_type->type = mir::ScalarType::Type::BOOL; @@ -78,11 +103,65 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr decl_stmt->type = scalar_type; decl_stmt->initVal = bool_literal; new_stmts.push_back(decl_stmt); + new_stmts.push_back(stmt); + + // Now construct the conditional enqueue + mir::Var tracking_var(result_var_name, scalar_type); + mir::VarExpr::Ptr condition_expr = std::make_shared(); + condition_expr->var = tracking_var; + mir::IfStmt::Ptr if_stmt = std::make_shared(); + if_stmt->cond = condition_expr; + + mir::StmtBlock::Ptr stmt_block = std::make_shared(); + if_stmt->ifBody = stmt_block; + + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + stmt_block->insertStmtEnd(enqueue_vertex); + if_stmt->elseBody = nullptr; + new_stmts.push_back(if_stmt); + stmt_added = true; } } + } else if (mir::isa(stmt)) { + mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); + if (mir::isa(assign_stmt->lhs)) { + mir::TensorReadExpr::Ptr tre = mir::to(assign_stmt->lhs); + if (mir::isa(tre->target) && mir::to(tre->target)->var.getName() == udf_tracking_var) { + new_stmts.push_back(stmt); + mir::EnqueueVertex::Ptr enqueue_vertex = std::make_shared(); + mir::Var frontier_var("__output_frontier", frontier_type); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = frontier_var; + enqueue_vertex->vertex_id = tre->index; + enqueue_vertex->vertex_frontier = frontier_expr; + if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + enqueue_vertex->type = mir::EnqueueVertex::Type::BITMAP; + } + new_stmts.push_back(enqueue_vertex); + stmt_added = true; + } + } + } - 
new_stmts.push_back(stmt); + if (!stmt_added) + new_stmts.push_back(stmt); } *(stmt_block->stmts) = new_stmts; } diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 779d432d..22dd4174 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -711,7 +711,6 @@ namespace graphit { reduce_op_ = stmt->reduce_op_; tracking_var_name_ = stmt->tracking_var_name_; is_atomic_ = stmt->is_atomic_; - calling_edge_set_apply_expr = stmt->calling_edge_set_apply_expr; } @@ -725,7 +724,6 @@ namespace graphit { auto stmt = to(node); compare_val_expr = stmt->compare_val_expr->clone(); tracking_var_ = stmt->tracking_var_; - calling_edge_set_apply_expr = stmt->calling_edge_set_apply_expr; } @@ -973,5 +971,17 @@ namespace graphit { return node; } + void EnqueueVertex::copy(MIRNode::Ptr node) { + const auto op = mir::to(node); + vertex_id = op->vertex_id; + vertex_frontier = op->vertex_frontier; + type = op->type; + } + MIRNode::Ptr EnqueueVertex::cloneNode() { + const auto node = std::make_shared(); + node->copy(shared_from_this()); + return node; + } + } } diff --git a/src/midend/mir_rewriter.cpp b/src/midend/mir_rewriter.cpp index 94714486..a437b39f 100644 --- a/src/midend/mir_rewriter.cpp +++ b/src/midend/mir_rewriter.cpp @@ -421,6 +421,11 @@ namespace graphit { stmt->stmt2 = rewrite(stmt->stmt2); node = stmt; } + void MIRRewriter::visit(EnqueueVertex::Ptr stmt) { + stmt->vertex_id = rewrite(stmt->vertex_id); + stmt->vertex_frontier = rewrite(stmt->vertex_frontier); + node = stmt; + } } } diff --git a/src/midend/mir_visitor.cpp b/src/midend/mir_visitor.cpp index ccf62752..2db996b0 100644 --- a/src/midend/mir_visitor.cpp +++ b/src/midend/mir_visitor.cpp @@ -380,5 +380,9 @@ namespace graphit { op->stmt1->accept(this); op->stmt2->accept(this); } + void MIRVisitor::visit(std::shared_ptr op) { + op->vertex_id->accept(this); + op->vertex_frontier->accept(this); + } } } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index 226a561a..4358a9eb 100644 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -39,6 +39,8 @@ static bool __device__ writeAdd(T *dst, T src) { } template static bool __device__ CAS(T *dst, T old_val, const T &new_val) { + if (*dst != old_val) + return false; return old_val == atomicCAS(dst, old_val, new_val); } static void __device__ parallel_memset(unsigned char* dst, unsigned char val, size_t total_bytes) { From 437ecb04ff3576ccf94b38bd3d9d30fdb2ec0913 Mon Sep 17 00:00:00 2001 From: "zhangyunming1990@gmail.com" Date: Tue, 29 Oct 2019 19:11:23 -0400 Subject: [PATCH 69/88] refactor with updatePriorityMin on the priority queue --- src/runtime_lib/infra_gpu/gpu_priority_queue.h | 9 +++++++-- test/gpu_tests/test_input/sssp_delta_stepping.cu | 15 ++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 22f1ad0f..1157af77 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -45,8 +45,13 @@ namespace gpu_runtime { } } - void updatePriorityMin(PriorityT_ priority_change_){ - + void __device__ updatePriorityMin(GPUPriorityQueue * device_gpq, PriorityT_ new_priority, VertexFrontier output_frontier, int32_t node){ + bool output = gpu_runtime::writeMin(&(device_gpq->device_priorities_[node]), new_priority); + if (device_gpq->device_priorities_[node] >= (device_gpq->current_priority_ + device_gpq->delta_)) return; + if (output){ + 
enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, node); + } + } bool finished(GPUPriorityQueue * device_gpq) { diff --git a/test/gpu_tests/test_input/sssp_delta_stepping.cu b/test/gpu_tests/test_input/sssp_delta_stepping.cu index 866178bf..4f85f5e3 100644 --- a/test/gpu_tests/test_input/sssp_delta_stepping.cu +++ b/test/gpu_tests/test_input/sssp_delta_stepping.cu @@ -45,22 +45,27 @@ void __global__ init_kernel(gpu_runtime::GraphT graph, int start_v) { } } -bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { - bool output2; - bool SP_trackving_var_1 = 0; +/*bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { + bool output2; + bool SP_trackving_var_1 = 0; SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); output2 = SP_trackving_var_1; if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; return output2; + }*/ + +void __device__ deviceUpdateEdge(int32_t src, int32_t dst, int32_t weight, gpu_runtime::VertexFrontier output_frontier){ + device_gpq.updatePriorityMin(&device_gpq, (SP[src] + weight), output_frontier, dst); } template void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { // Body of the actual operator code EdgeWeightType weight = graph.d_edge_weight[edge_id]; - if (updateEdge(src, dst, weight)){ + deviceUpdateEdge(src, dst, weight, output_frontier); + /*if (updateEdge(src, dst, weight)){ gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); - } + }*/ } void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { From 03035399ae9af4517ac8e4b71708d77602eafdb2 Mon Sep 17 00:00:00 2001 From: Yunming Zhang Date: Thu, 31 Oct 2019 17:17:33 -0400 Subject: [PATCH 70/88] adding a check to make sure only eager or eager with merge schedule would lower into ordered processing operator --- src/midend/priority_features_lowering.cpp | 11 +++++++---- test/c++/high_level_schedule_test.cpp | 10 ++++++++++ test/c++/test.cpp | 4 ++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/midend/priority_features_lowering.cpp b/src/midend/priority_features_lowering.cpp index 78135e1e..ac7cc10a 100644 --- a/src/midend/priority_features_lowering.cpp +++ b/src/midend/priority_features_lowering.cpp @@ -49,10 +49,13 @@ namespace graphit { function->accept(&lower_update_priority_edge_set_apply_expr); } - // Detect pattern for OrderedProcessingOperator, and lower into the MIR node for OrderedProcessingOp - auto lower_ordered_processing_op = LowerIntoOrderedProcessingOperatorRewriter(schedule_, mir_context_); - for (auto function : functions) { - lower_ordered_processing_op.rewrite(function); + if (mir_context_->priority_update_type == mir::PriorityUpdateType::EagerPriorityUpdateWithMerge || + mir_context_->priority_update_type == mir::PriorityUpdateType::EagerPriorityUpdate){ + // Detect pattern for OrderedProcessingOperator, and lower into the MIR node for OrderedProcessingOp + auto lower_ordered_processing_op = LowerIntoOrderedProcessingOperatorRewriter(schedule_, mir_context_); + for (auto function : functions) { + lower_ordered_processing_op.rewrite(function); + } } // Lowers into PriorityUpdateOperators (PriorityUpdateMin and PriorityUpdateSum) diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index fffc75c6..0edc6421 100644 
--- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -2385,6 +2385,16 @@ TEST_F(HighLevelScheduleTest, SSSP_LabelProp_GPUScheduleTest) { EXPECT_EQ (0, basicTestWithGPUSchedule(program)); } +TEST_F(HighLevelScheduleTest, DeltaSteppingWithDefaultGPUSchedule) { + istringstream is (delta_stepping_str_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + fir::gpu_schedule::SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); + EXPECT_EQ (0, basicTestWithGPUSchedule(program)); +} + TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { using namespace fir::gpu_schedule; diff --git a/test/c++/test.cpp b/test/c++/test.cpp index 3c281351..4a30e779 100644 --- a/test/c++/test.cpp +++ b/test/c++/test.cpp @@ -56,11 +56,11 @@ int main(int argc, char **argv) { // // ::testing::GTEST_FLAG(filter) = "LowLevelScheduleTest.SimpleApplyFunctionFusion"; -// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.UnorderedKCoreSparsePushDensePullParallel"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingWithDefaultGPUSchedule"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.UnorderedKCoreSparsePushParallel"; -// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSparsePushParallel"; +// ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.DeltaSteppingDensePullParallel"; // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSparsePushSerial"; // // ::testing::GTEST_FLAG(filter) = "HighLevelScheduleTest.KCoreSumReduceBeforeUpdate"; From b2cdeccaa119ff2a0263aed16fa6d14e7bb9079d Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 5 Nov 2019 15:47:36 -0500 Subject: [PATCH 71/88] Fixed all codegen for SSSP_delta_stepping --- .../codegen_gpu/assign_function_context.h | 1 + .../graphit/backend/codegen_gpu/codegen_gpu.h | 4 + include/graphit/frontend/gpu_schedule.h | 15 + .../midend/gpu_change_tracking_lower.h | 1 + .../midend/gpu_priority_features_lowering.h | 57 ++++ .../gpu_vector_field_properties_analyzer.h | 3 + include/graphit/midend/mir.h | 3 + include/graphit/midend/mir_context.h | 11 + .../midend/priority_queue_frontier_reuse.h | 10 + .../codegen_gpu/assign_function_context.cpp | 8 + src/backend/codegen_gpu/codegen_gpu.cpp | 173 +++++++++- src/midend/apply_expr_lower.cpp | 1 + src/midend/gpu_change_tracking_lower.cpp | 4 + src/midend/gpu_priority_features_lowering.cpp | 117 +++++++ .../gpu_vector_field_properties_analyzer.cpp | 31 ++ src/midend/mir.cpp | 1 + src/midend/mir_emitter.cpp | 5 +- src/midend/mir_lower.cpp | 10 +- .../infra_gpu/gpu_priority_queue.h | 306 +++++++++--------- test/gpu_tests/all_gpu_tests.py | 3 + .../sssp_delta_stepping_frontier_byval.cu | 172 ++++++++++ 21 files changed, 763 insertions(+), 173 deletions(-) create mode 100644 include/graphit/midend/gpu_priority_features_lowering.h create mode 100644 include/graphit/midend/priority_queue_frontier_reuse.h create mode 100644 src/midend/gpu_priority_features_lowering.cpp create mode 100644 test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h index 1a014df8..ff1264e7 100644 --- a/include/graphit/backend/codegen_gpu/assign_function_context.h +++ b/include/graphit/backend/codegen_gpu/assign_function_context.h @@ -17,6 +17,7 @@ class AssignFunctionContext : mir::MIRVisitor { int 
assign_function_context(void); protected: void visit(mir::PushEdgeSetApplyExpr::Ptr); + void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); void visit(mir::PullEdgeSetApplyExpr::Ptr); void visit(mir::VertexSetApplyExpr::Ptr); private: diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index a799ff6f..d9d150e9 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -27,6 +27,7 @@ class CodeGenGPUKernelEmitter: public mir::MIRVisitor { void visit(mir::PushEdgeSetApplyExpr::Ptr); void visit(mir::PullEdgeSetApplyExpr::Ptr); + void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); void genEdgeSetGlobalKernel(mir::EdgeSetApplyExpr::Ptr); @@ -89,6 +90,7 @@ class CodeGenGPU: public mir::MIRVisitor{ void generateBinaryExpr(mir::BinaryExpr::Ptr, std::string); protected: virtual void visit(mir::EdgeSetType::Ptr) override; + virtual void visit(mir::PriorityQueueType::Ptr) override; virtual void visit(mir::VertexSetType::Ptr) override; virtual void visit(mir::ScalarType::Ptr) override; virtual void visit(mir::FuncDecl::Ptr) override; @@ -114,6 +116,7 @@ class CodeGenGPU: public mir::MIRVisitor{ virtual void visit(mir::ReduceStmt::Ptr) override; virtual void visit(mir::CompareAndSwapStmt::Ptr) override; + virtual void visit(mir::VarDecl::Ptr) override; virtual void visit(mir::ForStmt::Ptr) override; @@ -132,6 +135,7 @@ class CodeGenGPU: public mir::MIRVisitor{ virtual void visit(mir::EnqueueVertex::Ptr) override; + void genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr); }; class CodeGenGPUHost: public CodeGenGPU { diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index 3157c962..ab255844 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -95,6 +95,8 @@ class SimpleGPUSchedule: public GPUSchedule { edge_blocking_type edge_blocking; uint32_t edge_blocking_size; kernel_fusion_type kernel_fusion; + + int32_t delta; SimpleGPUSchedule () { direction = direction_type::DIR_PUSH; @@ -105,6 +107,7 @@ class SimpleGPUSchedule: public GPUSchedule { edge_blocking = edge_blocking_type::UNBLOCKED; edge_blocking_size = 0; kernel_fusion = kernel_fusion_type::FUSION_DISABLED; + delta = 1; } public: @@ -219,6 +222,18 @@ class SimpleGPUSchedule: public GPUSchedule { } } + void configDelta(int32_t d) { + if (d <= 0) + assert(false && "Invalid option for configDelta"); + delta = d; + } + void configDelta(const char* d) { + if (sscanf(d, "argv[%i]", &delta) != 1) { + assert(false && "Invalid option for configDelta"); + } + delta *= -1; + } + }; class HybridGPUSchedule: public GPUSchedule { diff --git a/include/graphit/midend/gpu_change_tracking_lower.h b/include/graphit/midend/gpu_change_tracking_lower.h index 42c122fd..f6a1e078 100644 --- a/include/graphit/midend/gpu_change_tracking_lower.h +++ b/include/graphit/midend/gpu_change_tracking_lower.h @@ -20,6 +20,7 @@ class GPUChangeTrackingLower { void updateUdf(mir::FuncDecl::Ptr func_decl, mir::EdgeSetApplyExpr::Ptr); virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr) override; }; struct ReductionOpChangeVisitor: public mir::MIRVisitor { diff --git a/include/graphit/midend/gpu_priority_features_lowering.h b/include/graphit/midend/gpu_priority_features_lowering.h new file mode 100644 index 00000000..8ee97ab5 --- 
/dev/null +++ b/include/graphit/midend/gpu_priority_features_lowering.h @@ -0,0 +1,57 @@ +#ifndef GPU_PRIORITY_FEATURES_LOWERING_H +#define GPU_PRIORITY_FEATURES_LOWERING_H + +#include +#include +#include + +namespace graphit { +class GPUPriorityFeaturesLowering { +public: + MIRContext *mir_context_; + Schedule *schedule_; + GPUPriorityFeaturesLowering(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + void lower(void); + + + struct EdgeSetApplyPriorityRewriter: public mir::MIRRewriter { + MIRContext *mir_context_; + Schedule *schedule_; + EdgeSetApplyPriorityRewriter(MIRContext* mir_context, Schedule* schedule): mir_context_(mir_context), schedule_(schedule) { + } + + using mir::MIRRewriter::visit; + virtual void visit(mir::ExprStmt::Ptr) override; + + }; + struct PriorityUpdateOperatorRewriter: public mir::MIRRewriter { + MIRContext *mir_context_; + mir::UpdatePriorityEdgeSetApplyExpr::Ptr puesae_; + PriorityUpdateOperatorRewriter(MIRContext* mir_context, mir::UpdatePriorityEdgeSetApplyExpr::Ptr puesae): mir_context_(mir_context), puesae_(puesae) { + } + using mir::MIRRewriter::visit; + virtual void visit(mir::Call::Ptr) override; + + }; + struct UDFPriorityQueueFinder: public mir::MIRVisitor { + using mir::MIRVisitor::visit; + + MIRContext *mir_context_; + UDFPriorityQueueFinder(MIRContext* mir_context): mir_context_(mir_context) { + } + std::vector priority_queues_used; + mir::Var getPriorityQueue(void); + void insertVar(mir::Var); + virtual void visit(mir::PriorityUpdateOperator::Ptr) override; + virtual void visit(mir::PriorityUpdateOperatorMin::Ptr) override; + virtual void visit(mir::PriorityUpdateOperatorSum::Ptr) override; + virtual void visit(mir::Call::Ptr) override; + }; +}; +} + +#endif + + + diff --git a/include/graphit/midend/gpu_vector_field_properties_analyzer.h b/include/graphit/midend/gpu_vector_field_properties_analyzer.h index 5555b37c..91d2fbfe 100644 --- a/include/graphit/midend/gpu_vector_field_properties_analyzer.h +++ b/include/graphit/midend/gpu_vector_field_properties_analyzer.h @@ -25,6 +25,8 @@ class GPUVectorFieldPropertiesAnalyzer { virtual void visit(mir::AssignStmt::Ptr) override; virtual void visit(mir::ReduceStmt::Ptr) override; + virtual void visit(mir::PriorityUpdateOperatorMin::Ptr) override; + }; struct ApplyExprVisitor: public mir::MIRVisitor { MIRContext* mir_context_; @@ -33,6 +35,7 @@ class GPUVectorFieldPropertiesAnalyzer { using mir::MIRVisitor::visit; virtual void visit(mir::PushEdgeSetApplyExpr::Ptr) override; virtual void visit(mir::PullEdgeSetApplyExpr::Ptr) override; + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr) override; }; MIRContext* mir_context_; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index bd77dcff..131a8d3c 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -753,6 +753,8 @@ namespace graphit { std::string tracking_var; bool is_atomic = false; + std::shared_ptr edgeset_apply_expr; + typedef std::shared_ptr Ptr; virtual void accept(MIRVisitor *visitor) { @@ -1401,6 +1403,7 @@ namespace graphit { typedef std::shared_ptr Ptr; UpdatePriorityEdgeSetApplyExpr() {} + mir::Var priority_queue_used; UpdatePriorityEdgeSetApplyExpr(EdgeSetApplyExpr::Ptr edgeset_apply) { target = edgeset_apply->target; diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 398efb62..02384b4c 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -177,6 
+177,11 @@ namespace graphit { return false; } bool isLoweredConst(std::string var_name) { + size_t dot_pos = var_name.find("."); + if (dot_pos != std::string::npos) { + var_name.resize(dot_pos); + } + for (auto var: lowered_constants_) { if (var->name == var_name) return true; @@ -185,6 +190,11 @@ namespace graphit { if (var->name == var_name) return true; } + for (auto var: const_priority_queues_) { + if (var->name == var_name) + return true; + } + return false; } @@ -381,6 +391,7 @@ namespace graphit { // These are global sets that are loaded from outside sources and cannot be modified std::vector const_vertex_sets_; std::vector const_edge_sets_; + std::vector const_priority_queues_; //maps a vector to the Element it is associated with; std::map vector_set_element_type_map_; diff --git a/include/graphit/midend/priority_queue_frontier_reuse.h b/include/graphit/midend/priority_queue_frontier_reuse.h new file mode 100644 index 00000000..eed4771f --- /dev/null +++ b/include/graphit/midend/priority_queue_frontier_reuse.h @@ -0,0 +1,10 @@ +#ifndef PRIORITY_QUEUE_FRONTIER_REUSE_H +#define PRIORITY_QUEUE_FRONTIER_REUSE_H +#include +#include isFunction(pesae->to_func)) mir_context_->getFunction(pesae->to_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; } +void AssignFunctionContext::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) { + if (mir_context_->isFunction(pesae->input_function_name)) + mir_context_->getFunction(pesae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (mir_context_->isFunction(pesae->from_func)) + mir_context_->getFunction(pesae->from_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; + if (mir_context_->isFunction(pesae->to_func)) + mir_context_->getFunction(pesae->to_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} void AssignFunctionContext::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { if (mir_context_->isFunction(pesae->input_function_name)) mir_context_->getFunction(pesae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 86ada518..ad71f2b1 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -69,6 +69,11 @@ void CodeGenGPU::genScalarDecl(mir::VarDecl::Ptr var_decl) { var_decl->type->accept(this); oss << " __host_" << var_decl->name << ";" << std::endl; + + if (mir::isa(var_decl->type)) { + var_decl->type->accept(this); + oss << " *__device_" << var_decl->name << ";" << std::endl; + } } void CodeGenGPU::genPropertyArrayDecl(mir::VarDecl::Ptr constant) { mir::VectorType::Ptr vector_type = mir::to(constant->type); @@ -233,15 +238,6 @@ void CodeGenGPU::genFuncDecl(mir::FuncDecl::Ptr func_decl) { } void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { - /* - // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF - mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); - genFuncDecl(input_function_decl); - if (apply_expr->to_func != "") { - mir::FuncDecl::Ptr to_function_decl = mir_context_->getFunction(apply_expr->to_func); - genFuncDecl(to_function_decl); - } - */ // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + 
mir_context_->getUniqueNameCounterString(); @@ -280,12 +276,46 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { } +void CodeGenGPUKernelEmitter::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr apply_expr) { + + // First we generate the function that is passed to the load balancing function + + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); + printIndent(); + oss << "// Body of the actual operator code" << std::endl; + if (apply_expr->to_func != "") { + printIndent(); + oss << "if (!" << apply_expr->to_func << "(dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); + // Enqueueing is disabled from here. We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function_name << "(src, dst, weight"; + } else { + printIndent(); + oss << apply_expr->input_function_name << "(src, dst"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; +} + void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { - /* - // Before we generate the payload for the load balancing function, we need to generate a declaration for the UDF - mir::FuncDecl::Ptr input_function_decl = mir_context_->getFunction(apply_expr->input_function_name); - genFuncDecl(input_function_decl); - */ // First we generate the function that is passed to the load balancing function std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); @@ -347,6 +377,7 @@ void CodeGenGPU::genGlobalDeclarations(void) { oss << "float __device__ __device_" << threshold_var_name << ";" << std::endl; stmt->threshold_var_name = threshold_var_name; } + oss << "int32_t __delta_param;" << std::endl; } void CodeGenGPU::genEdgeSets(void) { @@ -369,6 +400,12 @@ void CodeGenGPU::visit(mir::EdgeSetType::Ptr edgeset_type) { } } +void CodeGenGPU::visit(mir::PriorityQueueType::Ptr pqt) { + oss << "gpu_runtime::GPUPriorityQueue<"; + pqt->priority_type->accept(this); + oss << ">"; +} + void CodeGenGPU::visit(mir::VertexSetType::Ptr vertexset_type) { oss << "gpu_runtime::VertexFrontier"; } @@ -409,7 +446,7 @@ void CodeGenGPU::genHybridThresholds(void) { oss << stmt->threshold_var_name << " = gpu_runtime::str_to_float(argv[" << stmt->argv_index << "]);" << std::endl; } else { printIndent(); - oss << stmt->threshold_var_name << " = " << stmt->threshold << std::endl; + oss << stmt->threshold_var_name << " = " << stmt->threshold << ";" << std::endl; } printIndent(); oss << "cudaMemcpyToSymbol(__device_" << stmt->threshold_var_name << ", &" << stmt->threshold_var_name << ", sizeof(float), 0);" << std::endl; @@ -444,6 +481,13 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { if (func_decl->name == "main") { genHybridThresholds(); + if (mir_context_->delta_ <= 0) { + printIndent(); + oss << "__delta_param = 
gpu_runtime::str_to_int(argv[" << - mir_context_->delta_ << "]);" << std::endl; + } else { + printIndent(); + oss << "__delta_param = " << mir_context_->delta_ << ";" << std::endl; + } for (auto stmt: mir_context_->edgeset_alloc_stmts) { mir::AssignStmt::Ptr assign_stmt = mir::to(stmt); mir::EdgeSetLoadExpr::Ptr edge_set_load_expr = mir::to(assign_stmt->expr); @@ -477,6 +521,10 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { oss << "), 0, cudaMemcpyHostToDevice);" << std::endl; } } + if (mir::isa(constant->type)) { + printIndent(); + oss << "cudaGetSymbolAddress(((void**)&__device_" << constant->name << "), " << constant->name << ");" << std::endl; + } } for (auto stmt: mir_context_->field_vector_init_stmts) { stmt->accept(this); @@ -503,9 +551,66 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { void CodeGenGPU::visit(mir::ElementType::Ptr element_type) { oss << "int32_t"; } +void CodeGenGPU::genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr puo) { + printIndent(); + oss << "if ("; + if (mir::isa(puo)) { + mir::PriorityUpdateOperatorMin::Ptr puom = mir::to(puo); + if (puom->is_atomic) { + oss << "gpu_runtime::writeMin"; + } else { + assert(false && "Currently only atomic priority update is supported"); + } + oss << "("; + oss << "&("; + puom->priority_queue->accept(this); + oss << ".device_priorities_["; + puom->destination_node_id->accept(this); + oss << "]), "; + puom->new_val->accept(this); + oss << ")"; + } + oss << " && "; + puo->priority_queue->accept(this); + oss << ".device_priorities_["; + puo->destination_node_id->accept(this); + oss << "] < ("; + puo->priority_queue->accept(this); + oss << ".current_priority_ + "; + puo->priority_queue->accept(this); + oss << ".delta_)"; + oss << ") {" << std::endl; + indent(); + + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = puo->edgeset_apply_expr; + mir::EnqueueVertex::Ptr evp = std::make_shared(); + evp->vertex_id = puo->destination_node_id; + mir::VarExpr::Ptr var_expr = mir::to(puo->priority_queue); + // Since this variable is created temporarily, we don;t need type + mir::Var var(var_expr->var.getName() + ".frontier_", nullptr); + mir::VarExpr::Ptr frontier_expr = std::make_shared(); + frontier_expr->var = var; + + evp->vertex_frontier = frontier_expr; + if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + evp->type = mir::EnqueueVertex::Type::SPARSE; + } else if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { + evp->type = mir::EnqueueVertex::Type::BOOLMAP; + } else if (upesae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BITMAP) { + evp->type = mir::EnqueueVertex::Type::BITMAP; + } + + evp->accept(this); + dedent(); + printIndent(); + oss << "}" << std::endl; + +} void CodeGenGPU::visit(mir::ExprStmt::Ptr expr_stmt) { if (mir::isa(expr_stmt->expr)) { genEdgeSetApplyExpr(mir::to(expr_stmt->expr), nullptr); + } else if (mir::isa(expr_stmt->expr)) { + genPriorityUpdateOperator(mir::to(expr_stmt->expr)); } else { printIndent(); expr_stmt->expr->accept(this); @@ -556,7 +661,7 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: load_balance_function = "gpu_runtime::strict_load_balance"; } - if (mir::isa(esae)) { + if (mir::isa(esae) || mir::isa(esae)) { if (esae->from_func != "") { printIndent(); oss << "gpu_runtime::vertex_set_prepare_sparse("; @@ -591,6 
+696,11 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: target->accept(this); oss << " = " << esae->from_func << ";" << std::endl; } + if (mir::isa(esae)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + printIndent(); + oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; + } printIndent(); oss << load_balance_function << "_host<"; @@ -774,6 +884,28 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { if (mir::isa(assign_stmt->expr)) { mir::EdgeSetApplyExpr::Ptr esae = mir::to(assign_stmt->expr); genEdgeSetApplyExpr(esae, assign_stmt->lhs); + } else if (mir::isa(assign_stmt->expr)) { + mir::PriorityQueueAllocExpr::Ptr pqae = mir::to(assign_stmt->expr); + printIndent(); + assign_stmt->lhs->accept(this); + oss << ".init("; + std::string graph_name = mir_context_->getEdgeSets()[0]->name; + oss << "__host_" << graph_name << ", "; + std::string vector_name = pqae->vector_function; + if (mir_context_->isLoweredConst(vector_name)) + oss << "__host_" << vector_name; + else + oss << vector_name; + oss << ", "; + if (mir_context_->isLoweredConst(vector_name)) + oss << "__device_" << vector_name; + else + oss << vector_name; + oss << ", 0, "; + oss << "__delta_param"; + oss << ", "; + pqae->starting_node->accept(this); + oss << ");" << std::endl; } else { printIndent(); assign_stmt->lhs->accept(this); @@ -1125,6 +1257,15 @@ void CodeGenGPUFusedKernel::visit(mir::PrintStmt::Ptr print_stmt) { oss << "_grid.sync();" << std::endl; } void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "dequeue_ready_set" || call_expr->name == "finished") { + if (call_expr->name == "dequeue_ready_set") + call_expr->name = "dequeueReadySet"; + mir::VarExpr::Ptr pq_expr = mir::to(call_expr->args[0]); + std::string pq_name = pq_expr->var.getName(); + + oss << "__host_" << pq_name << "." 
<< call_expr->name << "(__device_" << pq_name << ")"; + return; + } if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") oss << "gpu_runtime::" << call_expr->name << "("; else diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 308e0a8e..24ebecf4 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -160,6 +160,7 @@ namespace graphit { } node = assign_stmt; } + void ApplyExprLower::LowerApplyExpr::visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply) { // use the target var expressionto figure out the edgeset type diff --git a/src/midend/gpu_change_tracking_lower.cpp b/src/midend/gpu_change_tracking_lower.cpp index d4b8bc3c..04eeed07 100644 --- a/src/midend/gpu_change_tracking_lower.cpp +++ b/src/midend/gpu_change_tracking_lower.cpp @@ -28,6 +28,10 @@ void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PushEdgeSetApplyExp mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function_name); updateUdf(func_decl, pesae); } +void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) { + mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function_name); + updateUdf(func_decl, pesae); +} void GPUChangeTrackingLower::UdfArgChangeVisitor::visit(mir::PullEdgeSetApplyExpr::Ptr pesae) { mir::FuncDecl::Ptr func_decl = mir_context_->getFunction(pesae->input_function_name); updateUdf(func_decl, pesae); diff --git a/src/midend/gpu_priority_features_lowering.cpp b/src/midend/gpu_priority_features_lowering.cpp new file mode 100644 index 00000000..fc182e86 --- /dev/null +++ b/src/midend/gpu_priority_features_lowering.cpp @@ -0,0 +1,117 @@ +#include + +namespace graphit { +void GPUPriorityFeaturesLowering::lower(void) { + EdgeSetApplyPriorityRewriter rewriter(mir_context_, schedule_); + for (auto func: mir_context_->getFunctionList()) { + rewriter.rewrite(func); + } +} +void GPUPriorityFeaturesLowering::EdgeSetApplyPriorityRewriter::visit(mir::ExprStmt::Ptr expr_stmt) { + if (expr_stmt->stmt_label != "") { + label_scope_.scope(expr_stmt->stmt_label); + } + if (mir::isa(expr_stmt->expr)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(expr_stmt->expr); + mir::FuncDecl::Ptr udf = mir_context_->getFunction(upesae->input_function_name); + UDFPriorityQueueFinder finder(mir_context_); + udf->accept(&finder); + mir::Var pq = finder.getPriorityQueue(); + + mir::Var frontier(pq.getName() + ".frontier_", nullptr); + + mir::VarExpr::Ptr lhs = std::make_shared(); + lhs->var = frontier; + + mir::AssignStmt::Ptr assign = std::make_shared(); + assign->lhs = lhs; + assign->expr = expr_stmt->expr; + node = assign; + + upesae->is_parallel = true; + upesae->requires_output = true; + upesae->priority_queue_used = pq; + mir::VarExpr::Ptr edgeset_expr = mir::to(upesae->target); + mir::EdgeSetType::Ptr edgeset_type = mir::to(edgeset_expr->var.getType()); + assert(edgeset_type->vertex_element_type_list->size() == 2); + if (edgeset_type->weight_type != nullptr) { + upesae->is_weighted = true; + } + // Now apply the schedule to the operator + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + 
upesae->applied_schedule = *dynamic_cast(apply_schedule);
+					mir_context_->delta_ = upesae->applied_schedule.delta;
+				} else {
+					assert(false && "Schedule applied to edgesetapply must be a Simple Schedule");
+				}
+			}
+		}
+		PriorityUpdateOperatorRewriter rewriter(mir_context_, upesae);
+		rewriter.rewrite(udf);
+		if (expr_stmt->stmt_label != "") {
+			label_scope_.unscope();
+		}
+		return;
+	}
+	if (expr_stmt->stmt_label != "") {
+		label_scope_.unscope();
+	}
+	mir::MIRRewriter::visit(expr_stmt);
+	return;
+}
+void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperator::Ptr call) {
+	if (mir::isa(call->args[0])) {
+		insertVar(mir::to(call->args[0])->var);
+	}
+}
+void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperatorMin::Ptr call) {
+	mir::PriorityUpdateOperator::Ptr puo = call;
+	visit(puo);
+}
+void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::PriorityUpdateOperatorSum::Ptr call) {
+	mir::PriorityUpdateOperator::Ptr puo = call;
+	visit(puo);
+}
+void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::visit(mir::Call::Ptr call) {
+	if (call->name == "updatePriorityMin" || call->name == "UpdatePrioritySum") {
+		if (mir::isa(call->args[0])) {
+			insertVar(mir::to(call->args[0])->var);
+		}
+	}
+}
+void GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::insertVar(mir::Var to_insert) {
+	for (auto var: priority_queues_used) {
+		if (var.getName() == to_insert.getName())
+			return;
+	}
+	priority_queues_used.push_back(to_insert);
+}
+mir::Var GPUPriorityFeaturesLowering::UDFPriorityQueueFinder::getPriorityQueue(void) {
+	assert(priority_queues_used.size() == 1 && "Exactly one priority queue must be used in the UDF supplied to UpdatePriorityEdgeSetApplyExpr");
+	return priority_queues_used[0];
+}
+void GPUPriorityFeaturesLowering::PriorityUpdateOperatorRewriter::visit(mir::Call::Ptr call) {
+	if (call->name == "updatePriorityMin") {
+		mir::PriorityUpdateOperatorMin::Ptr update_op = std::make_shared();
+		update_op->priority_queue = call->args[0];
+		update_op->destination_node_id = call->args[1];
+		update_op->old_val = call->args[2];
+		update_op->new_val = call->args[3];
+		update_op->edgeset_apply_expr = puesae_;
+		node = update_op;
+	} else if (call->name == "updatePrioritySum") {
+		mir::PriorityUpdateOperatorSum::Ptr update_op = std::make_shared();
+		update_op->priority_queue = call->args[0];
+		update_op->destination_node_id = call->args[1];
+		update_op->delta = call->args[2];
+		update_op->minimum_val = call->args[3];
+		update_op->edgeset_apply_expr = puesae_;
+		node = update_op;
+	}
+}
+}
diff --git a/src/midend/gpu_vector_field_properties_analyzer.cpp b/src/midend/gpu_vector_field_properties_analyzer.cpp
index 1e200b2f..f5a8c2aa 100644
--- a/src/midend/gpu_vector_field_properties_analyzer.cpp
+++ b/src/midend/gpu_vector_field_properties_analyzer.cpp
@@ -50,6 +50,30 @@ void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::PullEdgeSetA
 	func->accept(&visitor);
 }
+void GPUVectorFieldPropertiesAnalyzer::ApplyExprVisitor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr pesae) {
+	// UpdatePriority will function just like Push for now
+	std::unordered_set idp_set;
+	mir::FuncDecl::Ptr func = mir_context_->getFunction(pesae->input_function_name);
+
+	std::string src_name = func->args[0].getName();
+	std::string dst_name = func->args[1].getName();
+
+	switch (pesae->applied_schedule.load_balancing) {
+		case fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::VERTEX_BASED:
+			idp_set.insert(src_name);
break; + default: + break; + } + + + PropertyAnalyzingVisitor visitor(mir_context_, idp_set, func); + func->accept(&visitor); + +} + + + bool GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::is_independent_index(mir::Expr::Ptr expr) { if (mir::isa(expr)) { mir::VarExpr::Ptr var_expr = mir::to(expr); @@ -130,4 +154,11 @@ void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::Redu enclosing_function->field_vector_properties_map_[target] = property; } +void GPUVectorFieldPropertiesAnalyzer::PropertyAnalyzingVisitor::visit(mir::PriorityUpdateOperatorMin::Ptr puo) { + mir::MIRVisitor::visit(puo); + mir::Expr::Ptr index_expr = puo->destination_node_id; + if (!is_independent_index(index_expr)) { + puo->is_atomic = true; + } +} } diff --git a/src/midend/mir.cpp b/src/midend/mir.cpp index 22dd4174..a36f8fd5 100644 --- a/src/midend/mir.cpp +++ b/src/midend/mir.cpp @@ -90,6 +90,7 @@ namespace graphit { Call::copy(node); destination_node_id = expr->destination_node_id; priority_queue = expr->priority_queue; + edgeset_apply_expr = expr->edgeset_apply_expr; } diff --git a/src/midend/mir_emitter.cpp b/src/midend/mir_emitter.cpp index e9e5661b..c792e418 100644 --- a/src/midend/mir_emitter.cpp +++ b/src/midend/mir_emitter.cpp @@ -843,7 +843,10 @@ namespace graphit { } ctx->addEdgeSet(mir_var_decl); ctx->addEdgesetType(mir_var_decl->name, type); - + } else if (std::dynamic_pointer_cast(mir_var_decl->type) != nullptr) { + ctx->const_priority_queues_.push_back(mir_var_decl); + mir_var_decl->modifier = "const"; + ctx->addConstant(mir_var_decl); } else { mir_var_decl->modifier = "const"; ctx->addConstant(mir_var_decl); diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index 34f9b94f..8762b1c9 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -33,8 +34,13 @@ namespace graphit { VertexEdgeSetLower(mir_context).lower(); - //This pass needs to happen before ApplyExprLower pass because the default ReduceBeforeUpdate uses ApplyExprLower - PriorityFeaturesLower(mir_context, schedule).lower(); + // We use the GPU version when the GPU Scheules are set + if (schedule != nullptr && !schedule->apply_gpu_schedules.empty()) { + GPUPriorityFeaturesLowering(mir_context, schedule).lower(); + } else { + //This pass needs to happen before ApplyExprLower pass because the default ReduceBeforeUpdate uses ApplyExprLower + PriorityFeaturesLower(mir_context, schedule).lower(); + } // This pass finds EdgeSetApplyExpressions that allow frontiers to be reused and removes the corresponding deletes FrontierReuseAnalysis(mir_context).analyze(); diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 1157af77..091b1837 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -6,173 +6,171 @@ #include "vertex_frontier.h" #ifndef NUM_BLOCKS - #define NUM_BLOCKS 80 +#define NUM_BLOCKS 80 #endif #ifndef CTA_SIZE - #define CTA_SIZE 1024 +#define CTA_SIZE 1024 #endif namespace gpu_runtime { - template - class GPUPriorityQueue; - - static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); - - - static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); - - template - class GPUPriorityQueue { - - public: - - size_t getCurrentPriority(){ - return current_priority_; - } - - void 
init(GraphT graph, PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ - host_priorities_ = host_priorities; - device_priorities_ = device_priorities; - current_priority_ = initial_priority; - delta_ = delta; - ready_set_dequeued = false; - frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); - if (initial_node != -1){ - gpu_runtime::builtin_addVertex(frontier_, initial_node); - } - } - - void __device__ updatePriorityMin(GPUPriorityQueue * device_gpq, PriorityT_ new_priority, VertexFrontier output_frontier, int32_t node){ - bool output = gpu_runtime::writeMin(&(device_gpq->device_priorities_[node]), new_priority); - if (device_gpq->device_priorities_[node] >= (device_gpq->current_priority_ + device_gpq->delta_)) return; - if (output){ - enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, node); - } - - } - - bool finished(GPUPriorityQueue * device_gpq) { - if (current_priority_ == INT_MAX){ - return true; - } - - if (!ready_set_dequeued && gpu_runtime::builtin_getVertexSetSize(frontier_) == 0){ - dequeueReadySet(device_gpq); - ready_set_dequeued = true; - return current_priority_ == INT_MAX; - } - - return false; - } - - bool host_finishedNode(NodeID v){ - return host_priorities_[v]/delta_ < current_priority_; - } - - bool __device__ device_finishedNode(NodeID v){ - - } - - - VertexFrontier& dequeueReadySet(GPUPriorityQueue * device_gpq){ - // if this is already dequeued in the previous finish() operator - // then don't do the dequeu operation again - if (ready_set_dequeued){ - //Now that we dequeued it, the next ready set is no longer dequeued - ready_set_dequeued = false; - return frontier_; - } - - //perform the dequeue operation only if the current frontier is empty - if (gpu_runtime::builtin_getVertexSetSize(frontier_) == 0) { - window_upper_ = current_priority_ + delta_; - current_priority_ = INT_MAX; - - cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); - gpu_runtime::cudaCheckLastError(); - - update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); - gpu_runtime::cudaCheckLastError(); - - cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); - gpu_runtime::cudaCheckLastError(); - - //this line needs to be fixed - update_nodes_special<<>>(device_gpq, frontier_.max_num_elems, frontier_); - gpu_runtime::cudaCheckLastError(); - gpu_runtime::swap_queues(frontier_); - frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; - - //Now that we dequeued it, the next ready set is no longer dequeued - ready_set_dequeued = false; - return frontier_; - } - - //if it is empty, just return the empty frontier - return frontier_; - } - - PriorityT_* host_priorities_ = nullptr; - PriorityT_* device_priorities_ = nullptr; - - PriorityT_ delta_ = 1; - PriorityT_ current_priority_ = 0; - PriorityT_ window_upper_ = 0; - - //Need to do = {0} to avoid dynamic initialization error - VertexFrontier frontier_ = {0}; - bool ready_set_dequeued = false; - }; - - - static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) - { - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - int total_work = num_vertices; - int work_per_thread = (total_work + num_threads - 1)/num_threads; - int32_t my_minimum = INT_MAX; - for (int i = 0; i < work_per_thread; i++) { - int32_t node_id = thread_id + i * num_threads; - if (node_id < 
num_vertices) { - if (gpq->device_priorities_[node_id] >= (gpq->window_upper_) && gpq->device_priorities_[node_id] != INT_MAX && gpq->device_priorities_[node_id] < my_minimum) { - my_minimum = gpq->device_priorities_[node_id]; - } +template + class GPUPriorityQueue; + +static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); + + +static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); + +template + class GPUPriorityQueue { + + public: + + size_t getCurrentPriority(){ + return current_priority_; + } + + void init(GraphT graph, PriorityT_ * host_priorities, PriorityT_* device_priorities, PriorityT_ initial_priority, PriorityT_ delta, NodeID initial_node = -1){ + host_priorities_ = host_priorities; + device_priorities_ = device_priorities; + current_priority_ = initial_priority; + delta_ = delta; + ready_set_dequeued = false; + frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + if (initial_node != -1){ + gpu_runtime::builtin_addVertex(frontier_, initial_node); + } + } + + void __device__ updatePriorityMin(GPUPriorityQueue * device_gpq, PriorityT_ new_priority, VertexFrontier output_frontier, int32_t node){ + bool output = gpu_runtime::writeMin(&(device_gpq->device_priorities_[node]), new_priority); + if (device_gpq->device_priorities_[node] >= (device_gpq->current_priority_ + device_gpq->delta_)) return; + if (output){ + enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, node); + } + + } + + bool finished(GPUPriorityQueue * device_gpq) { + if (current_priority_ == INT_MAX){ + return true; + } + + if (!ready_set_dequeued && gpu_runtime::builtin_getVertexSetSize(frontier_) == 0){ + dequeueReadySet(device_gpq); + ready_set_dequeued = true; + return current_priority_ == INT_MAX; + } + + return false; + } + + bool host_finishedNode(NodeID v){ + return host_priorities_[v]/delta_ < current_priority_; + } + + bool __device__ device_finishedNode(NodeID v){ + + } + + VertexFrontier& dequeueReadySet(GPUPriorityQueue * device_gpq){ + // if this is already dequeued in the previous finish() operator + // then don't do the dequeu operation again + if (ready_set_dequeued){ + //Now that we dequeued it, the next ready set is no longer dequeued + ready_set_dequeued = false; + return frontier_; + } + + //perform the dequeue operation only if the current frontier is empty + if (gpu_runtime::builtin_getVertexSetSize(frontier_) == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + + cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); + gpu_runtime::cudaCheckLastError(); + + update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); + gpu_runtime::cudaCheckLastError(); + + cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); + gpu_runtime::cudaCheckLastError(); + + //this line needs to be fixed + update_nodes_special<<>>(device_gpq, frontier_.max_num_elems, frontier_); + gpu_runtime::cudaCheckLastError(); + gpu_runtime::swap_queues(frontier_); + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + + //Now that we dequeued it, the next ready set is no longer dequeued + ready_set_dequeued = false; + return frontier_; + } + + //if it is empty, just return the empty frontier + return frontier_; + } + + PriorityT_* host_priorities_ = nullptr; + PriorityT_* device_priorities_ = nullptr; + + PriorityT_ delta_ = 1; + PriorityT_ 
current_priority_ = 0; + PriorityT_ window_upper_ = 0; + + //Need to do = {0} to avoid dynamic initialization error + VertexFrontier frontier_ = {0}; + bool ready_set_dequeued = false; + }; + + +static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + int32_t my_minimum = INT_MAX; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if (gpq->device_priorities_[node_id] >= (gpq->window_upper_) && gpq->device_priorities_[node_id] != INT_MAX && gpq->device_priorities_[node_id] < my_minimum) { + my_minimum = gpq->device_priorities_[node_id]; + } + } } - } - - if (my_minimum < gpq->current_priority_){ - atomicMin(&(gpq->current_priority_), my_minimum); - } - }//end of update_nodes_identify_min - - - - static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ - - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - //int warp_id = thread_id / 32; - - int total_work = num_vertices; - int work_per_thread = (total_work + num_threads - 1)/num_threads; - for (int i = 0; i < work_per_thread; i++) { - int32_t node_id = thread_id + i * num_threads; - if (node_id < num_vertices) { - if(gpq->device_priorities_[node_id] >= gpq->current_priority_ && gpq->device_priorities_[node_id] < (gpq->current_priority_ + gpq->delta_)) { - gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); + + if (my_minimum < gpq->current_priority_){ + atomicMin(&(gpq->current_priority_), my_minimum); } - } - } - } - +}//end of update_nodes_identify_min + + + +static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ + + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + //int warp_id = thread_id / 32; + + int total_work = num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + for (int i = 0; i < work_per_thread; i++) { + int32_t node_id = thread_id + i * num_threads; + if (node_id < num_vertices) { + if(gpq->device_priorities_[node_id] >= gpq->current_priority_ && gpq->device_priorities_[node_id] < (gpq->current_priority_ + gpq->delta_)) { + gpu_runtime::enqueueVertexSparseQueue(output_frontier.d_sparse_queue_output, output_frontier.d_num_elems_output, node_id); + } + } + } +} + + - } diff --git a/test/gpu_tests/all_gpu_tests.py b/test/gpu_tests/all_gpu_tests.py index 43dd5d0b..8266df53 100644 --- a/test/gpu_tests/all_gpu_tests.py +++ b/test/gpu_tests/all_gpu_tests.py @@ -136,6 +136,9 @@ def test_sssp_delta_stepping(self): def test_sssp_delta_stepping_verified(self): self.sssp_verified_test("sssp_delta_stepping.cu", True) + def test_sssp_delta_stepping_verified_frontier_byval(self): + self.sssp_verified_test("sssp_delta_stepping_frontier_byval.cu", True) + def test_simple_graphit_exec(self): output = self.graphit_exec_test("simple_graph_load.gt", "default_gpu_schedule.gt", [], [self.graph_directory + "/simple_mtx.mtx"]) output = output.split("\n") diff --git a/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu 
b/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu new file mode 100644 index 00000000..35a10024 --- /dev/null +++ b/test/gpu_tests/test_input/sssp_delta_stepping_frontier_byval.cu @@ -0,0 +1,172 @@ + + + +#define VIRTUAL_WARP_SIZE (32) +#define NUM_THREADS (1024) +#define NUM_BLOCKS (80) +#define CTA_SIZE (1024) +#define WARP_SIZE (32) +#define STAGE_1_SIZE (8) + + +#include "gpu_intrinsics.h" +#include + + +#define USE_DEDUP 0 +#define SORT_NODES 0 +#include + +//#define DEBUG + +#ifdef DEBUG + #define ITER_COUNT (5) +#else + #define ITER_COUNT (1) +#endif + +gpu_runtime::GPUPriorityQueue host_gpq; +gpu_runtime::GPUPriorityQueue __device__ device_gpq; + + +int32_t __device__ *SP; +int32_t *__host_SP; +int32_t *__device_SP; + + +void __global__ init_kernel(gpu_runtime::GraphT graph, int start_v) { + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + int total_work = graph.num_vertices; + int work_per_thread = (total_work + num_threads - 1)/num_threads; + if (thread_id == 0) { + //reset with the new data structure + SP[start_v] = 0; + } +} + +/*bool __device__ updateEdge(int32_t src, int32_t dst, int32_t weight) { + bool output2; + bool SP_trackving_var_1 = 0; + SP_trackving_var_1 = gpu_runtime::writeMin(&SP[dst], (SP[src] + weight)); + output2 = SP_trackving_var_1; + if (SP[dst] >= (device_gpq.current_priority_ + device_gpq.delta_)) return false; + return output2; + }*/ + +void __device__ deviceUpdateEdge(int32_t src, int32_t dst, int32_t weight, gpu_runtime::VertexFrontier output_frontier){ + device_gpq.updatePriorityMin(&device_gpq, (SP[src] + weight), output_frontier, dst); +} + +template +void __device__ gpu_operator_body_3(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) { + // Body of the actual operator code + EdgeWeightType weight = graph.d_edge_weight[edge_id]; + deviceUpdateEdge(src, dst, weight, output_frontier); + /*if (updateEdge(src, dst, weight)){ + gpu_runtime::enqueueVertexBytemap(output_frontier.d_byte_map_output, output_frontier.d_num_elems_output, dst); + }*/ +} + +void __device__ SP_generated_vector_op_apply_func_0(int32_t v) { + SP[v] = 2147483647; +} + +int main(int argc, char *argv[]) { + cudaSetDevice(0); + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); + gpu_runtime::GraphT graph; + gpu_runtime::load_graph(graph, argv[1], false); + int32_t delta = atoi(argv[3]); + int32_t start_vertex = atoi(argv[2]); + + cudaMalloc(&__device_SP, gpu_runtime::builtin_getVertices(graph) * sizeof(int32_t)); + cudaMemcpyToSymbol(SP, &__device_SP, sizeof(int32_t*), 0); + __host_SP = new int32_t[gpu_runtime::builtin_getVertices(graph)]; + cudaDeviceSynchronize(); + float total_time = 0; + for (int outer = 0; outer < ITER_COUNT; outer++) { + float iter_total = 0; + //this sets it to Sparse + //host_gpq.frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + + gpu_runtime::vertex_set_apply_kernel<<>>(graph.getFullFrontier()); + startTimer(); + + host_gpq.init(graph, __host_SP, __device_SP, 0, delta, start_vertex); + + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + init_kernel<<>>(graph, start_vertex); + gpu_runtime::cudaCheckLastError(); + + int iters = 0; + cudaDeviceSynchronize(); + float t = stopTimer(); + //printf("Init time = %f\n", t); + iter_total+=t; + + gpu_runtime::GPUPriorityQueue * tmp_gpq; + 
cudaGetSymbolAddress(((void **)&tmp_gpq), device_gpq); + + while(! host_gpq.finished(tmp_gpq)){ + startTimer(); + iters++; + + gpu_runtime::VertexFrontier frontier = host_gpq.dequeueReadySet(tmp_gpq); + + gpu_runtime::vertex_set_prepare_sparse(frontier); + host_gpq.frontier_ = frontier; + cudaMemcpyToSymbol(device_gpq, &host_gpq, sizeof(host_gpq), 0); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::TWCE_load_balance_host(graph, frontier, host_gpq.frontier_); + gpu_runtime::cudaCheckLastError(); + + gpu_runtime::swap_bytemaps(host_gpq.frontier_); + // set the input to the prepare function + host_gpq.frontier_.format_ready = gpu_runtime::VertexFrontier::BYTEMAP; + + cudaDeviceSynchronize(); + t = stopTimer(); + + #ifdef DEBUG + //printf("Iter %d output_size = %d \n", iters, gpu_runtime::builtin_getVertexSetSize(frontier)); + #endif + + iter_total += t; + } + + + #ifdef DEBUG + printf("Num iters = %d\n", iters); + printf("Time elapsed = %f\n", iter_total); + #endif + + total_time += iter_total; + + } + + #ifdef DEBUG + printf("Total time = %f\n", total_time); + #endif + + if (argc > 3) + if (argv[4][0] == 'v'){ + //FILE *output = fopen("output.txt", "w"); + cudaMemcpy(__host_SP, __device_SP, sizeof(int32_t)*graph.num_vertices, cudaMemcpyDeviceToHost); + #ifdef DEBUG + FILE *output = fopen("output.txt", "w"); + #endif + + for (int i = 0; i < graph.num_vertices; i++){ + #ifdef DEBUG + fprintf(output, "%d, %d\n", i, __host_SP[i]); + #else + printf("%d\n", __host_SP[i]); + #endif + } + } + return 0; +} From 1795db6bddde9087e477ea8789077831590676e8 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 8 Nov 2019 04:31:27 -0500 Subject: [PATCH 72/88] Working code for Priority Queue with kernel fusion --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 15 +++ include/graphit/midend/mir.h | 1 + src/backend/codegen_cpp.cpp | 2 +- src/backend/codegen_gpu/codegen_gpu.cpp | 71 +++++++++- src/runtime_lib/gpu_intrinsics.h | 20 +-- .../infra_gpu/gpu_priority_queue.h | 122 +++++++++++++++++- src/runtime_lib/infra_gpu/load_balance.h | 2 +- src/runtime_lib/infra_gpu/vertex_frontier.h | 73 +++++++++-- .../infra_gpu/vertex_representation.h | 2 + 9 files changed, 273 insertions(+), 35 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index d9d150e9..9e7d54a1 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -163,10 +163,20 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { public: using CodeGenGPU::CodeGenGPU; using CodeGenGPU::visit; + + mir::WhileStmt::Ptr current_while_stmt; + void insertUsedPq(mir::Var var) { + for (auto v: current_while_stmt->used_priority_queues) { + if (v.getName() == var.getName()) + return; + } + current_while_stmt->used_priority_queues.push_back(var); + } void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); virtual void visit(mir::StmtBlock::Ptr) override; virtual void visit(mir::AssignStmt::Ptr) override; virtual void visit(mir::VarDecl::Ptr) override; + virtual void visit(mir::VarExpr::Ptr) override; virtual void visit(mir::PrintStmt::Ptr) override; virtual void visit(mir::HybridGPUStmt::Ptr) override; virtual void visit(mir::VertexSetDedupExpr::Ptr) override; @@ -182,6 +192,8 @@ class KernelVariableExtractor: public mir::MIRVisitor { using mir::MIRVisitor::visit; std::vector hoisted_vars; std::vector hoisted_decls; + std::vector hoisted_pqs; + MIRContext *mir_context_; KernelVariableExtractor(MIRContext* 
mir_context): mir_context_(mir_context) { } @@ -194,10 +206,13 @@ class KernelVariableExtractor: public mir::MIRVisitor { } void insertDecl(mir::VarDecl::Ptr decl_to_insert) { hoisted_decls.push_back(decl_to_insert); + mir::Var var(decl_to_insert->name, decl_to_insert->type); + insertVar(var); } virtual void visit(mir::VarExpr::Ptr); virtual void visit(mir::VarDecl::Ptr); + virtual void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); }; } diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 131a8d3c..86b99f0c 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -418,6 +418,7 @@ namespace graphit { std::string fused_kernel_name; std::vector hoisted_vars; std::vector> hoisted_decls; + std::vector used_priority_queues; virtual void accept(MIRVisitor *visitor) { visitor->visit(self()); diff --git a/src/backend/codegen_cpp.cpp b/src/backend/codegen_cpp.cpp index 3ed0f22d..7f96231f 100644 --- a/src/backend/codegen_cpp.cpp +++ b/src/backend/codegen_cpp.cpp @@ -1737,7 +1737,7 @@ namespace graphit { oss << "new julienne::PriorityQueue <"; priority_queue_alloc_expr->priority_type->accept(this); oss << " > ( "; - + oss << mir_context_->getEdgeSets()[0]->name; if (priority_queue_alloc_expr->priority_update_type == mir::PriorityUpdateType::ReduceBeforePriorityUpdate){ diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index ad71f2b1..134e56ff 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -136,14 +136,21 @@ void CodeGenGPU::genPropertyArrayAlloca(mir::VarDecl::Ptr var_decl) { } void KernelVariableExtractor::visit(mir::VarExpr::Ptr var_expr) { - if (mir_context_->isLoweredConst(var_expr->var.getName())) + if (mir_context_->isLoweredConst(var_expr->var.getName())) { return; + } + insertVar(var_expr->var); } void KernelVariableExtractor::visit(mir::VarDecl::Ptr var_decl) { insertDecl(var_decl); } +void KernelVariableExtractor::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr esae) { + mir::MIRVisitor::visit(esae); + hoisted_pqs.push_back(esae->priority_queue_used); +} void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { + // First we generate a unique function name for this fused kernel std::string fused_kernel_name = "fused_kernel_body_" + mir_context_->getUniqueNameCounterString(); while_stmt->fused_kernel_name = fused_kernel_name; @@ -157,6 +164,7 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { while_stmt->hoisted_decls = extractor.hoisted_decls; CodeGenGPUFusedKernel codegen (oss, mir_context_, module_name, ""); + codegen.current_while_stmt = while_stmt; oss << "// "; for (auto var: extractor.hoisted_vars) @@ -181,6 +189,10 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { codegen.printIndent(); oss << "auto __local_" << var.getName() << " = " << fused_kernel_name << "_" << var.getName() << ";" << std::endl; } + for (auto var: extractor.hoisted_pqs) { + codegen.printIndent(); + oss << "auto __local_" << var.getName() << " = " << var.getName() << ";" << std::endl; + } codegen.printIndent(); oss << "while ("; @@ -200,6 +212,10 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { codegen.printIndent(); oss << fused_kernel_name << "_" << var.getName() << " = " << "__local_" << var.getName() << ";" << std::endl; } + for (auto var: extractor.hoisted_pqs) { + codegen.printIndent(); + oss << var.getName() << " = __local_" << var.getName() << ";" << std::endl; + } codegen.dedent(); 
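// Every hoisted variable (and priority queue) has been written back from its __local_
// shadow above; emit the closing brace of the generated __global__ fused kernel body.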
codegen.printIndent(); oss << "}" << std::endl; @@ -633,6 +649,13 @@ void CodeGenGPUHost::visit(mir::VarExpr::Ptr var_expr) { oss << var_expr->var.getName(); } +void CodeGenGPUFusedKernel::visit(mir::VarExpr::Ptr var_expr) { + if (mir::isa(var_expr->var.getType())) { + oss << "__local_" << var_expr->var.getName(); + return; + } else + oss << var_expr->var.getName(); +} void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr::Ptr target) { if (target != nullptr && esae->from_func == "") { assert(false && "GPU backend doesn't currently support creating output frontier without input frontier\n"); @@ -793,7 +816,8 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, } else if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::STRICT) { load_balance_function = "gpu_runtime::strict_load_balance"; } - if (mir::isa(esae)) { + + if (mir::isa(esae) || mir::isa(esae)) { printIndent(); oss << "gpu_runtime::vertex_set_prepare_sparse_device("; oss << var_name(esae->from_func); @@ -822,6 +846,24 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, target->accept(this); oss << " = " << var_name(esae->from_func) << ";" << std::endl; } + if (mir::isa(esae)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + insertUsedPq(upesae->priority_queue_used); + } + if (mir::isa(esae)) { + mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); + printIndent(); + oss << "if (_thread_id == 0) {" << std::endl; + indent(); + printIndent(); + oss << upesae->priority_queue_used.getName() << " = __local_" << upesae->priority_queue_used.getName() << ";" << std::endl; + dedent(); + printIndent(); + oss << "}" << std::endl; + printIndent(); + oss << "_grid.sync();" << std::endl; + //oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; + } printIndent(); oss << load_balance_function << "_device<"; @@ -849,11 +891,12 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, oss << ");" << std::endl; if (target != nullptr) { + mir::VarExpr::Ptr target_expr = mir::to(target); if (esae->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { printIndent(); oss << "gpu_runtime::swap_queues_device("; target->accept(this); - oss << ");" << std::endl; + oss << ");" << std::endl; printIndent(); target->accept(this); oss << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; @@ -1140,11 +1183,9 @@ void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { // Do nothing for variable declarations on kernel only lower the initialization as assignment if (var_decl->initVal != nullptr) { printIndent(); - oss << "if (_thread_id == 0)" << std::endl; - indent(); - printIndent(); - oss << var_decl->name << " = "; + oss << "__local_" << var_decl->name << " = "; var_decl->initVal->accept(this); + oss << ";" << std::endl; } } void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { @@ -1188,8 +1229,16 @@ void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { printIndent(); oss << "cudaMemcpyToSymbol(" << while_stmt->fused_kernel_name << "_" << var.getName() << ", &" << var.getName() << ", sizeof(" << var.getName() << "), 0, cudaMemcpyHostToDevice);" << std::endl; } + for (auto var: while_stmt->used_priority_queues) { + 
printIndent(); + oss << "cudaMemcpyToSymbol(" << var.getName() << ", &__host_" << var.getName() << ", sizeof(__host_" << var.getName() << "), 0);" << std::endl; + } printIndent(); oss << "cudaLaunchCooperativeKernel((void*)" << while_stmt->fused_kernel_name << ", NUM_CTA, CTA_SIZE, gpu_runtime::no_args);" << std::endl; + for (auto var: while_stmt->used_priority_queues) { + printIndent(); + oss << "cudaMemcpyFromSymbol(&__host_" << var.getName() << ", " << var.getName() << ", sizeof(__host_" << var.getName() << "), 0);" << std::endl; + } for (auto var: while_stmt->hoisted_vars) { bool to_copy = true; for (auto decl: while_stmt->hoisted_decls) { @@ -1282,6 +1331,14 @@ void CodeGenGPUHost::visit(mir::Call::Ptr call_expr) { } void CodeGenGPU::visit(mir::Call::Ptr call_expr) { + if (call_expr->name == "dequeue_ready_set" || call_expr->name == "finished") { + if (call_expr->name == "dequeue_ready_set") + call_expr->name = "dequeueReadySet"; + mir::VarExpr::Ptr pq_expr = mir::to(call_expr->args[0]); + pq_expr->accept(this); + oss << ".device_" << call_expr->name << "()"; + return; + } if (call_expr->name == "deleteObject" || call_expr->name.substr(0, strlen("builtin_")) == "builtin_") oss << "gpu_runtime::device_" << call_expr->name << "("; else diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index 0d3d8c17..f77d6779 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -15,22 +15,16 @@ namespace gpu_runtime { -template -void deleteObject(T &t) { - // Currently deleteObject is empty - -} -template <> -void deleteObject(VertexFrontier &t) { +void deleteObject(VertexFrontier &t) { delete_vertex_frontier(t); } template -__device__ void device_deleteObject(T &t) { - // Currently deleteObject is empty +void deleteObject(GPUPriorityQueue &pq) { + pq.release(); } -static void * no_args[1]; + void * no_args[1]; float str_to_float(const char* str) { float val; @@ -38,5 +32,11 @@ float str_to_float(const char* str) { return 0.0; return val; } +int32_t str_to_int(const char* str) { + int32_t val; + if (sscanf(str, "%i", &val) != 1) + return 0; + return val; +} } #endif diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 091b1837..a6d1a09a 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -20,9 +20,9 @@ template class GPUPriorityQueue; static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices); - - static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); +static void __device__ update_nodes_identify_min_device(GPUPriorityQueue* gpq, int32_t num_vertices); +static void __device__ update_nodes_special_device(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier); template class GPUPriorityQueue { @@ -40,10 +40,14 @@ template delta_ = delta; ready_set_dequeued = false; frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + cudaMalloc(¤t_priority_shared, sizeof(PriorityT_)); if (initial_node != -1){ gpu_runtime::builtin_addVertex(frontier_, initial_node); } } + void release(void) { + delete_vertex_frontier(frontier_); + } void __device__ updatePriorityMin(GPUPriorityQueue * device_gpq, PriorityT_ new_priority, VertexFrontier output_frontier, int32_t node){ bool output = gpu_runtime::writeMin(&(device_gpq->device_priorities_[node]), 
new_priority); @@ -67,6 +71,30 @@ template return false; } +#ifdef GLOBAL + bool __device__ device_finished(void) { + if (current_priority_ == INT_MAX) + return true; + if (!ready_set_dequeued && gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + device_dequeueReadySet(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + ready_set_dequeued = true; + this_grid().sync(); + return current_priority_ == INT_MAX; + } + return false; + } +#endif + bool __device__ device_finished(void) { + if(current_priority_ == INT_MAX) + return true; + if (!ready_set_dequeued && gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + device_dequeueReadySet(); + ready_set_dequeued = true; + return current_priority_ == INT_MAX; + } + return false; + } bool host_finishedNode(NodeID v){ return host_priorities_[v]/delta_ < current_priority_; @@ -99,7 +127,6 @@ template cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); gpu_runtime::cudaCheckLastError(); - //this line needs to be fixed update_nodes_special<<>>(device_gpq, frontier_.max_num_elems, frontier_); gpu_runtime::cudaCheckLastError(); gpu_runtime::swap_queues(frontier_); @@ -113,6 +140,79 @@ template //if it is empty, just return the empty frontier return frontier_; } + + VertexFrontier __device__ device_dequeueReadySet(void) { + if (ready_set_dequeued) { + ready_set_dequeued = false; + return frontier_; + } + if (gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + current_priority_shared[0] = INT_MAX; + } + this_grid().sync(); + + update_nodes_identify_min_device(this, frontier_.max_num_elems); + this_grid().sync(); + + current_priority_ = current_priority_shared[0]; + this_grid().sync(); + update_nodes_special_device(this, frontier_.max_num_elems, frontier_); + gpu_runtime::swap_queues_device(frontier_); + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + ready_set_dequeued = false; + return frontier_; + } + return frontier_; + } + + +#ifdef GLOBAL + VertexFrontier __device__ device_dequeueReadySet(void) { +/* + if (threadIdx.x + blockDim.x * blockIdx.x == 0) + printf("Entering dequeue ready set\n"); +*/ + if (ready_set_dequeued) { + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + ready_set_dequeued = false; + this_grid().sync(); + return frontier_; + } + if (gpu_runtime::device_builtin_getVertexSetSize(frontier_) == 0) { +/* + if (threadIdx.x + blockDim.x * blockIdx.x == 0) + printf("Entering special case\n"); +*/ + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + window_upper_ = current_priority_ + delta_; + current_priority_ = INT_MAX; + } + this_grid().sync(); + // No need for copy + update_nodes_identify_min_device(this, frontier_.max_num_elems); + this_grid().sync(); + update_nodes_special_device(this, frontier_.max_num_elems, frontier_); + this_grid().sync(); + gpu_runtime::swap_queues_device_global(frontier_); + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; + ready_set_dequeued = false; + } + this_grid().sync(); + return frontier_; + + } + this_grid().sync(); + return frontier_; + } +#endif PriorityT_* host_priorities_ = nullptr; PriorityT_* device_priorities_ = nullptr; @@ -124,10 +224,12 @@ template //Need to do = {0} to avoid dynamic initialization error VertexFrontier frontier_ = {0}; bool 
ready_set_dequeued = false; + + PriorityT_ *current_priority_shared = nullptr; }; -static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) { +static void __device__ update_nodes_identify_min_device(GPUPriorityQueue* gpq, int32_t num_vertices) { int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; int total_work = num_vertices; @@ -142,14 +244,14 @@ static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, } } - if (my_minimum < gpq->current_priority_){ - atomicMin(&(gpq->current_priority_), my_minimum); + if (my_minimum < gpq->current_priority_shared[0]){ + atomicMin(&(gpq->current_priority_shared[0]), my_minimum); } }//end of update_nodes_identify_min -static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ +static void __device__ update_nodes_special_device(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ int thread_id = blockDim.x * blockIdx.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; @@ -168,7 +270,13 @@ static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int } +static void __global__ update_nodes_identify_min(GPUPriorityQueue* gpq, int32_t num_vertices) { + update_nodes_identify_min_device(gpq, num_vertices); +} +static void __global__ update_nodes_special(GPUPriorityQueue* gpq, int32_t num_vertices, gpu_runtime::VertexFrontier output_frontier){ + update_nodes_special_device(gpq, num_vertices, output_frontier); +} } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index ee699371..e39998f0 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -122,7 +122,7 @@ void __device__ edge_only_load_balance_device(GraphT &graph, Ver #define STAGE_1_SIZE (8) #define WARP_SIZE (32) template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> -static void __device__ TWCE_load_balance(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { +static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { int32_t thread_id = blockDim.x * cta_id + threadIdx.x; int32_t lane_id = thread_id % 32; diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index ffa7ea4f..c53fb8f7 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -38,16 +38,19 @@ class VertexFrontier { }; +static void cudaFreeSafe(void* ptr) { + cudaFree(ptr); +} void delete_vertex_frontier(VertexFrontier &frontier) { - cudaFree(frontier.d_sparse_queue_input); - cudaFree(frontier.d_sparse_queue_output); - cudaFree(frontier.d_num_elems_input); - cudaFree(frontier.d_num_elems_output); - cudaFree(frontier.d_byte_map_input); - cudaFree(frontier.d_byte_map_output); - cudaFree(frontier.d_bit_map_input); - cudaFree(frontier.d_bit_map_output); - cudaFree(frontier.d_dedup_counters); + cudaFreeSafe(frontier.d_sparse_queue_input); + cudaFreeSafe(frontier.d_sparse_queue_output); + cudaFreeSafe(frontier.d_num_elems_input); + cudaFreeSafe(frontier.d_num_elems_output); + cudaFreeSafe(frontier.d_byte_map_input); + cudaFreeSafe(frontier.d_byte_map_output); + 
cudaFreeSafe(frontier.d_bit_map_input); + cudaFreeSafe(frontier.d_bit_map_output); + cudaFreeSafe(frontier.d_dedup_counters); return; } static VertexFrontier sentinel_frontier; @@ -180,6 +183,7 @@ static void swap_queues(VertexFrontier &frontier) { cudaMemset(frontier.d_num_elems_output, 0, sizeof(int32_t)); } + static void __device__ swap_queues_device(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; @@ -192,6 +196,20 @@ static void __device__ swap_queues_device(VertexFrontier &frontier) { frontier.d_num_elems_output[0] = 0; this_grid().sync(); } +static void __device__ swap_queues_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + temp = frontier.d_sparse_queue_input; + frontier.d_sparse_queue_input = frontier.d_sparse_queue_output; + frontier.d_sparse_queue_output = temp; + } + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + this_grid().sync(); +} static void swap_bytemaps(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; @@ -221,6 +239,24 @@ static void __device__ swap_bytemaps_device(VertexFrontier &frontier) { parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); this_grid().sync(); } +static void __device__ swap_bytemaps_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + unsigned char* temp2; + temp2 = frontier.d_byte_map_input; + frontier.d_byte_map_input = frontier.d_byte_map_output; + frontier.d_byte_map_output = temp2; + } + this_grid().sync(); + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + this_grid().sync(); + parallel_memset(frontier.d_byte_map_output, 0, sizeof(unsigned char) * frontier.max_num_elems); + this_grid().sync(); +} static void swap_bitmaps(VertexFrontier &frontier) { int32_t *temp = frontier.d_num_elems_input; frontier.d_num_elems_input = frontier.d_num_elems_output; @@ -253,6 +289,25 @@ static void __device__ swap_bitmaps_device(VertexFrontier &frontier) { parallel_memset((unsigned char*)frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); this_grid().sync(); } +static void __device__ swap_bitmaps_device_global(VertexFrontier &frontier) { + if (threadIdx.x + blockIdx.x * blockDim.x == 0) { + int32_t *temp = frontier.d_num_elems_input; + frontier.d_num_elems_input = frontier.d_num_elems_output; + frontier.d_num_elems_output = temp; + + uint32_t* temp2; + temp2 = frontier.d_bit_map_input; + frontier.d_bit_map_input = frontier.d_bit_map_output; + frontier.d_bit_map_output = temp2; + } + + int32_t num_byte_for_bitmap = (frontier.max_num_elems + 8 * sizeof(uint32_t) - 1)/(sizeof(uint32_t) * 8); + + if (threadIdx.x + blockIdx.x * blockDim.x == 0) + frontier.d_num_elems_output[0] = 0; + parallel_memset((unsigned char*)frontier.d_bit_map_output, 0, sizeof(uint32_t) * num_byte_for_bitmap); + this_grid().sync(); +} static void __device__ dedup_frontier_device(VertexFrontier &frontier) { for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { int32_t vid = 
frontier.d_sparse_queue_input[vidx]; diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h index 576182b5..8b4db24c 100644 --- a/src/runtime_lib/infra_gpu/vertex_representation.h +++ b/src/runtime_lib/infra_gpu/vertex_representation.h @@ -63,11 +63,13 @@ static void __device__ vertex_set_prepare_sparse_device(VertexFrontier &frontier generalized_prepare_from_to(frontier); this_grid().sync(); swap_queues_device(frontier); + this_grid().sync(); return; } else if (frontier.format_ready == VertexFrontier::BITMAP) { generalized_prepare_from_to(frontier); this_grid().sync(); swap_queues_device(frontier); + this_grid().sync(); return; } } From 489735b0d7275816594146d8523f671083e4da30 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 8 Nov 2019 14:19:37 -0500 Subject: [PATCH 73/88] Typo in codegen for fused kernel --- src/backend/codegen_gpu/codegen_gpu.cpp | 3 +++ src/runtime_lib/infra_gpu/gpu_priority_queue.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index 134e56ff..ace36435 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -653,6 +653,9 @@ void CodeGenGPUFusedKernel::visit(mir::VarExpr::Ptr var_expr) { if (mir::isa(var_expr->var.getType())) { oss << "__local_" << var_expr->var.getName(); return; + } else if (is_hoisted_var(var_expr->var)) { + oss << "__local_" << var_expr->var.getName(); + return; } else oss << var_expr->var.getName(); } diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index a6d1a09a..51233268 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -118,12 +118,15 @@ template window_upper_ = current_priority_ + delta_; current_priority_ = INT_MAX; + cudaMemcpy(current_priority_shared, ¤t_priority_, sizeof(int32_t), cudaMemcpyHostToDevice); cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); gpu_runtime::cudaCheckLastError(); - + update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); gpu_runtime::cudaCheckLastError(); + cudaMemcpy(&(device_gpq->current_priority_), current_priority_shared, sizeof(int32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(this, device_gpq, sizeof(*this), cudaMemcpyDeviceToHost); gpu_runtime::cudaCheckLastError(); From 842f1d842c0a3769ff47d03538b4a566a6a898a2 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 22 Nov 2019 13:57:05 -0500 Subject: [PATCH 74/88] PageRank blocking issue fixed --- .../graphit/backend/codegen_gpu/codegen_gpu.h | 11 ++- include/graphit/midend/mir_context.h | 4 +- src/backend/codegen_gpu/codegen_gpu.cpp | 83 ++++++++++++------- src/midend/apply_expr_lower.cpp | 4 + .../infra_gpu/gpu_priority_queue.h | 4 + src/runtime_lib/infra_gpu/graph.h | 46 +++++++--- src/runtime_lib/infra_gpu/load_balance.h | 63 ++++++++++++-- src/runtime_lib/infra_gpu/vertex_frontier.h | 12 ++- 8 files changed, 174 insertions(+), 53 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index 9e7d54a1..b0729849 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -80,9 +80,16 @@ class CodeGenGPU: public mir::MIRVisitor{ std::vector kernel_hoisted_vars; std::string current_kernel_name; bool is_hoisted_var (mir::Var var) { - for 
(auto h_var: kernel_hoisted_vars) - if (h_var.getName() == var.getName()) + std::string var_name = var.getName(); + + size_t dot_pos = var_name.find("."); + if (dot_pos != std::string::npos) { + var_name.resize(dot_pos); + } + for (auto h_var: kernel_hoisted_vars) { + if (h_var.getName() == var_name) return true; + } return false; } diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index 02384b4c..dffa38d0 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -464,7 +464,9 @@ namespace graphit { // Used by kernel fusion optimization std::vector fused_while_loops; std::vector hybrid_gpu_stmts; - + + // Used by blocking optimization + std::unordered_map graphs_with_blocking; }; } diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index ace36435..f2f2672b 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -177,6 +177,8 @@ void CodeGenGPU::genFusedWhileLoop(mir::WhileStmt::Ptr while_stmt) { } codegen.kernel_hoisted_vars = extractor.hoisted_vars; codegen.current_kernel_name = fused_kernel_name; + for (auto var: extractor.hoisted_pqs) + codegen.kernel_hoisted_vars.push_back(var); oss << "void __global__ " << fused_kernel_name << "(void) {" << std::endl; codegen.indent(); @@ -342,21 +344,22 @@ void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { printIndent(); oss << "// Body of the actual operator" << std::endl; // Before we generate the call to the UDF, we have to check if the dst is on the input frontier - - if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { - printIndent(); - oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; - indent(); - printIndent(); - oss << "return;" << std::endl; - dedent(); - } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { - printIndent(); - oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; - indent(); - printIndent(); - oss << "return;" << std::endl; - dedent(); + if (apply_expr->from_func != "") { + if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } } mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); @@ -403,6 +406,16 @@ void CodeGenGPU::genEdgeSets(void) { oss << " __device__ " << edgeset->name << ";" << std::endl; edge_set_type->accept(this); oss << " " << "__host_" << edgeset->name << ";" << std::endl; + + + if (mir_context_->graphs_with_blocking.find(edgeset->name) != mir_context_->graphs_with_blocking.end()) { + uint32_t blocking_size = mir_context_->graphs_with_blocking[edgeset->name]; + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; + 
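// Alongside the __device__ blocked graph, emit a matching __host_ mirror; the host fills it
// with gpu_runtime::block_graph_edges() and copies it to the device symbol via
// cudaMemcpyToSymbol (see the mir::FuncDecl visit further below).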
edge_set_type->accept(this); + oss << " " << "__host_" << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; + } } } @@ -412,7 +425,7 @@ void CodeGenGPU::visit(mir::EdgeSetType::Ptr edgeset_type) { edgeset_type->weight_type->accept(this); oss << ">"; } else { - oss << "gpu_runtime::GraphT"; + oss << "gpu_runtime::GraphT"; } } @@ -519,6 +532,15 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { printIndent(); oss << "cudaMemcpyToSymbol("; oss << var_name << ", &__host_" << var_name << ", sizeof(__host_" << var_name << "), 0, cudaMemcpyHostToDevice);" << std::endl; + if (mir_context_->graphs_with_blocking.find(var_name) != mir_context_->graphs_with_blocking.end()) { + uint32_t blocking_size = mir_context_->graphs_with_blocking[var_name]; + printIndent(); + oss << "gpu_runtime::block_graph_edges(__host_" << var_name << ", __host_" << var_name << "__blocked_" << blocking_size << ", " << blocking_size << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__blocked_" << blocking_size << ", &__host_" << var_name << "__blocked_" << blocking_size << ", sizeof(__host_" << var_name << "__blocked_" << blocking_size << "), 0, cudaMemcpyHostToDevice);" << std::endl; + } + } for (auto constant: mir_context_->getLoweredConstants()) { @@ -579,22 +601,19 @@ void CodeGenGPU::genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr puo) } oss << "("; oss << "&("; - puom->priority_queue->accept(this); - oss << ".device_priorities_["; + //puom->priority_queue->accept(this); + oss << "__output_frontier.d_priority_array["; puom->destination_node_id->accept(this); oss << "]), "; puom->new_val->accept(this); oss << ")"; } oss << " && "; - puo->priority_queue->accept(this); - oss << ".device_priorities_["; + oss << "__output_frontier.d_priority_array["; puo->destination_node_id->accept(this); oss << "] < ("; - puo->priority_queue->accept(this); - oss << ".current_priority_ + "; - puo->priority_queue->accept(this); - oss << ".delta_)"; + //puo->priority_queue->accept(this); + oss << "__output_frontier.priority_cutoff)"; oss << ") {" << std::endl; indent(); @@ -603,7 +622,7 @@ void CodeGenGPU::genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr puo) evp->vertex_id = puo->destination_node_id; mir::VarExpr::Ptr var_expr = mir::to(puo->priority_queue); // Since this variable is created temporarily, we don;t need type - mir::Var var(var_expr->var.getName() + ".frontier_", nullptr); + mir::Var var("__output_frontier", nullptr); mir::VarExpr::Ptr frontier_expr = std::make_shared(); frontier_expr->var = var; @@ -650,10 +669,7 @@ void CodeGenGPUHost::visit(mir::VarExpr::Ptr var_expr) { } void CodeGenGPUFusedKernel::visit(mir::VarExpr::Ptr var_expr) { - if (mir::isa(var_expr->var.getType())) { - oss << "__local_" << var_expr->var.getName(); - return; - } else if (is_hoisted_var(var_expr->var)) { + if (is_hoisted_var(var_expr->var)) { oss << "__local_" << var_expr->var.getName(); return; } else @@ -734,7 +750,7 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: mir::Var target_var = mir::to(esae->target)->var; mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); if (target_type->weight_type == nullptr) - oss << "int32_t"; + oss << "char"; else target_type->weight_type->accept(this); @@ -749,6 +765,9 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: oss << ", " << esae->device_function << ", " << accessor_type << ", " << src_filter << ">("; esae->target->accept(this); + if 
(esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && esae->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { + oss << "__blocked_" << esae->applied_schedule.edge_blocking_size; + } oss << ", "; if (esae->from_func != "") oss << esae->from_func; @@ -854,6 +873,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, insertUsedPq(upesae->priority_queue_used); } if (mir::isa(esae)) { +/* mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); printIndent(); oss << "if (_thread_id == 0) {" << std::endl; @@ -866,6 +886,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, printIndent(); oss << "_grid.sync();" << std::endl; //oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; +*/ } printIndent(); oss << load_balance_function << "_device<"; @@ -873,7 +894,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Var target_var = mir::to(esae->target)->var; mir::EdgeSetType::Ptr target_type = mir::to(target_var.getType()); if (target_type->weight_type == nullptr) - oss << "int32_t"; + oss << "char"; else target_type->weight_type->accept(this); diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 24ebecf4..0e6de014 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -199,6 +199,10 @@ namespace graphit { node = std::make_shared(edgeset_apply); else assert(false && "Invalid option for direction\n"); + + if (edgeset_apply->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && edgeset_apply->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { + mir_context_->graphs_with_blocking[mir::to(edgeset_apply->target)->var.getName()] = edgeset_apply->applied_schedule.edge_blocking_size; + } } else { // No schedule is attached, lower using default schedule diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 51233268..9bc59b8a 100644 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -40,6 +40,8 @@ template delta_ = delta; ready_set_dequeued = false; frontier_ = gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices(graph)); + frontier_.d_priority_array = device_priorities; + frontier_.priority_cutoff = current_priority_ + delta_; cudaMalloc(¤t_priority_shared, sizeof(PriorityT_)); if (initial_node != -1){ gpu_runtime::builtin_addVertex(frontier_, initial_node); @@ -136,6 +138,7 @@ template frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; //Now that we dequeued it, the next ready set is no longer dequeued + frontier_.priority_cutoff = current_priority_ + delta_; ready_set_dequeued = false; return frontier_; } @@ -167,6 +170,7 @@ template gpu_runtime::swap_queues_device(frontier_); frontier_.format_ready = gpu_runtime::VertexFrontier::SPARSE; ready_set_dequeued = false; + frontier_.priority_cutoff = current_priority_ + delta_; return frontier_; } return frontier_; diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 3cb461bf..a410009d 100644 --- a/src/runtime_lib/infra_gpu/graph.h +++ 
b/src/runtime_lib/infra_gpu/graph.h @@ -53,6 +53,13 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t *strict_sum; int32_t *strict_cta_sum; int32_t *strict_grid_sum; + + + // blocking related parameters + int32_t num_buckets; + int32_t *h_bucket_sizes; + int32_t *d_bucket_sizes; + }; void consume(int32_t _) { @@ -84,37 +91,54 @@ static void block_graph_edges(GraphT &input_graph, GraphT &graph, std::string filename, bool CONSUME(fread(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); CONSUME(fread(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); - CONSUME(fread(graph.h_edge_weight, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fread(graph.h_edge_weight, sizeof(EdgeWeightType), graph.num_edges, bin_file)); CONSUME(fread(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); fclose(bin_file); @@ -180,7 +204,7 @@ static void load_graph(GraphT &graph, std::string filename, bool CONSUME(fwrite(&graph.num_edges, sizeof(int32_t), 1, bin_file)); CONSUME(fwrite(graph.h_edge_src, sizeof(int32_t), graph.num_edges, bin_file)); CONSUME(fwrite(graph.h_edge_dst, sizeof(int32_t), graph.num_edges, bin_file)); - CONSUME(fwrite(graph.h_edge_weight, sizeof(int32_t), graph.num_edges, bin_file)); + CONSUME(fwrite(graph.h_edge_weight, sizeof(EdgeWeightType), graph.num_edges, bin_file)); CONSUME(fwrite(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); fclose(bin_file); } diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index e39998f0..c8abe104 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -3,6 +3,7 @@ #include "infra_gpu/graph.h" #include "infra_gpu/vertex_frontier.h" +#include "infra_gpu/gpu_priority_queue.h" #include using namespace cooperative_groups; @@ -29,6 +30,7 @@ static void __global__ vertex_set_apply_kernel(VertexFrontier frontier) { // VERTEX BASED LOAD BALANCE FUNCTIONS template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __device__ vertex_based_load_balance(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier, unsigned int cta_id, unsigned int num_cta) { + int32_t vid = threadIdx.x + blockDim.x * cta_id; if (vid >= AccessorType::getSize(input_frontier)) return; @@ -39,11 +41,27 @@ void __device__ vertex_based_load_balance(GraphT &graph, VertexF int32_t dst = graph.d_edge_dst[eid]; load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); } + +/* + int32_t total_vertices = AccessorType::getSize(input_frontier); + for (int32_t vidx = threadIdx.x + blockDim.x * cta_id; vidx < total_vertices; vidx += num_cta * blockDim.x) { + int32_t src = AccessorType::getElement(input_frontier, vidx); + for (int32_t eid = graph.d_src_offsets[src]; eid < graph.d_src_offsets[src+1]; eid++) { + if (src_filter(src) == false) + break; + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +*/ } template void __host__ vertex_based_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { + int32_t num_threads = AccessorType::getSizeHost(frontier); num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; + + //num_cta = NUM_CTA; cta_size = CTA_SIZE; } template @@ -90,6 +108,21 @@ static void __device__ edge_only_load_balance(GraphT &graph, Ver } } } + +template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool 
src_filter(int32_t)> +static void __device__ edge_only_load_balance_blocked(GraphT &graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta, int32_t index) { + int32_t thread_id = blockDim.x * cta_id + threadIdx.x; + int32_t total_threads = blockDim.x * total_cta; + int32_t starting_edge = index == 0?0:graph.d_bucket_sizes[index-1]; + int32_t ending_edge = graph.d_bucket_sizes[index]; + for (int32_t eid = thread_id + starting_edge; eid < ending_edge; eid += total_threads) { + int32_t src = graph.d_edge_src[eid]; + if (src_filter(src) == true) { + int32_t dst = graph.d_edge_dst[eid]; + load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } + } +} template void __host__ edge_only_load_balance_info(VertexFrontier &frontier, int32_t &num_cta, int32_t &cta_size) { num_cta = NUM_CTA; @@ -105,6 +138,14 @@ void __global__ edge_only_load_balance_kernel(GraphT graph, Vert edge_only_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); } +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __global__ edge_only_load_balance_blocked_kernel(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier) { + for (int32_t index = 0; index < graph.num_buckets; index++) { + edge_only_load_balance_blocked(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x, index); + __syncthreads(); + } +} + template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __host__ edge_only_load_balance_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { int32_t num_cta, cta_size; @@ -112,6 +153,13 @@ void __host__ edge_only_load_balance_host(GraphT &graph, VertexF edge_only_load_balance_kernel<<>>(graph, input_frontier, output_frontier); } +template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> +void __host__ edge_only_load_balance_blocked_host(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { + int32_t num_cta = NUM_CTA; + int32_t cta_size = CTA_SIZE; + edge_only_load_balance_blocked_kernel<<>>(graph, input_frontier, output_frontier); +} + template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> void __device__ edge_only_load_balance_device(GraphT &graph, VertexFrontier &input_frontier, VertexFrontier &output_frontier) { vertex_based_load_balance(graph, input_frontier, output_frontier, blockIdx.x, gridDim.x); @@ -122,6 +170,7 @@ void __device__ edge_only_load_balance_device(GraphT &graph, Ver #define STAGE_1_SIZE (8) #define WARP_SIZE (32) template , int32_t, int32_t, int32_t, VertexFrontier, VertexFrontier), typename AccessorType, bool src_filter(int32_t)> + static void __device__ TWCE_load_balance(GraphT graph, VertexFrontier input_frontier, VertexFrontier output_frontier, unsigned int cta_id, unsigned int total_cta) { int32_t thread_id = blockDim.x * cta_id + threadIdx.x; @@ -182,7 +231,7 @@ static void __device__ TWCE_load_balance(GraphT graph, VertexFro if (src_filter(local_vertex) == false) break; int32_t dst = graph.d_edge_dst[neigh_id]; - load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); + load_balance_payload(graph, local_vertex, dst, neigh_id, input_frontier, output_frontier); } } @@ -624,13 +673,16 @@ void __host__ TWC_load_balance_host(GraphT &graph, VertexFrontie cudaMemcpy(twc_bin_sizes, graph.twc_bin_sizes, 3 * sizeof(int32_t), cudaMemcpyDeviceToHost); num_threads = 
twc_bin_sizes[0]; num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - TWC_small_bin_kernel<<>>(graph, input_frontier, output_frontier); + if (num_cta) + TWC_small_bin_kernel<<>>(graph, input_frontier, output_frontier); num_threads = twc_bin_sizes[1] * MID_BIN; num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - TWC_mid_bin_kernel<<>>(graph, input_frontier, output_frontier); + if (num_cta) + TWC_mid_bin_kernel<<>>(graph, input_frontier, output_frontier); num_threads = twc_bin_sizes[2] * LARGE_BIN; num_cta = (num_threads + CTA_SIZE-1)/CTA_SIZE; - TWC_large_bin_kernel<<>>(graph, input_frontier, output_frontier); + if (num_cta) + TWC_large_bin_kernel<<>>(graph, input_frontier, output_frontier); } template load_balance_payload, typename AccessorType, bool src_filter(int32_t)> @@ -840,7 +892,8 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie int32_t row_size = end_row - start_row + 1; //int32_t start_idx; - if(row_size <= STRICT_SM_SIZE) { + //if(row_size <= STRICT_SM_SIZE) { + if(row_size <= -1 ) { if(threadIdx.x < row_size) { index = AccessorType::getElement(input_frontier, start_row+threadIdx.x); //deg = graph.d_get_degree(index); diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index c53fb8f7..d8be84ec 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -3,6 +3,9 @@ #include "infra_gpu/support.h" #include +#ifndef FRONTIER_MULTIPLIER +#define FRONTIER_MULTIPLIER (6) +#endif using namespace cooperative_groups; namespace gpu_runtime { class VertexFrontier { @@ -34,7 +37,10 @@ class VertexFrontier { }; format_ready_type format_ready; - + + // PriorityQueue related trackers + int32_t* d_priority_array; + int32_t priority_cutoff; }; @@ -101,8 +107,8 @@ static VertexFrontier create_new_vertex_set(int32_t num_vertices, int32_t init_e frontier.max_num_elems = num_vertices; cudaMalloc(&frontier.d_num_elems_input, sizeof(int32_t)); cudaMalloc(&frontier.d_num_elems_output, sizeof(int32_t)); - cudaMalloc(&frontier.d_sparse_queue_input, sizeof(int32_t) * num_vertices * 6); - cudaMalloc(&frontier.d_sparse_queue_output, sizeof(int32_t) * num_vertices * 6); + cudaMalloc(&frontier.d_sparse_queue_input, sizeof(int32_t) * num_vertices * FRONTIER_MULTIPLIER); + cudaMalloc(&frontier.d_sparse_queue_output, sizeof(int32_t) * num_vertices * FRONTIER_MULTIPLIER); if (num_vertices == init_elems) { initialize_frontier_all<<>>(frontier); From 98c468c34a4d67d6bf3979bba8b381a599e22dc0 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 2 Jan 2020 16:33:10 -0500 Subject: [PATCH 75/88] Fixed some issues for BFS --- src/runtime_lib/infra_gpu/gpu_priority_queue.h | 0 src/runtime_lib/infra_gpu/graph.h | 17 +++++++++++++---- src/runtime_lib/infra_gpu/load_balance.h | 1 + src/runtime_lib/infra_gpu/printer.h | 0 src/runtime_lib/infra_gpu/support.h | 0 .../infra_gpu/vertex_representation.h | 0 6 files changed, 14 insertions(+), 4 deletions(-) mode change 100644 => 100755 src/runtime_lib/infra_gpu/gpu_priority_queue.h mode change 100644 => 100755 src/runtime_lib/infra_gpu/graph.h mode change 100644 => 100755 src/runtime_lib/infra_gpu/printer.h mode change 100644 => 100755 src/runtime_lib/infra_gpu/support.h mode change 100644 => 100755 src/runtime_lib/infra_gpu/vertex_representation.h diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h old mode 100644 new mode 100755 diff --git a/src/runtime_lib/infra_gpu/graph.h 
b/src/runtime_lib/infra_gpu/graph.h old mode 100644 new mode 100755 index a410009d..4c89ea8d --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -8,6 +8,10 @@ #define IGNORE_JULIENNE_TYPES #include "infra_gapbs/benchmark.h" #include "infra_gpu/vertex_frontier.h" +#include "graphit_timer.h" +#ifndef FRONTIER_MULTIPLIER + #define FRONTIER_MULTIPLIER (6) +#endif namespace gpu_runtime { template @@ -39,6 +43,10 @@ struct GraphT { // Field names are according to CSR, reuse for CSC full_frontier.max_num_elems = num_vertices; return full_frontier; } + VertexFrontier& __device__ getFullFrontierDevice(void) { + full_frontier.max_num_elems = num_vertices; + return full_frontier; + } // Load balance scratch pads @@ -123,6 +131,7 @@ static void block_graph_edges(GraphT &input_graph, GraphT &graph, std::string filename, bool cudaMemcpy(graph.d_src_offsets, graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); //std::cout << filename << " (" << graph.num_vertices << ", " << graph.num_edges << ")" << std::endl; - cudaMalloc(&graph.twc_small_bin, graph.num_vertices * 6 * sizeof(int32_t)); - cudaMalloc(&graph.twc_mid_bin, graph.num_vertices * 6 * sizeof(int32_t)); - cudaMalloc(&graph.twc_large_bin, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.twc_small_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.twc_mid_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&graph.twc_large_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); cudaMalloc(&graph.twc_bin_sizes, 3 * sizeof(int32_t)); - cudaMalloc(&graph.strict_sum, graph.num_vertices * 6 * sizeof(int32_t)); + cudaMalloc(&graph.strict_sum, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); cudaMalloc(&graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); cudaMalloc(&graph.strict_grid_sum, sizeof(int32_t)); diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index c8abe104..28040ddb 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -40,6 +40,7 @@ void __device__ vertex_based_load_balance(GraphT &graph, VertexF break; int32_t dst = graph.d_edge_dst[eid]; load_balance_payload(graph, src, dst, eid, input_frontier, output_frontier); + } /* diff --git a/src/runtime_lib/infra_gpu/printer.h b/src/runtime_lib/infra_gpu/printer.h old mode 100644 new mode 100755 diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h old mode 100644 new mode 100755 diff --git a/src/runtime_lib/infra_gpu/vertex_representation.h b/src/runtime_lib/infra_gpu/vertex_representation.h old mode 100644 new mode 100755 From 3b854e617c81490389df8bda88b662f075eddd8f Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 28 Feb 2020 16:47:42 -0500 Subject: [PATCH 76/88] Added GPU autotuner --- autotune/compile_gpu.sh | 2 + autotune/gpu_apps/sssp_delta_stepping.gt | 38 ++++ autotune/graphit_gpu_autotuner.py | 211 +++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 autotune/compile_gpu.sh create mode 100644 autotune/gpu_apps/sssp_delta_stepping.gt create mode 100644 autotune/graphit_gpu_autotuner.py diff --git a/autotune/compile_gpu.sh b/autotune/compile_gpu.sh new file mode 100644 index 00000000..85939921 --- /dev/null +++ b/autotune/compile_gpu.sh @@ -0,0 +1,2 @@ +python ../build/bin/graphitc.py -a algotorun.gt -f schedule_0 -o test.cu +/usr/local/cuda/bin/nvcc -ccbin 
/usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=80 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_70,code=sm_70 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=3 diff --git a/autotune/gpu_apps/sssp_delta_stepping.gt b/autotune/gpu_apps/sssp_delta_stepping.gt new file mode 100644 index 00000000..0cb31c7f --- /dev/null +++ b/autotune/gpu_apps/sssp_delta_stepping.gt @@ -0,0 +1,38 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end diff --git a/autotune/graphit_gpu_autotuner.py b/autotune/graphit_gpu_autotuner.py new file mode 100644 index 00000000..c2e3870e --- /dev/null +++ b/autotune/graphit_gpu_autotuner.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python +# +# Autotune schedules for DeltaStepping in the GraphIt language +# + +# import adddeps # fix sys.path +import opentuner +from opentuner import ConfigurationManipulator +from opentuner import EnumParameter +from opentuner import IntegerParameter +from opentuner import MeasurementInterface +from opentuner import Result +from sys import exit +import argparse + +py_graphitc_file = "../build/bin/graphitc.py" +serial_compiler = "g++" + +#if using icpc for par_compiler, the compilation flags for CILK and OpenMP needs to be changed +par_compiler = "g++" + +class GraphItTuner(MeasurementInterface): + new_schedule_file_name = '' + # a flag for testing if NUMA-aware schedule is specified + + + def manipulator(self): + """ + Define the search space by creating a + ConfigurationManipulator + """ + + + + manipulator = ConfigurationManipulator() + manipulator.add_parameter( + EnumParameter('LB', + ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) + + #'edge-aware-dynamic-vertex-parallel' not supported with the latest g++ cilk implementation + manipulator.add_parameter(EnumParameter('direction', ['PUSH', 'PULL'])) + manipulator.add_parameter(EnumParameter('dedup', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(EnumParameter('frontier_output', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) + # adding new parameters for PriorityGraph (Ordered GraphIt) + manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta)) + + manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED'])) + manipulator.add_parameter(EnumParameter('pull_rep', ['BITMAP', 'BOOLMAP'])) + return manipulator + + + def write_cfg_to_schedule(self, cfg): + #write into a schedule file the configuration + direction = 
cfg['direction'] + delta = cfg['delta'] + dedup = cfg['dedup'] + frontier_output = cfg['frontier_output'] + kernel_fusion = cfg['kernel_fusion'] + pull_rep = cfg['pull_rep'] + LB = cfg['LB'] + + new_schedule = "schedule:\n" + new_schedule += "SimpleGPUSchedule s1;\n"; + new_schedule += "s1.configLoadBalance(" + LB + ");\n" + new_schedule += "s1.configFrontierCreation(" + frontier_output + ");\n" + if direction == "PULL": + new_schedule += "s1.configDirection(PULL, " + pull_rep + ");\n" + else: + new_schedule += "s1.configDirection(PUSH);\n" + new_schedule += "s1.configDelta(" + str(delta) + ");\n" + new_schedule += "s1.configDeduplication(" + dedup + ");\n" + new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n" + new_schedule += "SimpleGPUSchedule s0;\n" + new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n" + # We will currently not apply this. Use this after kernel fusion is fixed + #new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n" + + print (cfg) + print (new_schedule) + + self.new_schedule_file_name = 'schedule_0' + print (self.new_schedule_file_name) + f1 = open (self.new_schedule_file_name, 'w') + f1.write(new_schedule) + f1.close() + + def compile(self, cfg, id): + """ + Compile a given configuration in parallel + """ + try: + self.call_program("cp " + self.args.algo_file + " algotorun.gt") + return self.call_program("bash compile_gpu.sh") + except: + print ("fail to compiler .gt file") + self.call_program("false") + + + def parse_running_time(self, log_file_name='test.out'): + """Returns the elapsed time only, from the HPL output file""" + + min_time = 10000 + + with open(log_file_name) as f: + content = f.readlines() + content = [x.strip() for x in content] + i = 0; + for line in content: + if line.find("elapsed time") != -1: + next_line = content[i+1] + time_str = next_line.strip() + time = float(time_str) + if time < min_time: + min_time = time + i = i+1; + + return min_time + + def run_precompiled(self, desired_result, input, limit, compile_result, id): + """ + Run a compile_result from compile() sequentially and return performance + """ + + cfg = desired_result.configuration.data + + if compile_result['returncode'] != 0: + print (str(compile_result)) + + assert compile_result['returncode'] == 0 + try: + run_cmd = "./test " + self.args.graph + " " + self.args.start_vertex + " > test.out" + print ("run_cmd: " + run_cmd) + + # default value -1 for memory_limit translates into None (no memory upper limit) + # setting memory limit does not quite work yet + process_memory_limit = None + if self.args.memory_limit != -1: + process_memory_limit = self.args.memory_limit + # print ("memory limit: " + str(process_memory_limit)) + run_result = self.call_program(run_cmd, limit=self.args.runtime_limit, memory_limit=process_memory_limit) + finally: + pass + + #self.call_program('rm test') + #self.call_program('rm test.cpp') + + if run_result['timeout'] == True: + val = self.args.runtime_limit + else: + val = self.parse_running_time(); + + self.call_program('rm test.out') + print ("run result: " + str(run_result)) + print ("running time: " + str(val)) + + if run_result['timeout'] == True: + print ("Timed out after " + str(self.args.runtime_limit) + " seconds") + return opentuner.resultsdb.models.Result(time=val) + elif run_result['returncode'] != 0: + if self.args.killed_process_report_runtime_limit == 1 and run_result['stderr'] == 'Killed\n' or True: + print ("process killed " + str(run_result)) + return 
opentuner.resultsdb.models.Result(time=self.args.runtime_limit) + else: + print (str(run_result)) + exit() + else: + return opentuner.resultsdb.models.Result(time=val) + + + + + def compile_and_run(self, desired_result, input, limit): + """ + Compile and run a given configuration then + return performance + """ + print ("input graph: " + self.args.graph) + + cfg = desired_result.configuration.data + + + self.write_cfg_to_schedule(cfg) + + # this pases in the id 0 for the configuration + compile_result = self.compile(cfg, 0) + # print "compile_result: " + str(compile_result) + return self.run_precompiled(desired_result, input, limit, compile_result, 0) + + + def save_final_config(self, configuration): + """called at the end of tuning""" + print ('Final Configuration:', configuration.data) + self.manipulator().save_to_file(configuration.data,'final_config.json') + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(parents=opentuner.argparsers()) + parser.add_argument('--graph', type=str, default="", help='the graph to tune on') + parser.add_argument('--start_vertex', type=str, default="0", help="Start vertex if applicable") + + parser.add_argument('--algo_file', type=str, required=True, help='input algorithm file') + parser.add_argument('--default_schedule_file', type=str, required=False, default="", help='default schedule file') + parser.add_argument('--runtime_limit', type=float, default=300, help='a limit on the running time of each program') + parser.add_argument('--max_delta', type=int, default=800000, help='maximum delta used for priority coarsening') + parser.add_argument('--memory_limit', type=int, default=-1,help='set memory limit on unix based systems [does not quite work yet]') + parser.add_argument('--killed_process_report_runtime_limit', type=int, default=0, help='reports runtime_limit when a process is killed by the shell. 
0 for disable (default), 1 for enable') + args = parser.parse_args() + # pass the argumetns into the tuner + GraphItTuner.main(args) + From 2dc18274e922211bc2501204d244b7a99b7312de Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 5 Mar 2020 10:31:17 -0500 Subject: [PATCH 77/88] Added Pointer jumping version of cc --- apps/cc_pjump.gt | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 apps/cc_pjump.gt diff --git a/apps/cc_pjump.gt b/apps/cc_pjump.gt new file mode 100644 index 00000000..04c5b18b --- /dev/null +++ b/apps/cc_pjump.gt @@ -0,0 +1,62 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + program->applyGPUSchedule("s1", s1); From 6754d6894fe146101820ccc7a0df2e3ee1e7e322 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 5 Mar 2020 10:32:25 -0500 Subject: [PATCH 78/88] Removed schedule from pjump --- apps/cc_pjump.gt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/apps/cc_pjump.gt b/apps/cc_pjump.gt index 04c5b18b..152dd81b 100644 --- a/apps/cc_pjump.gt +++ b/apps/cc_pjump.gt @@ -53,10 +53,3 @@ func main() end -% specify schedules here or use a separate schedule file -schedule: - SimpleGPUSchedule s1; - s1.configLoadBalance(TWCE); - s1.configDeduplication(ENABLED); - s1.configFrontierCreation(UNFUSED_BOOLMAP); - program->applyGPUSchedule("s1", s1); From 08c3ae88405ede5d27eef12da97c810431b94735 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 21 Apr 2020 19:00:16 -0400 Subject: [PATCH 79/88] BC changes and autotuner support for GPU backend --- .gitignore | 3 + autotune/compile_gpu.sh | 1 + autotune/gpu_apps/bfs.gt | 41 +++ autotune/gpu_apps/cc.gt | 55 ++++ autotune/gpu_apps/pagerank.gt | 53 ++++ autotune/graphit_gpu_autotuner.py | 138 ++++++-- .../codegen_gpu/assign_function_context.h | 1 + .../graphit/backend/codegen_gpu/codegen_gpu.h | 11 +- include/graphit/frontend/gpu_schedule.h | 41 ++- include/graphit/midend/apply_expr_lower.h | 1 + include/graphit/midend/mir.h | 14 + include/graphit/midend/mir_context.h | 1 + .../codegen_gpu/assign_function_context.cpp | 4 + src/backend/codegen_gpu/codegen_gpu.cpp | 294 +++++++++++++++--- src/frontend/parser.cpp | 3 + src/midend/apply_expr_lower.cpp | 
113 ++++++- src/midend/gpu_change_tracking_lower.cpp | 6 + src/midend/gpu_priority_features_lowering.cpp | 3 + src/midend/while_loop_fusion.cpp | 5 +- src/runtime_lib/gpu_intrinsics.h | 1 + .../infra_gpu/gpu_priority_queue.h | 1 - src/runtime_lib/infra_gpu/graph.h | 85 ++++- src/runtime_lib/infra_gpu/list.h | 108 +++++++ src/runtime_lib/infra_gpu/load_balance.h | 21 +- src/runtime_lib/infra_gpu/support.h | 13 + src/runtime_lib/infra_gpu/vertex_frontier.h | 50 +++ test/verifiers/bc_verifier.cpp | 2 +- 27 files changed, 968 insertions(+), 101 deletions(-) create mode 100644 autotune/gpu_apps/bfs.gt create mode 100644 autotune/gpu_apps/cc.gt create mode 100644 autotune/gpu_apps/pagerank.gt create mode 100644 src/runtime_lib/infra_gpu/list.h diff --git a/.gitignore b/.gitignore index d7c708a1..8692d055 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ .settings/ .idea/ build/ +autotune/*.json +*.graphit_bin +*.graphit_sbin diff --git a/autotune/compile_gpu.sh b/autotune/compile_gpu.sh index 85939921..09475db8 100644 --- a/autotune/compile_gpu.sh +++ b/autotune/compile_gpu.sh @@ -1,2 +1,3 @@ python ../build/bin/graphitc.py -a algotorun.gt -f schedule_0 -o test.cu /usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=80 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_70,code=sm_70 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=3 +#/usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=60 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_61,code=sm_61 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=2 diff --git a/autotune/gpu_apps/bfs.gt b/autotune/gpu_apps/bfs.gt new file mode 100644 index 00000000..5025d779 --- /dev/null +++ b/autotune/gpu_apps/bfs.gt @@ -0,0 +1,41 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + diff --git a/autotune/gpu_apps/cc.gt b/autotune/gpu_apps/cc.gt new file mode 100644 index 00000000..05422e0d --- /dev/null +++ b/autotune/gpu_apps/cc.gt @@ -0,0 +1,55 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : 
Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + diff --git a/autotune/gpu_apps/pagerank.gt b/autotune/gpu_apps/pagerank.gt new file mode 100644 index 00000000..c171e078 --- /dev/null +++ b/autotune/gpu_apps/pagerank.gt @@ -0,0 +1,53 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(float) = 1.0/vertices.size(); +const new_rank : vector{Vertex}(float) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(float) = 0.0; +const error : vector{Vertex}(float) = 0.0; +const damp : float = 0.85; +const beta_score : float = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : float = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + #s0# for i in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + end +end + diff --git a/autotune/graphit_gpu_autotuner.py b/autotune/graphit_gpu_autotuner.py index c2e3870e..ccaa070b 100644 --- a/autotune/graphit_gpu_autotuner.py +++ b/autotune/graphit_gpu_autotuner.py @@ -29,57 +29,117 @@ def manipulator(self): Define the search space by creating a ConfigurationManipulator """ + manipulator = ConfigurationManipulator() + if self.args.edge_only: + #manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT', 'EDGE_ONLY'])) + manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'EDGE_ONLY'])) + manipulator.add_parameter(EnumParameter('EB_0', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(IntegerParameter('BS_0', 1, 20)) + else: + #manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) + manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM'])) + manipulator.add_parameter(EnumParameter('direction_0', ['PUSH', 'PULL'])) + manipulator.add_parameter(EnumParameter('dedup_0', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(EnumParameter('frontier_output_0', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) + manipulator.add_parameter(EnumParameter('pull_rep_0', ['BITMAP', 
'BOOLMAP'])) + if self.args.hybrid_schedule: + #manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) + manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM'])) + + manipulator.add_parameter(EnumParameter('direction_1', ['PUSH', 'PULL'])) + manipulator.add_parameter(EnumParameter('dedup_1', ['ENABLED', 'DISABLED'])) + manipulator.add_parameter(EnumParameter('frontier_output_1', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) + manipulator.add_parameter(EnumParameter('pull_rep_1', ['BITMAP', 'BOOLMAP'])) + + # We also choose the hybrid schedule threshold here + manipulator.add_parameter(IntegerParameter('threshold', 0, 1000)) + + - manipulator = ConfigurationManipulator() - manipulator.add_parameter( - EnumParameter('LB', - ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT'])) - - #'edge-aware-dynamic-vertex-parallel' not supported with the latest g++ cilk implementation - manipulator.add_parameter(EnumParameter('direction', ['PUSH', 'PULL'])) - manipulator.add_parameter(EnumParameter('dedup', ['ENABLED', 'DISABLED'])) - manipulator.add_parameter(EnumParameter('frontier_output', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP'])) # adding new parameters for PriorityGraph (Ordered GraphIt) - manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta)) + # Currently since delta is allowed to be configured only once for the entire program, we will make a single decision even if the schedule is hybrid + if self.args.tune_delta: + manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta)) + + + if self.args.kernel_fusion: + manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED'])) - manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED'])) - manipulator.add_parameter(EnumParameter('pull_rep', ['BITMAP', 'BOOLMAP'])) return manipulator def write_cfg_to_schedule(self, cfg): #write into a schedule file the configuration - direction = cfg['direction'] - delta = cfg['delta'] - dedup = cfg['dedup'] - frontier_output = cfg['frontier_output'] - kernel_fusion = cfg['kernel_fusion'] - pull_rep = cfg['pull_rep'] - LB = cfg['LB'] + + direction_0 = cfg['direction_0'] + if self.args.tune_delta: + delta_0 = cfg['delta'] + dedup_0 = cfg['dedup_0'] + frontier_output_0 = cfg['frontier_output_0'] + pull_rep_0 = cfg['pull_rep_0'] + LB_0 = cfg['LB_0'] new_schedule = "schedule:\n" + new_schedule += "SimpleGPUSchedule s1;\n"; - new_schedule += "s1.configLoadBalance(" + LB + ");\n" - new_schedule += "s1.configFrontierCreation(" + frontier_output + ");\n" - if direction == "PULL": - new_schedule += "s1.configDirection(PULL, " + pull_rep + ");\n" + if LB_0 == "EDGE_ONLY" and cfg['EB_0'] == "ENABLED": + new_schedule += "s1.configLoadBalance(EDGE_ONLY, BLOCKED, " + str(int(int(self.args.num_vertices)/cfg['BS_0'])) + ");\n" + direction_0 = "PUSH" + else: + new_schedule += "s1.configLoadBalance(" + LB_0 + ");\n" + new_schedule += "s1.configFrontierCreation(" + frontier_output_0 + ");\n" + if direction_0 == "PULL": + new_schedule += "s1.configDirection(PULL, " + pull_rep_0 + ");\n" else: new_schedule += "s1.configDirection(PUSH);\n" - new_schedule += "s1.configDelta(" + str(delta) + ");\n" - new_schedule += "s1.configDeduplication(" + dedup + ");\n" - new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n" - new_schedule += "SimpleGPUSchedule s0;\n" - new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n" - # We will currently not apply this. 
Use this after kernel fusion is fixed - #new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n" + if self.args.tune_delta: + new_schedule += "s1.configDelta(" + str(delta_0) + ");\n" + new_schedule += "s1.configDeduplication(" + dedup_0 + ");\n" + + if self.args.hybrid_schedule: + direction_1 = cfg['direction_1'] + if self.args.tune_delta: + delta_1 = cfg['delta'] + dedup_1 = cfg['dedup_1'] + frontier_output_1 = cfg['frontier_output_1'] + pull_rep_1 = cfg['pull_rep_1'] + LB_1 = cfg['LB_1'] + + #threshold = self.args.hybrid_threshold + threshold = cfg['threshold'] + + new_schedule += "SimpleGPUSchedule s2;\n"; + new_schedule += "s2.configLoadBalance(" + LB_1 + ");\n" + new_schedule += "s2.configFrontierCreation(" + frontier_output_1 + ");\n" + if direction_1 == "PULL": + new_schedule += "s2.configDirection(PULL, " + pull_rep_1 + ");\n" + else: + new_schedule += "s2.configDirection(PUSH);\n" + if self.args.tune_delta: + new_schedule += "s2.configDelta(" + str(delta_1) + ");\n" + new_schedule += "s2.configDeduplication(" + dedup_1 + ");\n" + + new_schedule += "HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, " + str(threshold/1000) + ", s1, s2);\n" + new_schedule += "program->applyGPUSchedule(\"s0:s1\", h1);\n" + + else: + new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n" + + + + if self.args.kernel_fusion: + kernel_fusion = cfg['kernel_fusion'] + new_schedule += "SimpleGPUSchedule s0;\n" + new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n" + new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n" print (cfg) - print (new_schedule) + #print (new_schedule) self.new_schedule_file_name = 'schedule_0' - print (self.new_schedule_file_name) + #print (self.new_schedule_file_name) f1 = open (self.new_schedule_file_name, 'w') f1.write(new_schedule) f1.close() @@ -174,7 +234,7 @@ def compile_and_run(self, desired_result, input, limit): Compile and run a given configuration then return performance """ - print ("input graph: " + self.args.graph) + # print ("input graph: " + self.args.graph) cfg = desired_result.configuration.data @@ -190,7 +250,7 @@ def compile_and_run(self, desired_result, input, limit): def save_final_config(self, configuration): """called at the end of tuning""" print ('Final Configuration:', configuration.data) - self.manipulator().save_to_file(configuration.data,'final_config.json') + self.manipulator().save_to_file(configuration.data, self.args.final_config) @@ -200,11 +260,21 @@ def save_final_config(self, configuration): parser.add_argument('--start_vertex', type=str, default="0", help="Start vertex if applicable") parser.add_argument('--algo_file', type=str, required=True, help='input algorithm file') + parser.add_argument('--final_config', type=str, help='Final config file', default="final_config.json") parser.add_argument('--default_schedule_file', type=str, required=False, default="", help='default schedule file') parser.add_argument('--runtime_limit', type=float, default=300, help='a limit on the running time of each program') parser.add_argument('--max_delta', type=int, default=800000, help='maximum delta used for priority coarsening') parser.add_argument('--memory_limit', type=int, default=-1,help='set memory limit on unix based systems [does not quite work yet]') parser.add_argument('--killed_process_report_runtime_limit', type=int, default=0, help='reports runtime_limit when a process is killed by the shell. 
0 for disable (default), 1 for enable') + + parser.add_argument('--kernel_fusion', type=bool, default=False, help='Choose if you want to also tune kernel fusion') + parser.add_argument('--hybrid_schedule', type=bool, default=False, help='Choose if you want to also explore hybrid schedules') + parser.add_argument('--edge_only', type=bool, default=False, help='Choose if you want to also enable EDGE_ONLY schedules') + parser.add_argument('--num_vertices', type=int, required=True, help='Supply number of vertices in the graph') + parser.add_argument('--tune_delta', type=bool, default=False, help='Also tune the delta parameter') + parser.add_argument('--hybrid_threshold', type=int, default=1000, help='Threshold value on 1000') + + args = parser.parse_args() # pass the argumetns into the tuner GraphItTuner.main(args) diff --git a/include/graphit/backend/codegen_gpu/assign_function_context.h b/include/graphit/backend/codegen_gpu/assign_function_context.h index ff1264e7..0b6bb309 100644 --- a/include/graphit/backend/codegen_gpu/assign_function_context.h +++ b/include/graphit/backend/codegen_gpu/assign_function_context.h @@ -20,6 +20,7 @@ class AssignFunctionContext : mir::MIRVisitor { void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr); void visit(mir::PullEdgeSetApplyExpr::Ptr); void visit(mir::VertexSetApplyExpr::Ptr); + void visit(mir::VertexSetWhereExpr::Ptr); private: MIRContext *mir_context_; }; diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index b0729849..cc30bca3 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -69,7 +69,7 @@ class CodeGenGPU: public mir::MIRVisitor{ void genPropertyArrayAlloca(mir::VarDecl::Ptr); void genFusedWhileLoop(mir::WhileStmt::Ptr); - void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); + virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_; @@ -142,6 +142,12 @@ class CodeGenGPU: public mir::MIRVisitor{ virtual void visit(mir::EnqueueVertex::Ptr) override; + virtual void visit(mir::VertexSetWhereExpr::Ptr) override; + + + virtual void visit(mir::ListType::Ptr) override; + virtual void visit(mir::ListAllocExpr::Ptr) override; + void genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr); }; @@ -179,7 +185,7 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { } current_while_stmt->used_priority_queues.push_back(var); } - void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr); + virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr) override; virtual void visit(mir::StmtBlock::Ptr) override; virtual void visit(mir::AssignStmt::Ptr) override; virtual void visit(mir::VarDecl::Ptr) override; @@ -187,6 +193,7 @@ class CodeGenGPUFusedKernel: public CodeGenGPU { virtual void visit(mir::PrintStmt::Ptr) override; virtual void visit(mir::HybridGPUStmt::Ptr) override; virtual void visit(mir::VertexSetDedupExpr::Ptr) override; + virtual void visit(mir::VertexSetApplyExpr::Ptr) override; std::string var_name (std::string var) { //return current_kernel_name + "_" + var; diff --git a/include/graphit/frontend/gpu_schedule.h b/include/graphit/frontend/gpu_schedule.h index ab255844..9ed0e6a1 100644 --- a/include/graphit/frontend/gpu_schedule.h +++ b/include/graphit/frontend/gpu_schedule.h @@ -17,6 +17,7 @@ enum gpu_schedule_options { PUSH, PULL, FUSED, + UNFUSED, UNFUSED_BITMAP, UNFUSED_BOOLMAP, ENABLED, 
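For reference, the gpu_schedule_options values above (TWCE, CM, FUSED, UNFUSED_BITMAP, ENABLED, ...) are exactly the tokens that the autotuner's write_cfg_to_schedule assembles into schedule_0. With --hybrid_schedule and --kernel_fusion enabled, the generated schedule file looks roughly like the sketch below; the particular load balancers, frontier representations, and the 0.042 threshold are illustrative values chosen here, not taken from the patch:

    schedule:
    SimpleGPUSchedule s1;
    s1.configLoadBalance(TWCE);
    s1.configFrontierCreation(FUSED);
    s1.configDirection(PUSH);
    s1.configDeduplication(ENABLED);
    SimpleGPUSchedule s2;
    s2.configLoadBalance(CM);
    s2.configFrontierCreation(UNFUSED_BITMAP);
    s2.configDirection(PULL, BITMAP);
    s2.configDeduplication(DISABLED);
    HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, 0.042, s1, s2);
    program->applyGPUSchedule("s0:s1", h1);
    SimpleGPUSchedule s0;
    s0.configKernelFusion(ENABLED);
    program->applyGPUSchedule("s0", s0);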
@@ -32,7 +33,7 @@ enum gpu_schedule_options { BITMAP, BOOLMAP, BLOCKED, - UNBLOCKED + UNBLOCKED, }; class GPUSchedule { @@ -64,6 +65,10 @@ class SimpleGPUSchedule: public GPUSchedule { DEDUP_DISABLED, DEDUP_ENABLED }; + enum class deduplication_strategy_type { + DEDUP_FUSED, + DEDUP_UNFUSED + }; enum class load_balancing_type { VERTEX_BASED, @@ -85,16 +90,23 @@ class SimpleGPUSchedule: public GPUSchedule { FUSION_ENABLED }; + enum class boolean_type_type { + BOOLMAP, + BITMAP + }; + private: public: direction_type direction; pull_frontier_rep_type pull_frontier_rep; frontier_creation_type frontier_creation; deduplication_type deduplication; + deduplication_strategy_type deduplication_strategy; load_balancing_type load_balancing; edge_blocking_type edge_blocking; uint32_t edge_blocking_size; kernel_fusion_type kernel_fusion; + boolean_type_type boolean_type; int32_t delta; @@ -108,6 +120,7 @@ class SimpleGPUSchedule: public GPUSchedule { edge_blocking_size = 0; kernel_fusion = kernel_fusion_type::FUSION_DISABLED; delta = 1; + boolean_type = boolean_type_type::BOOLMAP; } public: @@ -153,10 +166,21 @@ class SimpleGPUSchedule: public GPUSchedule { } } - void configDeduplication(enum gpu_schedule_options o) { + void configDeduplication(enum gpu_schedule_options o, enum gpu_schedule_options l = UNFUSED) { switch(o) { case ENABLED: deduplication = deduplication_type::DEDUP_ENABLED; + switch (l) { + case FUSED: + deduplication_strategy = deduplication_strategy_type::DEDUP_FUSED; + break; + case UNFUSED: + deduplication_strategy = deduplication_strategy_type::DEDUP_UNFUSED; + break; + default: + assert(false && "Invalid deduplication strategy\n"); + break; + } break; case DISABLED: deduplication = deduplication_type::DEDUP_DISABLED; @@ -233,6 +257,19 @@ class SimpleGPUSchedule: public GPUSchedule { } delta *= -1; } + void configBooleanType(enum gpu_schedule_options o) { + switch(o) { + case BOOLMAP: + boolean_type = boolean_type_type::BOOLMAP; + break; + case BITMAP: + boolean_type = boolean_type_type::BITMAP; + break; + default: + assert(false && "Invalid option for configBooleanType"); + break; + } + } }; diff --git a/include/graphit/midend/apply_expr_lower.h b/include/graphit/midend/apply_expr_lower.h index 56a77dc7..eb04fd11 100644 --- a/include/graphit/midend/apply_expr_lower.h +++ b/include/graphit/midend/apply_expr_lower.h @@ -40,6 +40,7 @@ namespace graphit { virtual void visit(mir::StmtBlock::Ptr stmt_block); virtual void visit(mir::VarDecl::Ptr var_decl); virtual void visit(mir::AssignStmt::Ptr assign_stmt); + virtual void visit(mir::ExprStmt::Ptr assign_stmt); Schedule * schedule_; MIRContext* mir_context_; diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 86b99f0c..13b886bf 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -225,6 +225,11 @@ namespace graphit { visitor->visit(self()); } + enum class BoolType { + BYTE, BIT + }; + BoolType bool_type; + std::string toString(){ std::string output_str = ""; if (type == mir::ScalarType::Type::FLOAT){ @@ -919,6 +924,8 @@ namespace graphit { MergeReduceField::Ptr merge_reduce; bool frontier_reusable = false; + bool fused_dedup = false; + bool fused_dedup_perfect = false; typedef std::shared_ptr Ptr; @@ -954,6 +961,8 @@ namespace graphit { applied_schedule = edgeset_apply->applied_schedule; frontier_reusable = edgeset_apply->frontier_reusable; requires_output = edgeset_apply->requires_output; + fused_dedup = edgeset_apply->fused_dedup; + fused_dedup_perfect = 
edgeset_apply->fused_dedup_perfect; } virtual void accept(MIRVisitor *visitor) { @@ -983,6 +992,8 @@ namespace graphit { applied_schedule = edgeset_apply->applied_schedule; frontier_reusable = edgeset_apply->frontier_reusable; requires_output = edgeset_apply->requires_output; + fused_dedup = edgeset_apply->fused_dedup; + fused_dedup_perfect = edgeset_apply->fused_dedup_perfect; } virtual void accept(MIRVisitor *visitor) { @@ -1543,6 +1554,7 @@ namespace graphit { // GPU Specific operators struct VertexSetDedupExpr: Expr { Expr::Ptr target; + bool perfect_dedup; typedef std::shared_ptr Ptr; virtual void accept(MIRVisitor *visitor) { visitor->visit(self()); @@ -1572,6 +1584,8 @@ namespace graphit { struct EnqueueVertex: Stmt { Expr::Ptr vertex_id; Expr::Ptr vertex_frontier; + bool fused_dedup; + bool fused_dedup_perfect; enum class Type {SPARSE, BOOLMAP, BITMAP}; Type type; typedef std::shared_ptr Ptr; diff --git a/include/graphit/midend/mir_context.h b/include/graphit/midend/mir_context.h index dffa38d0..acde1bcb 100644 --- a/include/graphit/midend/mir_context.h +++ b/include/graphit/midend/mir_context.h @@ -467,6 +467,7 @@ namespace graphit { // Used by blocking optimization std::unordered_map graphs_with_blocking; + std::unordered_map graphs_with_transpose; }; } diff --git a/src/backend/codegen_gpu/assign_function_context.cpp b/src/backend/codegen_gpu/assign_function_context.cpp index f1fc05ce..404a677f 100644 --- a/src/backend/codegen_gpu/assign_function_context.cpp +++ b/src/backend/codegen_gpu/assign_function_context.cpp @@ -38,4 +38,8 @@ void AssignFunctionContext::visit(mir::VertexSetApplyExpr::Ptr vsae) { if (mir_context_->isFunction(vsae->input_function_name)) mir_context_->getFunction(vsae->input_function_name)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; } +void AssignFunctionContext::visit(mir::VertexSetWhereExpr::Ptr vswe) { + if (mir_context_->isFunction(vswe->input_func)) + mir_context_->getFunction(vswe->input_func)->function_context = mir::FuncDecl::function_context_type::CONTEXT_DEVICE; +} } diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index f2f2672b..d9cf8bc5 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -7,6 +7,7 @@ #include "graphit/backend/codegen_gpu/extract_read_write_set.h" #include #include +#include namespace graphit { int CodeGenGPU::genGPU() { @@ -296,41 +297,90 @@ void CodeGenGPUKernelEmitter::visit(mir::PushEdgeSetApplyExpr::Ptr apply_expr) { void CodeGenGPUKernelEmitter::visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr apply_expr) { - // First we generate the function that is passed to the load balancing function - std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); - oss << "template " << std::endl; - oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; - indent(); - printIndent(); - oss << "// Body of the actual operator code" << std::endl; - if (apply_expr->to_func != "") { - printIndent(); - oss << "if (!" 
<< apply_expr->to_func << "(dst))" << std::endl; + if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; indent(); printIndent(); - oss << "return;" << std::endl; + oss << "// Body of the actual operator code" << std::endl; + if (apply_expr->to_func != "") { + printIndent(); + oss << "if (!" << apply_expr->to_func << "(dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); + // Enqueueing is disabled from here. We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function_name << "(src, dst, weight"; + } else { + printIndent(); + oss << apply_expr->input_function_name << "(src, dst"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; dedent(); - } - mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); - // Enqueueing is disabled from here. We are now enqueing from the UDF - if (apply_expr->is_weighted) { printIndent(); - oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; + } else if (apply_expr->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + // First we generate the function that is passed to the load balancing function + std::string load_balancing_arg = "gpu_operator_body_" + mir_context_->getUniqueNameCounterString(); + + oss << "template " << std::endl; + oss << "void __device__ " << load_balancing_arg << "(gpu_runtime::GraphT graph, int32_t src, int32_t dst, int32_t edge_id, gpu_runtime::VertexFrontier input_frontier, gpu_runtime::VertexFrontier output_frontier) {" << std::endl; + indent(); printIndent(); - oss << apply_expr->input_function_name << "(src, dst, weight"; - } else { + oss << "// Body of the actual operator" << std::endl; + // Before we generate the call to the UDF, we have to check if the dst is on the input frontier + if (apply_expr->from_func != "") { + if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { + printIndent(); + oss << "if (!input_frontier.d_byte_map_input[dst])" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } else if (apply_expr->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BITMAP) { + printIndent(); + oss << "if (!gpu_runtime::checkBit(input_frontier.d_bit_map_input, dst))" << std::endl; + indent(); + printIndent(); + oss << "return;" << std::endl; + dedent(); + } + } + + mir::FuncDecl::Ptr input_function = mir_context_->getFunction(apply_expr->input_function_name); + // Enqueueing is disabled from here. 
We are now enqueing from the UDF + if (apply_expr->is_weighted) { + printIndent(); + oss << "EdgeWeightType weight = graph.d_edge_weight[edge_id];" << std::endl; + printIndent(); + oss << apply_expr->input_function_name << "(dst, src, weight"; + } else { + printIndent(); + oss << apply_expr->input_function_name << "(dst, src"; + } + if (apply_expr->requires_output) + oss << ", output_frontier"; + oss << ");" << std::endl; + dedent(); printIndent(); - oss << apply_expr->input_function_name << "(src, dst"; + oss << "}" << std::endl; + apply_expr->device_function = load_balancing_arg; } - if (apply_expr->requires_output) - oss << ", output_frontier"; - oss << ");" << std::endl; - dedent(); - printIndent(); - oss << "}" << std::endl; - apply_expr->device_function = load_balancing_arg; } void CodeGenGPUKernelEmitter::visit(mir::PullEdgeSetApplyExpr::Ptr apply_expr) { @@ -416,6 +466,15 @@ void CodeGenGPU::genEdgeSets(void) { edge_set_type->accept(this); oss << " " << "__host_" << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; } + + if (mir_context_->graphs_with_transpose.find(edgeset->name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[edgeset->name]) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__transposed" << ";" << std::endl; + edge_set_type->accept(this); + oss << " __host_" << edgeset->name << "__transposed" << ";" << std::endl; + + } } } @@ -438,6 +497,28 @@ void CodeGenGPU::visit(mir::PriorityQueueType::Ptr pqt) { void CodeGenGPU::visit(mir::VertexSetType::Ptr vertexset_type) { oss << "gpu_runtime::VertexFrontier"; } +void CodeGenGPU::visit(mir::ListType::Ptr list_type) { + if (mir::isa(list_type->element_type)) { + oss << "gpu_runtime::VertexFrontierList"; + return; + } + oss << "std::vector<"; + list_type->element_type->accept(this); + oss << ">"; +} +void CodeGenGPU::visit(mir::ListAllocExpr::Ptr alloc_expr) { + if (mir::isa(alloc_expr->element_type)) { + oss << "gpu_runtime::create_new_vertex_frontier_list("; + mir::VertexSetType::Ptr vst = mir::to(alloc_expr->element_type); + mir::Expr::Ptr size_expr = mir_context_->getElementCount(vst->element); + size_expr->accept(this); + oss << ")"; + return; + } + oss << "std::vector<"; + alloc_expr->element_type->accept(this); + oss << ">()"; +} void CodeGenGPU::visit(mir::ScalarType::Ptr scalar_type) { switch(scalar_type->type) { case mir::ScalarType::Type::INT: @@ -540,6 +621,14 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { oss << "cudaMemcpyToSymbol("; oss << var_name << "__blocked_" << blocking_size << ", &__host_" << var_name << "__blocked_" << blocking_size << ", sizeof(__host_" << var_name << "__blocked_" << blocking_size << "), 0, cudaMemcpyHostToDevice);" << std::endl; } + + if (mir_context_->graphs_with_transpose.find(var_name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[var_name]) { + printIndent(); + oss << "__host_" << var_name << "__transposed = gpu_runtime::builtin_transpose(__host_" << var_name << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__transposed" << ", &__host_" << var_name << "__transposed, sizeof(__host_" << var_name << "__transposed), 0, cudaMemcpyHostToDevice);" << std::endl; + } } @@ -681,13 +770,16 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: } // We will assume that the output frontier can reuse the input frontier. 
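To make the PUSH/PULL branches added to the kernel emitter above concrete, the device operator body it prints for a weighted, PULL-direction apply with a BOOLMAP pull frontier looks roughly like the following. This is a sketch only: the operator name, the updateEdge UDF, and the restored template parameters (stripped by formatting in this hunk) are illustrative, and the gpu_runtime headers are assumed to be included:

    template <typename EdgeWeightType>
    void __device__ gpu_operator_body_3(gpu_runtime::GraphT<EdgeWeightType> graph,
            int32_t src, int32_t dst, int32_t edge_id,
            gpu_runtime::VertexFrontier input_frontier,
            gpu_runtime::VertexFrontier output_frontier) {
        // Body of the actual operator
        // In the PULL direction, dst is the vertex whose membership in the
        // input frontier is checked before the user function runs.
        if (!input_frontier.d_byte_map_input[dst])
            return;
        EdgeWeightType weight = graph.d_edge_weight[edge_id];
        // Note the swapped argument order (dst, src) relative to the PUSH path;
        // with enqueueing moved into the UDF, output_frontier is passed through.
        updateEdge(dst, src, weight, output_frontier);
    }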
// Assert that the frontier can be reused + /* if (target != nullptr && esae->frontier_reusable != true) { assert(false && "GPU backend currently doesn't support creating frontiers from the apply expressions. Could not find opportunity for reuse\n"); } + */ printIndent(); oss << "{" << std::endl; indent(); + std::string load_balance_function = "gpu_runtime::vertex_based_load_balance"; if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::TWCE) { load_balance_function = "gpu_runtime::TWCE_load_balance"; @@ -703,14 +795,14 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: load_balance_function = "gpu_runtime::strict_load_balance"; } - if (mir::isa(esae) || mir::isa(esae)) { + if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { if (esae->from_func != "") { printIndent(); oss << "gpu_runtime::vertex_set_prepare_sparse("; oss << esae->from_func; oss << ");" << std::endl; } - } else if (mir::isa(esae)) { + } else if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { if (esae->from_func != "") { if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { printIndent(); @@ -733,10 +825,22 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: } } + + // We will have to create a new frontier in case the frontier cannot be reused + // If the frontier is reusable, we simply assign the old to the new if (target != nullptr) { - printIndent(); - target->accept(this); - oss << " = " << esae->from_func << ";" << std::endl; + if (esae->frontier_reusable) { + printIndent(); + target->accept(this); + oss << " = " << esae->from_func << ";" << std::endl; + } else { + printIndent(); + target->accept(this); + oss << " = "; + oss << "gpu_runtime::create_new_vertex_set(gpu_runtime::builtin_getVertices("; + esae->target->accept(this); + oss << "), 0);" << std::endl; + } } if (mir::isa(esae)) { mir::UpdatePriorityEdgeSetApplyExpr::Ptr upesae = mir::to(esae); @@ -744,6 +848,12 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; } + // Before the load balance if the update requires dedup, then update the counters + if (esae->fused_dedup && target != nullptr) { + printIndent(); + target->accept(this); + oss << ".curr_dedup_counter++;" << std::endl; + } printIndent(); oss << load_balance_function << "_host<"; @@ -768,6 +878,9 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: if (esae->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && esae->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { oss << "__blocked_" << esae->applied_schedule.edge_blocking_size; } + if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + oss << "__transposed"; + } oss << ", "; if (esae->from_func != "") oss << esae->from_func; @@ -839,12 +952,12 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, load_balance_function = 
"gpu_runtime::strict_load_balance"; } - if (mir::isa(esae) || mir::isa(esae)) { + if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) { printIndent(); oss << "gpu_runtime::vertex_set_prepare_sparse_device("; oss << var_name(esae->from_func); oss << ");" << std::endl; - } else if (mir::isa(esae)) { + } else if (mir::isa(esae) || mir::isa(esae) && esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { if (esae->applied_schedule.pull_frontier_rep == fir::gpu_schedule::SimpleGPUSchedule::pull_frontier_rep_type::BOOLMAP) { printIndent(); oss << "gpu_runtime::vertex_set_prepare_boolmap_device("; @@ -888,6 +1001,12 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, //oss << "cudaMemcpyToSymbol(" << upesae->priority_queue_used.getName() << ", &__host_" << upesae->priority_queue_used.getName() << ", sizeof(" << upesae->priority_queue_used.getName() << "), 0);" << std::endl; */ } + // Before the load balance if the update requires dedup, then update the counters + if (esae->fused_dedup && target != nullptr) { + printIndent(); + target->accept(this); + oss << ".curr_dedup_counter++;" << std::endl; + } printIndent(); oss << load_balance_function << "_device<"; @@ -900,7 +1019,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, std::string accessor_type = "gpu_runtime::AccessorSparse"; if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func == "") - accessor_type = "gpu_runtime::AcessorAll"; + accessor_type = "gpu_runtime::AccessorAll"; std::string src_filter = "gpu_runtime::true_function"; if (esae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL && esae->to_func != "") src_filter = esae->to_func; @@ -911,7 +1030,7 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, if (target != nullptr) target->accept(this); else - oss << "gpu_runtime::sentinel_frontier"; + oss << "gpu_runtime::device_sentinel_frontier"; oss << ");" << std::endl; if (target != nullptr) { @@ -973,6 +1092,28 @@ void CodeGenGPU::visit(mir::AssignStmt::Ptr assign_stmt) { oss << ", "; pqae->starting_node->accept(this); oss << ");" << std::endl; + } else if(mir::isa(assign_stmt->expr)) { + mir::VertexSetWhereExpr::Ptr vswe = mir::to(assign_stmt->expr); + if(!mir_context_->isConstVertexSet(vswe->target)) { + assert(false && "GPU backend currently doesn't support vertex where on non-const sets"); + } + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(vswe->target); + assert(associated_element_type != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); + + printIndent(); + assign_stmt->lhs->accept(this); + oss << " = "; + oss << "gpu_runtime::create_new_vertex_set("; + oss << "__host_" << associated_edge_set->name << ".num_vertices, 0);" << std::endl; + printIndent(); + oss << "gpu_runtime::vertex_set_where<"; + oss << vswe->input_func << ">"; + oss << "(__host_" << associated_edge_set->name << ".num_vertices, "; + assign_stmt->lhs->accept(this); + oss << ");" << std::endl; + } else { printIndent(); assign_stmt->lhs->accept(this); @@ -1056,7 +1197,7 @@ void CodeGenGPU::visit(mir::IntLiteral::Ptr expr) { oss << expr->val; } void CodeGenGPU::visit(mir::FloatLiteral::Ptr expr) { - oss 
<< expr->val; + oss << "((float)" << expr->val << ")"; } void CodeGenGPU::visit(mir::StringLiteral::Ptr expr) { oss << "\""; @@ -1153,7 +1294,14 @@ void CodeGenGPU::visit(mir::ReduceStmt::Ptr reduce_stmt) { void CodeGenGPU::visit(mir::EnqueueVertex::Ptr enqueue_vertex) { printIndent(); if (enqueue_vertex->type == mir::EnqueueVertex::Type::SPARSE) { - oss << "gpu_runtime::enqueueVertexSparseQueue("; + oss << "gpu_runtime::enqueueVertexSparseQueue"; + if (enqueue_vertex->fused_dedup) { + oss << "Dedup"; + if (enqueue_vertex->fused_dedup_perfect) { + oss <<"Perfect"; + } + } + oss << "("; enqueue_vertex->vertex_frontier->accept(this); oss << ".d_sparse_queue_output"; } else if (enqueue_vertex->type == mir::EnqueueVertex::Type::BOOLMAP) { @@ -1169,6 +1317,10 @@ void CodeGenGPU::visit(mir::EnqueueVertex::Ptr enqueue_vertex) { enqueue_vertex->vertex_frontier->accept(this); oss << ".d_num_elems_output, "; enqueue_vertex->vertex_id->accept(this); + if (enqueue_vertex->type == mir::EnqueueVertex::Type::SPARSE && enqueue_vertex->fused_dedup == true) { + oss << ", "; + enqueue_vertex->vertex_frontier->accept(this); + } oss << ");" << std::endl; } @@ -1200,7 +1352,15 @@ void CodeGenGPU::visit(mir::VarDecl::Ptr var_decl) { } else oss << ";" << std::endl; - + + if (mir::isa(var_decl->type)) { + if (mir_context_->graphs_with_transpose.find(var_decl->name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[var_decl->name]) { + printIndent(); + var_decl->type->accept(this); + oss << " " << var_decl->name << "__transposed = "; + oss << "gpu_runtime::builtin_transpose(" << var_decl->name << ");" << std::endl; + } + } } void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { @@ -1213,7 +1373,10 @@ void CodeGenGPUFusedKernel::visit(mir::VarDecl::Ptr var_decl) { } } void CodeGenGPU::visit(mir::VertexSetDedupExpr::Ptr vsde) { - oss << "gpu_runtime::dedup_frontier("; + if (vsde->perfect_dedup) + oss << "gpu_runtime::dedup_frontier_perfect("; + else + oss << "gpu_runtime::dedup_frontier("; vsde->target->accept(this); oss << ")"; } @@ -1417,8 +1580,17 @@ void CodeGenGPU::visit(mir::BreakStmt::Ptr break_stmt) { oss << "break;" << std::endl; } void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { - oss << "gpu_runtime::vertex_set_apply_kernel<"; auto mir_var = mir::to (vsae->target); + if (!mir_context_->isConstVertexSet(mir_var->var.getName())) { + // This assumes that the parent of the expression is a ExprStmt + oss << "gpu_runtime::vertex_set_prepare_sparse("; + oss << mir_var->var.getName(); + oss << ");" << std::endl; + printIndent(); + oss << mir_var->var.getName() << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + printIndent(); + } + oss << "gpu_runtime::vertex_set_apply_kernel<"; if (mir_context_->isConstVertexSet(mir_var->var.getName())) { oss << "gpu_runtime::AccessorAll"; } else { @@ -1444,6 +1616,43 @@ void CodeGenGPU::visit(mir::VertexSetApplyExpr::Ptr vsae) { oss << ")"; } } +void CodeGenGPUFusedKernel::visit(mir::VertexSetApplyExpr::Ptr vsae) { + auto mir_var = mir::to (vsae->target); + if (!mir_context_->isConstVertexSet(mir_var->var.getName())) { + // This assumes that the parent of the expression is a ExprStmt + oss << "gpu_runtime::vertex_set_prepare_sparse_device("; + oss << var_name(mir_var->var.getName()); + oss << ");" << std::endl; + printIndent(); + oss << var_name(mir_var->var.getName()) << ".format_ready = gpu_runtime::VertexFrontier::SPARSE;" << std::endl; + printIndent(); + } + oss << "gpu_runtime::vertex_set_apply<"; 
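The fused-kernel variant of this visitor, added just below, emits a device-side sequence for a vertexset apply on a non-const frontier that reads roughly as sketched here (names are illustrative; the exact identifier produced by var_name() is not shown in this hunk, and the gpu_runtime device helpers are assumed):

    gpu_runtime::vertex_set_prepare_sparse_device(frontier);
    frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE;
    gpu_runtime::vertex_set_apply<gpu_runtime::AccessorSparse, reset>(frontier);
    _grid.sync();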
+ if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + oss << "gpu_runtime::AccessorAll"; + } else { + oss << "gpu_runtime::AccessorSparse"; + } + oss << ", "; + oss << vsae->input_function_name << ">"; + if (mir_context_->isConstVertexSet(mir_var->var.getName())) { + auto associated_element_type = mir_context_->getElementTypeFromVectorOrSetName(mir_var->var.getName()); + assert(associated_element_type != nullptr); + auto associated_edge_set = mir_context_->getEdgeSetFromElementType(associated_element_type); + assert(associated_edge_set != nullptr); + oss << "("; + oss << var_name(associated_edge_set->name) << ".getFullFrontier()"; + oss << ")"; + } else { + oss << "("; + oss << var_name(mir_var->var.getName()); + oss << ")"; + } + oss << ";" << std::endl; + printIndent(); + oss << "_grid.sync()"; + +} void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { mir::Expr::Ptr size_expr = mir_context_->getElementCount(vsae->element_type); oss << "gpu_runtime::create_new_vertex_set("; @@ -1533,4 +1742,9 @@ void CodeGenGPUFusedKernel::visit(mir::HybridGPUStmt::Ptr stmt) { assert(false && "Invalid criteria for Hybrid Statement\n"); } } + +void CodeGenGPU::visit(mir::VertexSetWhereExpr::Ptr expr) { + assert(false && "VertexSetWhereExpr should be handled in AssignStmt"); +} + } diff --git a/src/frontend/parser.cpp b/src/frontend/parser.cpp index b95f41d4..f34828e8 100644 --- a/src/frontend/parser.cpp +++ b/src/frontend/parser.cpp @@ -2603,6 +2603,9 @@ namespace graphit { intrinsics_.push_back("append"); intrinsics_.push_back("pop"); intrinsics_.push_back("transpose"); + intrinsics_.push_back("insert"); + intrinsics_.push_back("retrieve"); + // set up function call intrinsics decls.insert("fabs", IdentType::FUNCTION); diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index 0e6de014..e8ac6afa 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -51,7 +51,7 @@ namespace graphit { } void ApplyExprLower::LowerApplyExpr::visit(mir::VarDecl::Ptr var_decl) { if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { - if (mir::isa (var_decl->initVal)) { + if (mir::isa (var_decl->initVal) || mir::isa(var_decl->initVal)) { auto init_val = var_decl->initVal; var_decl->initVal = nullptr; mir::AssignStmt::Ptr assign_stmt = std::make_shared(); @@ -64,7 +64,7 @@ namespace graphit { insert_after_stmt = assign_stmt; node = var_decl; return; - } + } } MIRRewriter::visit(var_decl); var_decl = mir::to(node); @@ -149,17 +149,107 @@ namespace graphit { assign_stmt = mir::to(node); if (mir::isa (assign_stmt->expr)) { mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(assign_stmt->expr); - - if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { - mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); - mir::ExprStmt::Ptr expr_stmt = std::make_shared(); - dedup_expr->target = assign_stmt->lhs; - expr_stmt->expr = dedup_expr; - insert_after_stmt = expr_stmt; + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->enable_deduplication == true && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + if 
(edgeset_apply->applied_schedule.deduplication_strategy == fir::gpu_schedule::SimpleGPUSchedule::deduplication_strategy_type::DEDUP_FUSED) { + edgeset_apply->fused_dedup = true; + edgeset_apply->fused_dedup_perfect = true; + } else { + mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); + mir::ExprStmt::Ptr expr_stmt = std::make_shared(); + dedup_expr->target = assign_stmt->lhs; + expr_stmt->expr = dedup_expr; + insert_after_stmt = expr_stmt; + dedup_expr->perfect_dedup = true; + edgeset_apply->fused_dedup = false; + } + } else if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty() && edgeset_apply->applied_schedule.deduplication == fir::gpu_schedule::SimpleGPUSchedule::deduplication_type::DEDUP_ENABLED && edgeset_apply->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { + if (edgeset_apply->applied_schedule.deduplication_strategy == fir::gpu_schedule::SimpleGPUSchedule::deduplication_strategy_type::DEDUP_FUSED) { + edgeset_apply->fused_dedup = true; + edgeset_apply->fused_dedup_perfect = false; + } else { + mir::VertexSetDedupExpr::Ptr dedup_expr = std::make_shared(); + mir::ExprStmt::Ptr expr_stmt = std::make_shared(); + dedup_expr->target = assign_stmt->lhs; + expr_stmt->expr = dedup_expr; + insert_after_stmt = expr_stmt; + dedup_expr->perfect_dedup = false; + edgeset_apply->fused_dedup = false; + } } } node = assign_stmt; } + void ApplyExprLower::LowerApplyExpr::visit(mir::ExprStmt::Ptr expr_stmt) { + if (expr_stmt->stmt_label != "") { + label_scope_.scope(expr_stmt->stmt_label); + } + if (mir::isa (expr_stmt->expr)) { + mir::EdgeSetApplyExpr::Ptr edgeset_apply = mir::to(expr_stmt->expr); + if (schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) { + auto current_scope_name = label_scope_.getCurrentScope(); + auto apply_schedule_iter = schedule_->apply_gpu_schedules.find(current_scope_name); + if (apply_schedule_iter != schedule_->apply_gpu_schedules.end()) { + auto apply_schedule = apply_schedule_iter->second; + if (dynamic_cast(apply_schedule) != nullptr) { + fir::gpu_schedule::HybridGPUSchedule *hybrid_schedule = dynamic_cast(apply_schedule); + // This EdgeSetApply has a Hybrid Schedule attached to it + // Create the first Stmt block + mir::StmtBlock::Ptr stmt_block_1 = std::make_shared(); + mir::ExprStmt::Ptr stmt1 = std::make_shared(); + stmt1->expr = expr_stmt->expr; + stmt1->stmt_label = "hybrid1"; + stmt_block_1->insertStmtEnd(stmt1); + fir::gpu_schedule::SimpleGPUSchedule * schedule1 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule1 = hybrid_schedule->s1; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid1"] = schedule1; + stmt_block_1 = rewrite(stmt_block_1); + + // Now create the second Stmt block + auto func_decl = mir_context_->getFunction(edgeset_apply->input_function_name); + mir::FuncDecl::Ptr func_decl_v2 = func_decl->clone(); + func_decl_v2->name = func_decl->name + "_v2"; + mir_context_->addFunctionFront(func_decl_v2); + mir::StmtBlock::Ptr stmt_block_2 = std::make_shared(); + mir::ExprStmt::Ptr stmt2 = std::make_shared(); + stmt2->expr = expr_stmt->expr; + mir::to(stmt2->expr)->input_function_name = func_decl_v2->name; + stmt2->stmt_label = "hybrid2"; + stmt_block_2->insertStmtEnd(stmt2); + fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); + *schedule2 = hybrid_schedule->s2; + schedule_->apply_gpu_schedules[current_scope_name + ":hybrid2"] = schedule2; + stmt_block_2 = rewrite(stmt_block_2); + + // 
Finally create a hybrid statement and replace - + mir::HybridGPUStmt::Ptr hybrid_node = std::make_shared(); + hybrid_node->stmt1 = stmt_block_1; + hybrid_node->stmt2 = stmt_block_2; + hybrid_node->threshold = hybrid_schedule->threshold; + hybrid_node->argv_index = hybrid_schedule->argv_index; + hybrid_node->criteria = hybrid_schedule->_hybrid_criteria; + if (hybrid_node->criteria == fir::gpu_schedule::HybridGPUSchedule::hybrid_criteria::INPUT_VERTEXSET_SIZE && edgeset_apply->from_func != "") { + hybrid_node->input_frontier_name = edgeset_apply->from_func; + } else { + assert(false && "Invalid criteria for Hybrid Node\n"); + } + + node = hybrid_node; + mir_context_->hybrid_gpu_stmts.push_back(hybrid_node); + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + return; + + } + } + } + } + if (expr_stmt->stmt_label != "") { + label_scope_.unscope(); + } + MIRRewriter::visit(expr_stmt); + node = expr_stmt; + } void ApplyExprLower::LowerApplyExpr::visit(mir::EdgeSetApplyExpr::Ptr edgeset_apply) { @@ -195,9 +285,10 @@ namespace graphit { } if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PUSH) node = std::make_shared(edgeset_apply); - else if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) + else if (edgeset_apply->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { node = std::make_shared(edgeset_apply); - else + mir_context_->graphs_with_transpose[mir::to(edgeset_apply->target)->var.getName()] = true; + } else assert(false && "Invalid option for direction\n"); if (edgeset_apply->applied_schedule.load_balancing == fir::gpu_schedule::SimpleGPUSchedule::load_balancing_type::EDGE_ONLY && edgeset_apply->applied_schedule.edge_blocking == fir::gpu_schedule::SimpleGPUSchedule::edge_blocking_type::BLOCKED) { diff --git a/src/midend/gpu_change_tracking_lower.cpp b/src/midend/gpu_change_tracking_lower.cpp index 04eeed07..b361939f 100644 --- a/src/midend/gpu_change_tracking_lower.cpp +++ b/src/midend/gpu_change_tracking_lower.cpp @@ -77,6 +77,8 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr frontier_expr->var = frontier_var; enqueue_vertex->vertex_id = tre->index; enqueue_vertex->vertex_frontier = frontier_expr; + enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { @@ -125,6 +127,8 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr frontier_expr->var = frontier_var; enqueue_vertex->vertex_id = tre->index; enqueue_vertex->vertex_frontier = frontier_expr; + enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == 
fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { @@ -151,6 +155,8 @@ void GPUChangeTrackingLower::ReductionOpChangeVisitor::visit(mir::StmtBlock::Ptr frontier_expr->var = frontier_var; enqueue_vertex->vertex_id = tre->index; enqueue_vertex->vertex_frontier = frontier_expr; + enqueue_vertex->fused_dedup = current_edge_set_apply_expr->fused_dedup; + enqueue_vertex->fused_dedup_perfect = current_edge_set_apply_expr->fused_dedup_perfect; if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::FRONTIER_FUSED) { enqueue_vertex->type = mir::EnqueueVertex::Type::SPARSE; } else if (current_edge_set_apply_expr->applied_schedule.frontier_creation == fir::gpu_schedule::SimpleGPUSchedule::frontier_creation_type::UNFUSED_BOOLMAP) { diff --git a/src/midend/gpu_priority_features_lowering.cpp b/src/midend/gpu_priority_features_lowering.cpp index fc182e86..3f5aae90 100644 --- a/src/midend/gpu_priority_features_lowering.cpp +++ b/src/midend/gpu_priority_features_lowering.cpp @@ -46,6 +46,9 @@ void GPUPriorityFeaturesLowering::EdgeSetApplyPriorityRewriter::visit(mir::ExprS if (dynamic_cast(apply_schedule) != nullptr) { upesae->applied_schedule = *dynamic_cast(apply_schedule); mir_context_->delta_ = upesae->applied_schedule.delta; + if (upesae->applied_schedule.direction == fir::gpu_schedule::SimpleGPUSchedule::direction_type::DIR_PULL) { + mir_context_->graphs_with_transpose[mir::to(upesae->target)->var.getName()] = true; + } } else { assert(false && "Scedule applied to edgesetapply must be a Simple Schedule"); } diff --git a/src/midend/while_loop_fusion.cpp b/src/midend/while_loop_fusion.cpp index 1231358c..6304f170 100644 --- a/src/midend/while_loop_fusion.cpp +++ b/src/midend/while_loop_fusion.cpp @@ -19,9 +19,10 @@ void graphit::WhileLoopFusion::visit(mir::WhileStmt::Ptr while_stmt) { auto apply_schedule = apply_schedule_iter->second; if (dynamic_cast(apply_schedule)) { auto applied_simple_schedule = dynamic_cast(apply_schedule); - if (applied_simple_schedule->kernel_fusion == fir::gpu_schedule::SimpleGPUSchedule::kernel_fusion_type::FUSION_ENABLED) - while_stmt->is_fused = true; + if (applied_simple_schedule->kernel_fusion == fir::gpu_schedule::SimpleGPUSchedule::kernel_fusion_type::FUSION_ENABLED) { + while_stmt->is_fused = true; mir_context_->fused_while_loops.push_back(while_stmt); + } } } } diff --git a/src/runtime_lib/gpu_intrinsics.h b/src/runtime_lib/gpu_intrinsics.h index f77d6779..a1a54b24 100644 --- a/src/runtime_lib/gpu_intrinsics.h +++ b/src/runtime_lib/gpu_intrinsics.h @@ -12,6 +12,7 @@ #include "infra_gpu/support.h" #include "infra_gpu/printer.h" #include "infra_gpu/gpu_priority_queue.h" +#include "infra_gpu/list.h" namespace gpu_runtime { diff --git a/src/runtime_lib/infra_gpu/gpu_priority_queue.h b/src/runtime_lib/infra_gpu/gpu_priority_queue.h index 9bc59b8a..e3f251de 100755 --- a/src/runtime_lib/infra_gpu/gpu_priority_queue.h +++ b/src/runtime_lib/infra_gpu/gpu_priority_queue.h @@ -123,7 +123,6 @@ template cudaMemcpy(current_priority_shared, ¤t_priority_, sizeof(int32_t), cudaMemcpyHostToDevice); cudaMemcpy(device_gpq, this, sizeof(*device_gpq), cudaMemcpyHostToDevice); gpu_runtime::cudaCheckLastError(); - update_nodes_identify_min<<>>(device_gpq, frontier_.max_num_elems); gpu_runtime::cudaCheckLastError(); diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index 4c89ea8d..e7582659 100755 --- a/src/runtime_lib/infra_gpu/graph.h +++ 
b/src/runtime_lib/infra_gpu/graph.h @@ -32,6 +32,9 @@ struct GraphT { // Field names are according to CSR, reuse for CSC int32_t *d_edge_dst; // num_edges; EdgeWeightType *d_edge_weight; // num_edges; + GraphT *transposed_graph; + + int32_t h_get_degree(int32_t vertex_id) { return h_src_offsets[vertex_id + 1] - h_src_offsets[vertex_id]; } @@ -151,6 +154,80 @@ static void block_graph_edges(GraphT &input_graph, GraphT +static GraphT builtin_transpose(GraphT &graph) { + if (graph.transposed_graph != nullptr) + return *(graph.transposed_graph); + // For now we will return the same graph + // TODO: copy transpose implementation from infra_ CPU + GraphT output_graph; + output_graph.num_vertices = graph.num_vertices; + output_graph.num_edges = graph.num_edges; + + output_graph.h_src_offsets = new int32_t[graph.num_vertices+2]; + output_graph.h_edge_src = new int32_t[graph.num_edges]; + output_graph.h_edge_dst = new int32_t[graph.num_edges]; + output_graph.h_edge_weight = new EdgeWeightType[graph.num_edges]; + + for (int32_t i = 0; i < graph.num_vertices + 2; i++) + output_graph.h_src_offsets[i] = 0; + + // This will count the degree for each vertex in the transposed graph + for (int32_t i = 0; i < graph.num_edges; i++) { + int32_t dst = graph.h_edge_dst[i]; + output_graph.h_src_offsets[dst+2]++; + } + + // We will now create cummulative sums + for (int32_t i = 0; i < graph.num_vertices; i++) { + output_graph.h_src_offsets[i+2] += output_graph.h_src_offsets[i+1]; + } + + // Finally fill in the edges and the weights for the new graph + for (int32_t i = 0; i < graph.num_edges; i++) { + int32_t dst = graph.h_edge_dst[i]; + int32_t pos = output_graph.h_src_offsets[dst+1]; + output_graph.h_src_offsets[dst+1]++; + output_graph.h_edge_src[pos] = dst; + output_graph.h_edge_dst[pos] = graph.h_edge_src[i]; + output_graph.h_edge_weight[pos] = graph.h_edge_weight[i]; + } + + cudaMalloc(&output_graph.d_edge_src, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&output_graph.d_edge_dst, sizeof(int32_t) * graph.num_edges); + cudaMalloc(&output_graph.d_edge_weight, sizeof(EdgeWeightType) * graph.num_edges); + cudaMalloc(&output_graph.d_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1)); + + + cudaMemcpy(output_graph.d_edge_src, output_graph.h_edge_src, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_dst, output_graph.h_edge_dst, sizeof(int32_t) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_edge_weight, output_graph.h_edge_weight, sizeof(EdgeWeightType) * graph.num_edges, cudaMemcpyHostToDevice); + cudaMemcpy(output_graph.d_src_offsets, output_graph.h_src_offsets, sizeof(int32_t) * (graph.num_vertices + 1), cudaMemcpyHostToDevice); + +/* + cudaMalloc(&output_graph.twc_small_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_mid_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_large_bin, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.twc_bin_sizes, 3 * sizeof(int32_t)); + + cudaMalloc(&output_graph.strict_sum, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); + cudaMalloc(&output_graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); + cudaMalloc(&output_graph.strict_grid_sum, sizeof(int32_t)); +*/ + output_graph.twc_small_bin = graph.twc_small_bin; + output_graph.twc_mid_bin = graph.twc_mid_bin; + output_graph.twc_large_bin = graph.twc_large_bin; + output_graph.strict_sum = graph.strict_sum; + 
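+	// Note: the TWC bins and strict-sum arrays are load-balancing scratch buffers sized
+	// from num_vertices and NUM_CTA, so the transpose shares the original graph's
+	// allocations here instead of the commented-out cudaMalloc block above. This
+	// presumably assumes the graph and its transpose are never traversed concurrently.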
output_graph.strict_cta_sum = graph.strict_cta_sum; + output_graph.strict_grid_sum = output_graph.strict_grid_sum; + + output_graph.transposed_graph = &graph; + graph.transposed_graph = new GraphT(output_graph); + + + return output_graph; +} + template static void load_graph(GraphT &graph, std::string filename, bool to_sort = false) { int flen = strlen(filename.c_str()); @@ -158,8 +235,11 @@ static void load_graph(GraphT &graph, std::string filename, bool char bin_filename[1024]; strcpy(bin_filename, filename.c_str()); - if (string_ends_with(filename.c_str(), bin_extension) == false) + if (string_ends_with(filename.c_str(), bin_extension) == false) { + strcat(bin_filename, "."); + strcat(bin_filename, typeid(EdgeWeightType).name()); strcat(bin_filename, bin_extension); + } FILE *bin_file = fopen(bin_filename, "rb"); if (!bin_file && string_ends_with(filename.c_str(), bin_extension)) { @@ -217,6 +297,7 @@ static void load_graph(GraphT &graph, std::string filename, bool CONSUME(fwrite(graph.h_src_offsets, sizeof(int32_t), graph.num_vertices + 1, bin_file)); fclose(bin_file); } + cudaMalloc(&graph.d_edge_src, sizeof(int32_t) * graph.num_edges); cudaMalloc(&graph.d_edge_dst, sizeof(int32_t) * graph.num_edges); cudaMalloc(&graph.d_edge_weight, sizeof(EdgeWeightType) * graph.num_edges); @@ -237,6 +318,8 @@ static void load_graph(GraphT &graph, std::string filename, bool cudaMalloc(&graph.strict_sum, graph.num_vertices * FRONTIER_MULTIPLIER * sizeof(int32_t)); cudaMalloc(&graph.strict_cta_sum, NUM_CTA * 2 * sizeof(int32_t)); cudaMalloc(&graph.strict_grid_sum, sizeof(int32_t)); + + graph.transposed_graph = nullptr; } diff --git a/src/runtime_lib/infra_gpu/list.h b/src/runtime_lib/infra_gpu/list.h new file mode 100644 index 00000000..43e76441 --- /dev/null +++ b/src/runtime_lib/infra_gpu/list.h @@ -0,0 +1,108 @@ +#ifndef GRAPHIT_GPU_LIST_H +#define GRAPHIT_GPU_LIST_H + +#include +#include +using namespace cooperative_groups; + +namespace gpu_runtime { +/* +template +static void builtin_append(std::vector &vec, T elem) { + vec.push_back(elem); +} + +template +static T builtin_pop(std::vector &vec) { + T ret = vec.back(); + vec.pop_back(); + return ret; +} +*/ + +class VertexFrontierList { +public: + int32_t max_num_elems; + int32_t current_levels; + + int32_t * d_level_indices; + int32_t * d_vertices; +}; + +VertexFrontierList create_new_vertex_frontier_list(int32_t max_elems) { + VertexFrontierList vl; + vl.max_num_elems = max_elems; + vl.current_levels = 0; + + cudaMalloc(&(vl.d_level_indices), sizeof(int32_t) * (max_elems + 1)); + //vl.h_level_indices = new int32_t [max_elems + 1]; + //vl.h_level_indices[0] = 0; + cudaMemset(vl.d_level_indices, 0, sizeof(int32_t)); + cudaMalloc(&(vl.d_vertices), sizeof(int32_t) * max_elems); + return vl; +} + + +void builtin_insert(VertexFrontierList &vl, VertexFrontier &frontier) { + int32_t array[2]; + + cudaMemcpy(array, vl.d_level_indices + vl.current_levels, sizeof(int32_t), cudaMemcpyDeviceToHost); + vertex_set_prepare_sparse(frontier); + frontier.format_ready = VertexFrontier::SPARSE; + //int32_t at = vl.h_level_indices[vl.current_levels]; + int32_t at = array[0]; + int32_t num_elems = builtin_getVertexSetSize(frontier); + cudaMemcpy(vl.d_vertices + at, frontier.d_sparse_queue_input, num_elems * sizeof(int32_t), cudaMemcpyDeviceToDevice); + //vl.h_level_indices[vl.current_levels + 1] = at + num_elems; + array[1] = at + num_elems; + + cudaMemcpy(vl.d_level_indices + vl.current_levels + 1, array + 1, sizeof(int32_t), cudaMemcpyHostToDevice); + 
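+	// Layout recap: d_vertices is one flat array holding all levels back to back, and
+	// d_level_indices[i] is the offset where level i starts (d_level_indices[0] == 0).
+	// The copies above append the frontier's sparse queue at offset
+	// d_level_indices[current_levels] and record the new end offset at
+	// d_level_indices[current_levels + 1]; the increment below then opens the next level.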
vl.current_levels++; +} + +void __device__ device_builtin_insert(VertexFrontierList &vl, VertexFrontier &frontier) { + vertex_set_prepare_sparse_device(frontier); + frontier.format_ready = VertexFrontier::SPARSE; + + int32_t at = vl.d_level_indices[vl.current_levels]; + int32_t num_elems = device_builtin_getVertexSetSize(frontier); + parallel_memcpy((unsigned char*)(vl.d_vertices + at), (unsigned char*)(frontier.d_sparse_queue_input), num_elems * sizeof(int32_t)); + if (threadIdx.x == 0 && blockIdx.x == 0) + vl.d_level_indices[vl.current_levels + 1] = at + num_elems; + vl.current_levels++; + this_grid().sync(); +} + + +void builtin_retrieve(VertexFrontierList &vl, VertexFrontier &frontier) { + if (vl.current_levels == 0) { + assert(false && "Too deep into vertex frontier list"); + } + int32_t array[2]; + + cudaMemcpy(array, vl.d_level_indices + vl.current_levels - 1, sizeof(int32_t)*2, cudaMemcpyDeviceToHost); + //int32_t at = vl.h_level_indices[vl.current_levels - 1]; + //int32_t num_elems = vl.h_level_indices[vl.current_levels] - at; + int32_t at = array[0]; + int32_t num_elems = array[1] - at; + cudaMemcpy(frontier.d_sparse_queue_input, vl.d_vertices + at, num_elems * sizeof(int32_t), cudaMemcpyDeviceToDevice); + cudaMemcpy(frontier.d_num_elems_input, &num_elems, sizeof(int32_t), cudaMemcpyHostToDevice); + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + vl.current_levels--; +} +void __device__ device_builtin_retrieve(VertexFrontierList &vl, VertexFrontier &frontier) { + if (vl.current_levels == 0) + assert(false && "Too deep into vertex frontier list"); + int32_t at = vl.d_level_indices[vl.current_levels -1]; + int32_t num_elems = vl.d_level_indices[vl.current_levels] - at; + parallel_memcpy((unsigned char*)frontier.d_sparse_queue_input, (unsigned char*) (vl.d_vertices + at), num_elems * sizeof(int32_t)); + if (threadIdx.x == 0 && blockIdx.x == 0) + frontier.d_num_elems_input[0] = num_elems; + frontier.format_ready = gpu_runtime::VertexFrontier::SPARSE; + vl.current_levels--; + this_grid().sync(); +} +} + + +#endif diff --git a/src/runtime_lib/infra_gpu/load_balance.h b/src/runtime_lib/infra_gpu/load_balance.h index 28040ddb..91d3220d 100644 --- a/src/runtime_lib/infra_gpu/load_balance.h +++ b/src/runtime_lib/infra_gpu/load_balance.h @@ -885,8 +885,9 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie int32_t index, src_idx; //int32_t deg; + // if(cta_id == num_cta - 1) return; // can be fused - bool last_tb = (cta_id == (graph.strict_grid_sum[0] + NNZ_PER_BLOCK-1)/NNZ_PER_BLOCK-1); + //bool last_tb = (cta_id == (graph.strict_grid_sum[0] + NNZ_PER_BLOCK-1)/NNZ_PER_BLOCK-1); int32_t start_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*cta_id)-1; int32_t end_row = binary_search_upperbound(&graph.strict_sum[0], tot_size, NNZ_PER_BLOCK*(cta_id+1))-1; @@ -917,9 +918,12 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie //int32_t lane = (threadIdx.x&31); int32_t offset = 0; - int32_t tot_deg; - if(!last_tb) tot_deg = NNZ_PER_BLOCK; - else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + + int32_t tot_deg = graph.strict_grid_sum[0] - cta_id * NNZ_PER_BLOCK; + if(tot_deg > NNZ_PER_BLOCK) tot_deg = NNZ_PER_BLOCK; + //int32_t tot_deg; + //if(!last_tb) tot_deg = NNZ_PER_BLOCK; + //else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; //int32_t phase = threadIdx.x; //int32_t off=32; @@ -932,13 +936,15 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie if (src_filter(src_idx) == 
false) continue; int32_t ei = sm_loc[offset + id] + i - sm_deg[offset + id]; + if(ei >= graph.num_edges) break; int32_t dst_idx = graph.d_edge_dst[ei]; load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); } } else { - int32_t tot_deg; - if(!last_tb) tot_deg = NNZ_PER_BLOCK; - else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; + int32_t tot_deg = graph.strict_grid_sum[0] - cta_id * NNZ_PER_BLOCK; + if(tot_deg > NNZ_PER_BLOCK) tot_deg = NNZ_PER_BLOCK; + //if(!last_tb) tot_deg = NNZ_PER_BLOCK; + //else tot_deg = (graph.strict_grid_sum[0] - 1) % NNZ_PER_BLOCK + 1; int32_t width = row_size; //int32_t offset = 0; @@ -950,6 +956,7 @@ void __device__ strict_load_balance(GraphT &graph, VertexFrontie if (src_filter(src_idx) == false) continue; int32_t ei = graph.d_src_offsets[src_idx] + i - graph.strict_sum[start_row + id]; + if(ei >= graph.num_edges) break; int32_t dst_idx = graph.d_edge_dst[ei]; load_balance_payload(graph, src_idx, dst_idx, ei, input_frontier, output_frontier); } diff --git a/src/runtime_lib/infra_gpu/support.h b/src/runtime_lib/infra_gpu/support.h index 4358a9eb..bc6fe394 100755 --- a/src/runtime_lib/infra_gpu/support.h +++ b/src/runtime_lib/infra_gpu/support.h @@ -31,6 +31,15 @@ static bool __device__ writeMin(T *dst, T src) { bool ret = (old_value > src); return ret; } +template +static bool __device__ writeMax(T *dst, T src) { + if (*dst >= src) + return false; + T old_value = atomicMax(dst, src); + bool ret = (old_value < src); + return ret; +} + template static bool __device__ writeAdd(T *dst, T src) { @@ -47,6 +56,10 @@ static void __device__ parallel_memset(unsigned char* dst, unsigned char val, si for (size_t index = threadIdx.x + blockDim.x * blockIdx.x; index < total_bytes; index += blockDim.x * gridDim.x) dst[index] = val; } +static void __device__ parallel_memcpy(unsigned char* dst, unsigned char* src, size_t total_bytes) { + for (size_t index = threadIdx.x + blockDim.x * blockIdx.x; index < total_bytes; index += blockDim.x * gridDim.x) + dst[index] = src[index]; +} } #endif diff --git a/src/runtime_lib/infra_gpu/vertex_frontier.h b/src/runtime_lib/infra_gpu/vertex_frontier.h index d8be84ec..c5f2d53d 100644 --- a/src/runtime_lib/infra_gpu/vertex_frontier.h +++ b/src/runtime_lib/infra_gpu/vertex_frontier.h @@ -60,6 +60,7 @@ void delete_vertex_frontier(VertexFrontier &frontier) { return; } static VertexFrontier sentinel_frontier; +static __device__ VertexFrontier device_sentinel_frontier; static int32_t builtin_getVertexSetSize(VertexFrontier &frontier) { int32_t curr_size = 0; @@ -156,6 +157,21 @@ static void __device__ enqueueVertexSparseQueue(int32_t *sparse_queue, int32_t * int32_t pos = atomicAggInc(sparse_queue_size); sparse_queue[pos] = vertex_id; } +static void __device__ enqueueVertexSparseQueueDedup(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id, VertexFrontier &frontier) { + int32_t vid = vertex_id; + if (frontier.d_dedup_counters[vid] < frontier.curr_dedup_counter) { + int32_t pos = atomicAggInc(sparse_queue_size); + sparse_queue[pos] = vertex_id; + frontier.d_dedup_counters[vid] = frontier.curr_dedup_counter; + } +} +static void __device__ enqueueVertexSparseQueueDedupPerfect(int32_t *sparse_queue, int32_t *sparse_queue_size, int32_t vertex_id, VertexFrontier &frontier) { + int32_t vid = vertex_id; + if (writeMax(&frontier.d_dedup_counters[vid], frontier.curr_dedup_counter)) { + int32_t pos = atomicAggInc(sparse_queue_size); + sparse_queue[pos] = vertex_id; + } +} static void __device__ 
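// On the two dedup variants defined above: enqueueVertexSparseQueueDedup checks
// d_dedup_counters[vid] with a plain read, so two racing threads can both pass the
// check and enqueue the same vertex (duplicates are possible but benign), whereas
// enqueueVertexSparseQueueDedupPerfect first claims the counter via writeMax
// (atomicMax), so only one thread wins and each vertex is enqueued at most once for a
// given curr_dedup_counter value.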
enqueueVertexBytemap(unsigned char* byte_map, int32_t *byte_map_size, int32_t vertex_id) { // We are not using atomic operation here because races are benign here if (byte_map[vertex_id] == 1) @@ -331,6 +347,23 @@ static void dedup_frontier(VertexFrontier &frontier) { dedup_frontier_kernel<<>>(frontier); swap_queues(frontier); } + +static void __device__ dedup_frontier_device_perfect(VertexFrontier &frontier) { + for(int32_t vidx = threadIdx.x + blockDim.x * blockIdx.x; vidx < frontier.d_num_elems_input[0]; vidx += blockDim.x * gridDim.x) { + int32_t vid = frontier.d_sparse_queue_input[vidx]; + if (writeMax(&frontier.d_dedup_counters[vid], frontier.curr_dedup_counter)) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, vid); + } + } +} +static void __global__ dedup_frontier_kernel_perfect(VertexFrontier frontier) { + dedup_frontier_device_perfect(frontier); +} +static void dedup_frontier_perfect(VertexFrontier &frontier) { + frontier.curr_dedup_counter++; + dedup_frontier_kernel_perfect<<>>(frontier); + swap_queues(frontier); +} bool __device__ true_function(int32_t _) { return true; } @@ -361,6 +394,23 @@ static void __device__ vertex_set_create_reverse_sparse_queue_device(VertexFront static void foo_bar(void) { } +template +static void __global__ vertex_set_where_kernel(int32_t num_vertices, VertexFrontier frontier) { + + for (int32_t node_id = blockDim.x * blockIdx.x + threadIdx.x; node_id < num_vertices; node_id += blockDim.x * gridDim.x) { + if (where_func(node_id)) { + enqueueVertexSparseQueue(frontier.d_sparse_queue_output, frontier.d_num_elems_output, node_id); + } + } + +} + +template +static void __host__ vertex_set_where(int32_t num_vertices, VertexFrontier &frontier) { + vertex_set_where_kernel<<>>(num_vertices, frontier); + swap_queues(frontier); +} + } #endif diff --git a/test/verifiers/bc_verifier.cpp b/test/verifiers/bc_verifier.cpp index eb69d11f..292440b4 100644 --- a/test/verifiers/bc_verifier.cpp +++ b/test/verifiers/bc_verifier.cpp @@ -68,7 +68,7 @@ bool BCVerifier(const Graph &g, NodeID source, NodeID num_iters, // Compare scores bool all_ok = true; for (NodeID n : g.vertices()) { - if (abs(scores[n] - scores_to_test[n]) > 0.000001) { + if (abs(scores[n] - scores_to_test[n]) > 0.001) { cout << n << ": " << scores[n] << " != " << scores_to_test[n] << endl; all_ok = false; } From 8a2e1e096cc35be5ec896284a5ed9fb18f987ee1 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Tue, 19 May 2020 03:57:56 -0400 Subject: [PATCH 80/88] Added implementation and test cases for MIRMetadata --- include/graphit/midend/mir.h | 29 +++++++++++ include/graphit/midend/mir_metadata.h | 46 +++++++++++++++++ test/c++/midend_test.cpp | 74 ++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 include/graphit/midend/mir_metadata.h diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index 13b886bf..b8533091 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,8 @@ namespace graphit { return to(cloneNode()); } + // We use a single map to hold all metadata on the MIR Node + std::unordered_map> metadata_map; protected: template std::shared_ptr self() { @@ -68,6 +71,32 @@ namespace graphit { // as I slowly add in support for copy functionalities return nullptr; }; + public: + // Functions to set and retrieve metadata of different types + template + void setMetadata(std::string 
mdname, T val) { + typename MIRMetadataImpl::Ptr mdnode = std::make_shared>(val); + metadata_map[mdname] = mdnode; + } + // This function is safe to be called even if the metadata with + // the specified name doesn't exist + template + bool hasMetadata(std::string mdname) { + if (metadata_map.find(mdname) == metadata_map.end()) + return false; + typename MIRMetadata::Ptr mdnode = metadata_map[mdname]; + if (!mdnode->isa()) + return false; + return true; + } + // This function should be called only after confirming that the + // metadata with the given name exists + template + T getMetadata(std::string mdname) { + assert(hasMetadata(mdname)); + typename MIRMetadata::Ptr mdnode = metadata_map[mdname]; + return mdnode->to()->val; + } }; struct Expr : public MIRNode { diff --git a/include/graphit/midend/mir_metadata.h b/include/graphit/midend/mir_metadata.h new file mode 100644 index 00000000..77f51ae2 --- /dev/null +++ b/include/graphit/midend/mir_metadata.h @@ -0,0 +1,46 @@ +#ifndef MIR_METADATA_H +#define MIR_METADATA_H + +#include +#include +namespace graphit { +namespace mir { + +template +class MIRMetadataImpl; + +// The abstract class for the mir metadata +// Different templated metadata types inherit from this type +class MIRMetadata: public std::enable_shared_from_this { +public: + typedef std::shared_ptr Ptr; + virtual ~MIRMetadata() = default; + + + template + bool isa (void) { + if(std::dynamic_pointer_cast>(shared_from_this())) + return true; + return false; + } + template + std::shared_ptr> to(void) { + std::shared_ptr> ret = std::dynamic_pointer_cast>(shared_from_this()); + assert(ret != nullptr); + return ret; + } +}; + +// Templated metadata class for each type +template +class MIRMetadataImpl: public MIRMetadata { +public: + typedef std::shared_ptr> Ptr; + T val; + MIRMetadataImpl(T _val): val(_val) { + } +}; + +} +} +#endif diff --git a/test/c++/midend_test.cpp b/test/c++/midend_test.cpp index 5f1fdb99..89e4105b 100644 --- a/test/c++/midend_test.cpp +++ b/test/c++/midend_test.cpp @@ -110,4 +110,76 @@ TEST_F(MidendTest, SimpleVertexSetDeclAllocWithMain) { "const vertices : vertexset{Vertex} = new vertexset{Vertex}(5);\n" "func main() print 4; end"); EXPECT_EQ (0, basicTest(is)); -} \ No newline at end of file +} + +// Test cases for the MIRMetadata API +TEST_F(MidendTest, SimpleMetadataTest) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + + main_func->setMetadata("basic_boolean_md", true); + main_func->setMetadata("basic_int_md", 42); + EXPECT_EQ(true, main_func->hasMetadata("basic_boolean_md")); + EXPECT_EQ(true, main_func->getMetadata("basic_boolean_md")); + + EXPECT_EQ(true, main_func->hasMetadata("basic_int_md")); + EXPECT_EQ(42, main_func->getMetadata("basic_int_md")); + +} +TEST_F(MidendTest, SimpleMetadataTestNoExist) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + + main_func->setMetadata("basic_int_md", 42); + EXPECT_EQ(false, main_func->hasMetadata("other_int_md")); + EXPECT_EQ(false, main_func->hasMetadata("basic_int_md")); +} + +TEST_F(MidendTest, SimpleMetadataTestString) { + istringstream is("func main() print 4; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + + mir::FuncDecl::Ptr main_func = 
mir_context_->getFunction("main"); + + main_func->setMetadata("basic_str_md", "md value"); + EXPECT_EQ(true, main_func->hasMetadata("basic_str_md")); + EXPECT_EQ("md value", main_func->getMetadata("basic_str_md")); +} + +TEST_F(MidendTest, SimpleMetadataTestMIRNodeAsMD) { + istringstream is("const val:int = 42;\nfunc main() print val; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + EXPECT_EQ(1, mir_context_->getConstants().size()); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + mir::VarDecl::Ptr decl = mir_context_->getConstants()[0]; + + main_func->setMetadata("used_var_md", decl); + + EXPECT_EQ(true, main_func->hasMetadata("used_var_md")); + mir::MIRNode::Ptr mdnode = main_func->getMetadata("used_var_md"); + EXPECT_EQ(true, mir::isa(mdnode)); +} + +TEST_F(MidendTest, SimpleMetadataTestMIRNodeVectorAsMD) { + istringstream is("const val:int = 42;\nconst val2: int = 55;\nfunc main() print val + val2; end"); + EXPECT_EQ(0, basicTest(is)); + EXPECT_EQ(true, mir_context_->isFunction("main")); + EXPECT_EQ(2, mir_context_->getConstants().size()); + + mir::FuncDecl::Ptr main_func = mir_context_->getFunction("main"); + std::vector decls = mir_context_->getConstants(); + + main_func->setMetadata>("used_vars_md", decls); + + EXPECT_EQ(true, main_func->hasMetadata>("used_vars_md")); + EXPECT_EQ(2, main_func->getMetadata>("used_vars_md").size()); +} From 63f31f357eeb5db09d7f86f56ab0e8c6a53e90b0 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 28 May 2020 15:52:29 -0400 Subject: [PATCH 81/88] cloning function for mir metadata --- include/graphit/midend/mir.h | 9 +++++++++ include/graphit/midend/mir_metadata.h | 11 +++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/graphit/midend/mir.h b/include/graphit/midend/mir.h index b8533091..38aff603 100644 --- a/include/graphit/midend/mir.h +++ b/include/graphit/midend/mir.h @@ -97,6 +97,15 @@ namespace graphit { typename MIRMetadata::Ptr mdnode = metadata_map[mdname]; return mdnode->to()->val; } + std::unordered_map> cloneMetadata(void) { + std::unordered_map> new_map; + for (auto iter = metadata_map.begin(); iter != metadata_map.end(); iter++) { + auto key = iter->first; + new_map[key] = metadata_map[key]->clone(); + } + return new_map; + } + }; struct Expr : public MIRNode { diff --git a/include/graphit/midend/mir_metadata.h b/include/graphit/midend/mir_metadata.h index 77f51ae2..fed5eed7 100644 --- a/include/graphit/midend/mir_metadata.h +++ b/include/graphit/midend/mir_metadata.h @@ -24,11 +24,14 @@ class MIRMetadata: public std::enable_shared_from_this { return false; } template - std::shared_ptr> to(void) { - std::shared_ptr> ret = std::dynamic_pointer_cast>(shared_from_this()); + typename MIRMetadataImpl::Ptr to(void) { + typename MIRMetadataImpl::Ptr ret = std::dynamic_pointer_cast>(shared_from_this()); assert(ret != nullptr); return ret; } + virtual MIRMetadata::Ptr clone(void) { + return shared_from_this(); + } }; // Templated metadata class for each type @@ -39,6 +42,10 @@ class MIRMetadataImpl: public MIRMetadata { T val; MIRMetadataImpl(T _val): val(_val) { } + MIRMetadata::Ptr clone(void) { + Ptr new_md = std::make_shared>(*this); + return new_md; + } }; } From c3be3071e57ac75c38e07df0c32fae002d878f3d Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 24 Jun 2020 15:48:08 -0400 Subject: [PATCH 82/88] Added perf tests for GPU --- graphit_eval/eval/gpu_perf/inputs/bc_power.gt | 126 ++++++++ 
graphit_eval/eval/gpu_perf/inputs/bc_road.gt | 126 ++++++++ .../eval/gpu_perf/inputs/bfs_power.gt | 63 ++++ graphit_eval/eval/gpu_perf/inputs/bfs_road.gt | 56 ++++ graphit_eval/eval/gpu_perf/inputs/cc_power.gt | 64 ++++ .../eval/gpu_perf/inputs/pr_social.gt | 57 ++++ .../eval/gpu_perf/inputs/sssp_power.gt | 46 +++ .../eval/gpu_perf/inputs/sssp_road.gt | 50 ++++ graphit_eval/eval/gpu_perf/run_tests.py | 273 ++++++++++++++++++ 9 files changed, 861 insertions(+) create mode 100644 graphit_eval/eval/gpu_perf/inputs/bc_power.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/bc_road.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/bfs_power.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/bfs_road.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/cc_power.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/pr_social.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/sssp_power.gt create mode 100644 graphit_eval/eval/gpu_perf/inputs/sssp_road.gt create mode 100644 graphit_eval/eval/gpu_perf/run_tests.py diff --git a/graphit_eval/eval/gpu_perf/inputs/bc_power.gt b/graphit_eval/eval/gpu_perf/inputs/bc_power.gt new file mode 100644 index 00000000..0209d8cb --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bc_power.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(int) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:1 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + while (round > 0) + #s2# 
transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2; + s2.configLoadBalance(TWCE); + s2.configDirection(PULL, BITMAP); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + + program->applyGPUSchedule("s1", h1); + program->applyGPUSchedule("s2", h1); + diff --git a/graphit_eval/eval/gpu_perf/inputs/bc_road.gt b/graphit_eval/eval/gpu_perf/inputs/bc_road.gt new file mode 100644 index 00000000..b591e9d4 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bc_road.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(int) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:1 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + #s0# while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + #s2# while (round > 0) + #s3# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print "elapsed time: 
"; + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDeduplication(ENABLED, FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + program->applyGPUSchedule("s2:s3", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + + program->applyGPUSchedule("s0", s0); + program->applyGPUSchedule("s2", s0); diff --git a/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt b/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt new file mode 100644 index 00000000..ee63e3a1 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bfs_power.gt @@ -0,0 +1,63 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + //s1.configDeduplication(ENABLED); + //s1.configFrontierCreation(UNFUSED_BITMAP); + + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt b/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt new file mode 100644 index 00000000..b44597e2 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/bfs_road.gt @@ -0,0 +1,56 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + +% specify schedules here or use a 
separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/eval/gpu_perf/inputs/cc_power.gt b/graphit_eval/eval/gpu_perf/inputs/cc_power.gt new file mode 100644 index 00000000..3c7cf885 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/cc_power.gt @@ -0,0 +1,64 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + #s0# while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print "elapsed time: "; + print elapsed_time; + end +end + + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWC); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + program->applyGPUSchedule("s1", s1); + + diff --git a/graphit_eval/eval/gpu_perf/inputs/pr_social.gt b/graphit_eval/eval/gpu_perf/inputs/pr_social.gt new file mode 100644 index 00000000..18aea9cc --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/pr_social.gt @@ -0,0 +1,57 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(double) = 1.0/vertices.size(); +const new_rank : vector{Vertex}(double) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(double) = 0.0; +const error : vector{Vertex}(double) = 0.0; +const damp : double = 0.85; +const beta_score : double = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : double = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + for i in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + + var elapsed_time : double = stopTimer(); + print "elapsed time: "; + print 
elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt b/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt new file mode 100644 index 00000000..de01ddca --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/sssp_power.gt @@ -0,0 +1,46 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); diff --git a/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt b/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt new file mode 100644 index 00000000..ce00052d --- /dev/null +++ b/graphit_eval/eval/gpu_perf/inputs/sssp_road.gt @@ -0,0 +1,50 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print "elapsed time: "; + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/eval/gpu_perf/run_tests.py b/graphit_eval/eval/gpu_perf/run_tests.py new file mode 100644 index 00000000..42f644e8 --- /dev/null +++ b/graphit_eval/eval/gpu_perf/run_tests.py @@ -0,0 +1,273 @@ +import os +import sys +import 
subprocess +import shutil + +NVCC_PATH="/usr/local/cuda/bin/nvcc" + +GRAPHIT_SRC_DIR="" +GRAPHIT_BUILD_DIR="" +GRAPH_DIR="" + +WORKING_DIR=os.path.abspath("./scratch").rstrip("/") + +OUTPUT_DIR=os.path.abspath("./output").rstrip("/") +INPUTS_DIR=os.path.abspath("./inputs").rstrip("/") + + +GPU_CC="" +NUM_SM="" + +def get_command_output_class(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE) + else: + print(command) + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + exitcode = proc.wait() + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + + proc.stdout.close() + return exitcode, output + +def get_command_output(command): + (exitcode, output) = get_command_output_class(command) + if exitcode != 0: + print("Error executing command:", command) + exit(1) + return output + +def get_gpu_prop(): + global GPU_CC + global NUM_SM + global NVCC_PATH + global GRAPHIT_SRC_DIR + global WORKING_DIR + + get_command_output(NVCC_PATH + " " + GRAPHIT_SRC_DIR + "/test/gpu_tests/test_input/obtain_gpu_cc.cu -o " + WORKING_DIR + "/obtain_gpu_cc") + output = get_command_output(WORKING_DIR+"/obtain_gpu_cc").strip().split("\n") + if len(output) != 2: + print("Cannot obtain GPU information") + exit(1) + GPU_CC=output[0] + NUM_SM=output[1] + + +def compile_and_execute(input_file, graph_name, args, output_name): + global GRAPHIT_SRC_DIR + global GRAPHIT_BUILD_DIR + global GRAPH_DIR + global WORKING_DIR + global OUTPUT_DIR + global GPU_CC + global NUM_SM + global NVCC_PATH + + nvcc_command = NVCC_PATH + " -rdc=true --use_fast_math -Xptxas \"-dlcm=ca --maxrregcount=64\" -std=c++11 -DNUM_CTA=" + str(int(NUM_SM)*2)+ " -DCTA_SIZE=512 -gencode arch=compute_" + GPU_CC + ",code=sm_"+GPU_CC + + graphit_compiler_command = "python " + GRAPHIT_BUILD_DIR + "/bin/graphitc.py -o " + WORKING_DIR+"/test_cpp.cu -f" + + + cwd = os.getcwd() + os.chdir(WORKING_DIR) + get_command_output(graphit_compiler_command + " " + input_file) + get_command_output(nvcc_command + " " + WORKING_DIR+"/test_cpp.cu -o " + WORKING_DIR+"/test_executable -I " + GRAPHIT_SRC_DIR+"/src/runtime_lib") + output = get_command_output(WORKING_DIR+"/test_executable " + graph_name + " " + args) + os.chdir(cwd) + + f = open(OUTPUT_DIR+"/"+output_name, "w") + f.write(output) + f.close() + + +def parse_output_file(output_name): + global OUTPUT_DIR + f = open(OUTPUT_DIR+"/"+output_name) + content = f.read().strip().split("\n") + f.close() + min_time = 1000000 + for line in content: + try: + time = float(line) + except ValueError as verr: + time = -1 + if time == -1: + continue + if time < min_time: + min_time = time + return time + +def create_csv(time_values, output_name): + global OUTPUT_DIR + f = open(OUTPUT_DIR+"/"+output_name, "w") + + for graph in time_values.keys(): + f.write (graph+", " + str(time_values[graph]) + "\n") + + f.close() + +def test_pr(): + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-orkut.mtx", "", "pr_OK") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "", "pr_TW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "", "pr_LJ") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "", "pr_SW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "", "pr_IC") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", 
GRAPH_DIR+"/hollywood-2009.weighted.mtx", "", "pr_HW") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/road_central.weighted.mtx", "", "pr_RC") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "", "pr_RU") + compile_and_execute(INPUTS_DIR+"/pr_social.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "", "pr_RN") + + time_values={} + time_values['OK'] = parse_output_file("pr_OK") + time_values['TW'] = parse_output_file("pr_TW") + time_values['LJ'] = parse_output_file("pr_LJ") + time_values['SW'] = parse_output_file("pr_SW") + time_values['IC'] = parse_output_file("pr_IC") + time_values['HW'] = parse_output_file("pr_HW") + time_values['RC'] = parse_output_file("pr_RC") + time_values['RU'] = parse_output_file("pr_RU") + time_values['RN'] = parse_output_file("pr_RN") + + create_csv(time_values, "pr.csv") + +def test_sssp(): + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 22", "sssp_OK") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 15", "sssp_TW") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 120", "sssp_LJ") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 15", "sssp_SW") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 15", "sssp_IC") + compile_and_execute(INPUTS_DIR+"/sssp_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 15", "sssp_HW") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0 80000", "sssp_RC") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0 80000", "sssp_RU") + compile_and_execute(INPUTS_DIR+"/sssp_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0 80000", "sssp_RN") + + time_values={} + time_values['OK'] = parse_output_file("sssp_OK") + time_values['TW'] = parse_output_file("sssp_TW") + time_values['LJ'] = parse_output_file("sssp_LJ") + time_values['SW'] = parse_output_file("sssp_SW") + time_values['IC'] = parse_output_file("sssp_IC") + time_values['HW'] = parse_output_file("sssp_HW") + time_values['RC'] = parse_output_file("sssp_RC") + time_values['RU'] = parse_output_file("sssp_RU") + time_values['RN'] = parse_output_file("sssp_RN") + + create_csv(time_values, "sssp.csv") + +def test_cc(): + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "", "cc_OK") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "", "cc_TW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "", "cc_LJ") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "", "cc_SW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "", "cc_IC") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "", "cc_HW") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/road_central.weighted.mtx", "", "cc_RC") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "", "cc_RU") + compile_and_execute(INPUTS_DIR+"/cc_power.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "", "cc_RN") + + time_values={} + time_values['OK'] = parse_output_file("cc_OK") + time_values['TW'] = parse_output_file("cc_TW") + time_values['LJ'] = parse_output_file("cc_LJ") + time_values['SW'] = parse_output_file("cc_SW") + time_values['IC'] = parse_output_file("cc_IC") + time_values['HW'] = 
parse_output_file("cc_HW") + time_values['RC'] = parse_output_file("cc_RC") + time_values['RU'] = parse_output_file("cc_RU") + time_values['RN'] = parse_output_file("cc_RN") + + create_csv(time_values, "cc.csv") + +def test_bfs(): + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 0.12", "bfs_OK") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 0.03", "bfs_TW") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 0.015", "bfs_LJ") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 0.012", "bfs_SW") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 0.03", "bfs_IC") + compile_and_execute(INPUTS_DIR+"/bfs_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 0.03", "bfs_HW") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0", "bfs_RC") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0", "bfs_RU") + compile_and_execute(INPUTS_DIR+"/bfs_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0", "bfs_RN") + + time_values={} + time_values['OK'] = parse_output_file("bfs_OK") + time_values['TW'] = parse_output_file("bfs_TW") + time_values['LJ'] = parse_output_file("bfs_LJ") + time_values['SW'] = parse_output_file("bfs_SW") + time_values['IC'] = parse_output_file("bfs_IC") + time_values['HW'] = parse_output_file("bfs_HW") + time_values['RC'] = parse_output_file("bfs_RC") + time_values['RU'] = parse_output_file("bfs_RU") + time_values['RN'] = parse_output_file("bfs_RN") + + create_csv(time_values, "bfs.csv") + +def test_bc(): + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-orkut.mtx", "0 0.12", "bc_OK") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-twitter-2010.mtx", "0 0.03", "bc_TW") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-LiveJournal1.mtx", "0 0.015", "bc_LJ") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/soc-sinaweibo.mtx", "0 0.012", "bc_SW") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/indochina-2004.weighted.mtx", "0 0.03", "bc_IC") + compile_and_execute(INPUTS_DIR+"/bc_power.gt", GRAPH_DIR+"/hollywood-2009.weighted.mtx", "0 0.03", "bc_HW") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/road_central.weighted.mtx", "0", "bc_RC") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/road_usa.weighted.mtx", "0", "bc_RU") + compile_and_execute(INPUTS_DIR+"/bc_road.gt", GRAPH_DIR+"/roadNet-CA.weighted.mtx", "0", "bc_RN") + + time_values={} + time_values['OK'] = parse_output_file("bc_OK") + time_values['TW'] = parse_output_file("bc_TW") + time_values['LJ'] = parse_output_file("bc_LJ") + time_values['SW'] = parse_output_file("bc_SW") + time_values['IC'] = parse_output_file("bc_IC") + time_values['HW'] = parse_output_file("bc_HW") + time_values['RC'] = parse_output_file("bc_RC") + time_values['RU'] = parse_output_file("bc_RU") + time_values['RN'] = parse_output_file("bc_RN") + + create_csv(time_values, "bc.csv") + +def run_all_tests(): + test_pr() + test_sssp() + test_cc() + test_bfs() + test_bc() + +def usage(pname): + print("Usage:") + print(pname + " ") + +def main(): + global GRAPHIT_SRC_DIR + global GRAPHIT_BUILD_DIR + global GRAPH_DIR + global WORKING_DIR + global OUTPUT_DIR + + if len(sys.argv) < 4: + usage(sys.argv[0]) + exit(1) + GRAPHIT_SRC_DIR = os.path.abspath(sys.argv[1].strip()).rstrip("/") + GRAPHIT_BUILD_DIR = 
os.path.abspath(sys.argv[2].strip()).rstrip("/") + GRAPH_DIR = os.path.abspath(sys.argv[3].strip()).rstrip("/") + + + if os.path.isdir(WORKING_DIR): + shutil.rmtree(WORKING_DIR) + os.mkdir(WORKING_DIR) + + + if not os.path.isdir(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + + get_gpu_prop() + + run_all_tests() + +if __name__ == '__main__': + main() + From 4c8aac8d2c8c517b21dd5144cc02001c2005493a Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 1 Jul 2020 13:50:22 -0400 Subject: [PATCH 83/88] Update .travis.yml --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 145b39fd..36191b62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,8 @@ python: "3.7" before_install: - sudo apt-get update - sudo apt-get install python3-pip + - sudo pip3 install --upgrade pip + - sudo pip3 install setuptools - sudo pip3 install pybind11 - sudo pip3 install scipy From 729421d217784ff7e6fb2d73107e5b82284c6501 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 3 Dec 2020 12:16:00 -0500 Subject: [PATCH 84/88] Small changes --- autotune/cmd.sh | 1 + autotune/run.sh | 28 +++ graphit_eval/eval/gpu_perf/.gitignore | 2 + test/c++/high_level_schedule_test.cpp | 269 +++++++++++++++++++++++++- 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 autotune/cmd.sh create mode 100644 autotune/run.sh create mode 100644 graphit_eval/eval/gpu_perf/.gitignore diff --git a/autotune/cmd.sh b/autotune/cmd.sh new file mode 100644 index 00000000..02c515e2 --- /dev/null +++ b/autotune/cmd.sh @@ -0,0 +1 @@ +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 10 --stop-after 900 diff --git a/autotune/run.sh b/autotune/run.sh new file mode 100644 index 00000000..f78fb321 --- /dev/null +++ b/autotune/run.sh @@ -0,0 +1,28 @@ + +export CUDA_VISIBLE_DEVICES=6 + + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 20 --stop-after 600 --final_config=final_config_ds_livejournal.json --kernel_fusion=True --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100 --runtime_limit 20 --stop-after 600 --final_config=final_config_ds_twitter.json --kernel_fusion=True --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/sssp_delta_stepping.gt --killed_process_report_runtime_limit 1 --max_delta 100000 --runtime_limit 20 --stop-after 1500 --final_config=final_config_ds_road_usa.json --kernel_fusion=True --num_vertices=0 + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 10 --stop-after 600 --final_config=final_config_cc_livejournal.json --num_vertices=0 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 30 --stop-after 600 --final_config=final_config_cc_twitter.json --num_vertices=0 + 
+#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/cc.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_cc_road_usa.json --num_vertices=0 + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_pr_livejournal.json --kernel_fusion=True --edge_only=True --num_vertices=4847571 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 30 --stop-after 600 --final_config=final_config_pr_twitter.json --kernel_fusion=True --edge_only=True --num_vertices=21297772 +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/pagerank.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_pr_road_usa.json --kernel_fusion=True --edge_only=True --num_vertices=23947347 + + + +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-orkut.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after=36000 --final_config=final_config_bfs_orkut.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True + +python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-LiveJournal1.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after=3600 --final_config=final_config_bfs_livejournal.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True --hybrid_threshold=8 + + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/soc-twitter-2010.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_bfs_twitter.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=True + +#python3 graphit_gpu_autotuner.py --graph /local/ajaybr/graph-dataset/clean_general/road_usa.weighted.mtx --algo_file gpu_apps/bfs.gt --killed_process_report_runtime_limit 1 --max_delta 1 --runtime_limit 20 --stop-after 600 --final_config=final_config_bfs_road_usa.json --kernel_fusion=True --num_vertices=0 --hybrid_schedule=1 + diff --git a/graphit_eval/eval/gpu_perf/.gitignore b/graphit_eval/eval/gpu_perf/.gitignore new file mode 100644 index 00000000..e3eb845b --- /dev/null +++ b/graphit_eval/eval/gpu_perf/.gitignore @@ -0,0 +1,2 @@ +output/* +scratch/* diff --git a/test/c++/high_level_schedule_test.cpp b/test/c++/high_level_schedule_test.cpp index 0edc6421..ead1fa92 100644 --- a/test/c++/high_level_schedule_test.cpp +++ b/test/c++/high_level_schedule_test.cpp @@ -226,6 +226,31 @@ class HighLevelScheduleTest : public ::testing::Test { " vertices.apply(printID);\n" "end"); + const char* gpu_cc_char = ("element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex) = load (\"../test/graphs/4.el\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const IDs : vector{Vertex}(int) = 1;\n" + "func updateEdge(src : Vertex, dst : Vertex)\n" + " IDs[dst] min= IDs[src];\n" + 
"end\n" + "func init(v : Vertex)\n" + " IDs[v] = v;\n" + "end\n" + "func printID(v : Vertex)\n" + " print IDs[v];\n" + "end\n" + "func main()\n" + " var n : int = edges.getVertices();\n" + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(n);\n" + " vertices.apply(init);\n" + " #s0# while (frontier.getVertexSetSize() != 0)\n" + " #s1# frontier = edges.from(frontier).applyModified(updateEdge, IDs);\n" + " end\n" + " vertices.apply(printID);\n" + "end"); + + const char* cf_char = ( "element Vertex end\n" @@ -471,7 +496,89 @@ class HighLevelScheduleTest : public ::testing::Test { " end\n" " vertices.apply(final_vertex_f);\n" "end"); - + const char* gpu_bc_char = ( + "element Vertex end" + "element Edge end" + "const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);" + "const vertices : vertexset{Vertex} = edges.getVertices();" + "const num_paths : vector{Vertex}(int) = 0;" + "const dependences : vector{Vertex}(float) = 0;" + "const visited : vector{Vertex}(bool) = false;" + "func forward_update(src : Vertex, dst : Vertex)" + " num_paths[dst] += num_paths[src];" + "end" + "func visited_vertex_filter(v : Vertex) -> output : bool" + " output = (visited[v] == false);" + "end" + "func mark_visited(v : Vertex)" + " visited[v] = true;" + "end" + "func mark_unvisited(v : Vertex)" + " visited[v] = false;" + "end" + "func backward_vertex_f(v : Vertex)" + " visited[v] = true;" + " dependences[v] += 1.0 / num_paths[v];" + "end" + "func backward_update(src : Vertex, dst : Vertex)" + " dependences[dst] += dependences[src];" + "end" + "func final_vertex_f(v : Vertex)" + " if num_paths[v] != 0" + " dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v];" + " else" + " dependences[v] = 0;" + " end" + "end" + "func reset(v : Vertex)" + " dependences[v] = 0;" + " num_paths[v] = 0;" + "end" + "func main()" + "" + " % transposing the edges" + " var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose();" + " for trail in 0:1" + " startTimer();" + " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);" + " var start_vertex : int = atoi(argv[2]);" + " frontier.addVertex(start_vertex);" + " num_paths[start_vertex] = 1;" + " visited[start_vertex] = true;" + " var round : int = 0;" + " var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}();" + " frontier_list.insert(frontier);" + " % foward pass to propagate num_paths" + " while (frontier.getVertexSetSize() != 0)" + " round = round + 1;" + " #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths);" + " delete frontier;" + " output.apply(mark_visited);" + " frontier_list.insert(output);" + " frontier = output;" + " end" + " % resetting the visited information for the backward pass" + " vertices.apply(mark_unvisited);" + " % pop off the empty frontier" + " frontier_list.retrieve(frontier);" + " frontier_list.retrieve(frontier);" + " frontier.apply(backward_vertex_f);" + " round = round - 1;" + " % backward pass to accumulate the dependencies" + " while (round > 0)" + " #s2# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update);" + " frontier_list.retrieve(frontier);" + " frontier.apply(backward_vertex_f);" + " round = round - 1;" + " end" + " delete frontier;" + " vertices.apply(final_vertex_f);" + " var elapsed_time : float = stopTimer();" + " print "elapsed time: ";" + " print elapsed_time;" + " vertices.apply(reset);" + " end" + "end"); const char* closeness_centrality_weighted_char = ( "element 
Vertex end\n" "element Edge end\n" @@ -532,6 +639,27 @@ class HighLevelScheduleTest : public ::testing::Test { " delete frontier; " " end\n" "end"); + const char* gpu_delta_stepping_char = ("element Vertex end\n" + "element Edge end\n" + "const edges : edgeset{Edge}(Vertex,Vertex, int) = load (\"argv[1]\");\n" + "const vertices : vertexset{Vertex} = edges.getVertices();\n" + "const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX\n" + "const pq: priority_queue{Vertex}(int);" + + "func updateEdge(src : Vertex, dst : Vertex, weight : int) \n" + " var new_dist : int = dist[src] + weight; " + " pq.updatePriorityMin(dst, dist[dst], new_dist); " + "end\n" + "func main() " + " var start_vertex : int = atoi(argv[2]);" + " dist[start_vertex] = 0;" + " pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex);" + " #s0# while (pq.finished() == false) " + " var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue_ready_set() \n" + " #s1# edges.from(frontier).applyUpdatePriority(updateEdge); \n" + " delete frontier; " + " end\n" + "end"); const char* ppsp_char = ("element Vertex end\n" "element Edge end\n" @@ -2417,3 +2545,142 @@ TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { program->applyGPUSchedule("s1", h1); EXPECT_EQ(0, basicTestWithGPUSchedule(program)); } + + +// GPU tests with different scheduels for 5 applications +TEST_F(HighLevelScheduleTest, GPUPageRankScheduleTest) { + using namespace fir::gpu_schedule; + istringstream is (std::string(pr_char)); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + SimpleGPUSchedule s1; + s1.configDirection(PULL); + s1.configLoadBalance(EDGE_ONLY, BLOCKED, 0x42000); + program->applyGPUSchedule("s1", s1); + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); +} + +TEST_F(HightLevelScheduleTest, GPUBFSPowerLawScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (bfs_str_gpu_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(ETWC); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); + +} + +TEST_F(HightLevelScheduleTest, GPUBFSRoadScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (bfs_str_gpu_); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(ETWC); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + program->applyGPUSchedule("s0:s1", s1); + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); +} + +TEST_F(HightLevelScheduleTest, GPUDSPowerLawScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (std::string(gpu_delta_stepping_char)); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = 
std::make_shared(context_); + + SimpleGPUSchedule s1; + s1.configLoadBalance(ETWC); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); + +} + +TEST_F(HightLevelScheduleTest, GPUDSRoadScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (std::string(gpu_delta_stepping_char)); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + + SimpleGPUSchedule s1; + s1.configLoadBalance(ETWC); + s1.configFrontierCreation(FUSED); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); + + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); + +} + +TEST_F(HightLevelScheduleTest, GPUCCScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (std::string(gpu_cc_char)); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + + SimpleGPUSchedule s1; + s1.configLoadBalance(ETWC); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + program->applyGPUSchedule("s0:s1", s1); + + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); + +} + +TEST_F(HightLevelScheduleTest, GPUBCPowerLawScheduleTest) { + using namespace fir::gpu_schedule; + + istringstream is (std::string(gpu_bc_char)); + fe_->parseStream(is, context_, errors_); + fir::high_level_schedule::ProgramScheduleNode::Ptr program + = std::make_shared(context_); + SimpleGPUSchedule s1; + s1.configLoadBalance(ETWC); + s1.configFrontierCreation(FUSED); + s1.configDeduplication(ENABLED, FUSED); + SimpleGPUSchedule s2; + s2.configLoadBalance(ETWC); + s2.configDirection(PULL, BITMAP); + s2.configFrontierCreation(UNFUSED_BITMAP); + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s1", s1); + program->applyGPUSchedule("s2", s1); + + + EXPECT_EQ(0, basicTestWithGPUSchedule(program)); + +} From 59744184534883d42b6a253aca8fd18f1e525fb0 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 3 Dec 2020 12:45:35 -0500 Subject: [PATCH 85/88] Very minor code generation bug --- src/backend/codegen_gpu/codegen_gpu.cpp | 32 ++- src/runtime_lib/infra_gpu/graph.h | 2 +- test/c++/high_level_schedule_test.cpp | 269 +----------------------- 3 files changed, 31 insertions(+), 272 deletions(-) diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index d9cf8bc5..d7ea387b 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -457,14 +457,17 @@ void CodeGenGPU::genEdgeSets(void) { edge_set_type->accept(this); oss << " " << "__host_" << edgeset->name << ";" << std::endl; - + bool requires_transpose = false; + bool requires_blocking = false; + uint32_t blocking_size = 0; if (mir_context_->graphs_with_blocking.find(edgeset->name) != mir_context_->graphs_with_blocking.end()) { - uint32_t blocking_size = mir_context_->graphs_with_blocking[edgeset->name]; + blocking_size = mir_context_->graphs_with_blocking[edgeset->name]; auto edge_set_type = mir::to(edgeset->type); edge_set_type->accept(this); oss << " __device__ " << edgeset->name << "__blocked_" << blocking_size << ";" << std::endl; edge_set_type->accept(this); oss << " " << "__host_" << edgeset->name << "__blocked_" << blocking_size << ";" << 
std::endl; + requires_blocking = true; } if (mir_context_->graphs_with_transpose.find(edgeset->name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[edgeset->name]) { @@ -473,8 +476,18 @@ void CodeGenGPU::genEdgeSets(void) { oss << " __device__ " << edgeset->name << "__transposed" << ";" << std::endl; edge_set_type->accept(this); oss << " __host_" << edgeset->name << "__transposed" << ";" << std::endl; + requires_transpose = true; } + if (requires_transpose && requires_blocking) { + auto edge_set_type = mir::to(edgeset->type); + edge_set_type->accept(this); + oss << " __device__ " << edgeset->name << "__blocked_" << blocking_size << "__transposed" << ";" << std::endl; + edge_set_type->accept(this); + oss << " __host_" << edgeset->name << "__blocked_" << blocking_size << "__transposed" << ";" << std::endl; + } + + } } @@ -613,8 +626,12 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { printIndent(); oss << "cudaMemcpyToSymbol("; oss << var_name << ", &__host_" << var_name << ", sizeof(__host_" << var_name << "), 0, cudaMemcpyHostToDevice);" << std::endl; + bool requires_blocking = false; + bool requires_transpose = false; + uint32_t blocking_size = 0; if (mir_context_->graphs_with_blocking.find(var_name) != mir_context_->graphs_with_blocking.end()) { - uint32_t blocking_size = mir_context_->graphs_with_blocking[var_name]; + blocking_size = mir_context_->graphs_with_blocking[var_name]; + requires_blocking = true; printIndent(); oss << "gpu_runtime::block_graph_edges(__host_" << var_name << ", __host_" << var_name << "__blocked_" << blocking_size << ", " << blocking_size << ");" << std::endl; printIndent(); @@ -623,12 +640,21 @@ void CodeGenGPU::visit(mir::FuncDecl::Ptr func_decl) { } if (mir_context_->graphs_with_transpose.find(var_name) != mir_context_->graphs_with_transpose.end() && mir_context_->graphs_with_transpose[var_name]) { + requires_transpose = true; printIndent(); oss << "__host_" << var_name << "__transposed = gpu_runtime::builtin_transpose(__host_" << var_name << ");" << std::endl; printIndent(); oss << "cudaMemcpyToSymbol("; oss << var_name << "__transposed" << ", &__host_" << var_name << "__transposed, sizeof(__host_" << var_name << "__transposed), 0, cudaMemcpyHostToDevice);" << std::endl; } + if (requires_transpose && requires_blocking) { + printIndent(); + oss << "gpu_runtime::block_graph_edges(__host_" << var_name << "__transposed, __host_" << var_name << "__blocked_" << blocking_size << "__transposed, " << blocking_size << ");" << std::endl; + printIndent(); + oss << "cudaMemcpyToSymbol("; + oss << var_name << "__blocked_" << blocking_size << "__transposed, &__host_" << var_name << "__blocked_" << blocking_size << "__transposed, sizeof(__host_" << var_name << "__blocked_" << blocking_size << "__transposed), 0, cudaMemcpyHostToDevice);" << std::endl; + + } } diff --git a/src/runtime_lib/infra_gpu/graph.h b/src/runtime_lib/infra_gpu/graph.h index e7582659..9fd91285 100755 --- a/src/runtime_lib/infra_gpu/graph.h +++ b/src/runtime_lib/infra_gpu/graph.h @@ -105,7 +105,7 @@ static void block_graph_edges(GraphT &input_graph, GraphT output : bool" - " output = (visited[v] == false);" - "end" - "func mark_visited(v : Vertex)" - " visited[v] = true;" - "end" - "func mark_unvisited(v : Vertex)" - " visited[v] = false;" - "end" - "func backward_vertex_f(v : Vertex)" - " visited[v] = true;" - " dependences[v] += 1.0 / num_paths[v];" - "end" - "func backward_update(src : Vertex, dst : Vertex)" - " dependences[dst] += dependences[src];" - 
"end" - "func final_vertex_f(v : Vertex)" - " if num_paths[v] != 0" - " dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v];" - " else" - " dependences[v] = 0;" - " end" - "end" - "func reset(v : Vertex)" - " dependences[v] = 0;" - " num_paths[v] = 0;" - "end" - "func main()" - "" - " % transposing the edges" - " var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose();" - " for trail in 0:1" - " startTimer();" - " var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);" - " var start_vertex : int = atoi(argv[2]);" - " frontier.addVertex(start_vertex);" - " num_paths[start_vertex] = 1;" - " visited[start_vertex] = true;" - " var round : int = 0;" - " var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}();" - " frontier_list.insert(frontier);" - " % foward pass to propagate num_paths" - " while (frontier.getVertexSetSize() != 0)" - " round = round + 1;" - " #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths);" - " delete frontier;" - " output.apply(mark_visited);" - " frontier_list.insert(output);" - " frontier = output;" - " end" - " % resetting the visited information for the backward pass" - " vertices.apply(mark_unvisited);" - " % pop off the empty frontier" - " frontier_list.retrieve(frontier);" - " frontier_list.retrieve(frontier);" - " frontier.apply(backward_vertex_f);" - " round = round - 1;" - " % backward pass to accumulate the dependencies" - " while (round > 0)" - " #s2# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update);" - " frontier_list.retrieve(frontier);" - " frontier.apply(backward_vertex_f);" - " round = round - 1;" - " end" - " delete frontier;" - " vertices.apply(final_vertex_f);" - " var elapsed_time : float = stopTimer();" - " print "elapsed time: ";" - " print elapsed_time;" - " vertices.apply(reset);" - " end" - "end"); + const char* closeness_centrality_weighted_char = ( "element Vertex end\n" "element Edge end\n" @@ -639,27 +532,6 @@ class HighLevelScheduleTest : public ::testing::Test { " delete frontier; " " end\n" "end"); - const char* gpu_delta_stepping_char = ("element Vertex end\n" - "element Edge end\n" - "const edges : edgeset{Edge}(Vertex,Vertex, int) = load (\"argv[1]\");\n" - "const vertices : vertexset{Vertex} = edges.getVertices();\n" - "const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX\n" - "const pq: priority_queue{Vertex}(int);" - - "func updateEdge(src : Vertex, dst : Vertex, weight : int) \n" - " var new_dist : int = dist[src] + weight; " - " pq.updatePriorityMin(dst, dist[dst], new_dist); " - "end\n" - "func main() " - " var start_vertex : int = atoi(argv[2]);" - " dist[start_vertex] = 0;" - " pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex);" - " #s0# while (pq.finished() == false) " - " var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue_ready_set() \n" - " #s1# edges.from(frontier).applyUpdatePriority(updateEdge); \n" - " delete frontier; " - " end\n" - "end"); const char* ppsp_char = ("element Vertex end\n" "element Edge end\n" @@ -2545,142 +2417,3 @@ TEST_F(HighLevelScheduleTest, BFSHybridPushPullScheduleTest) { program->applyGPUSchedule("s1", h1); EXPECT_EQ(0, basicTestWithGPUSchedule(program)); } - - -// GPU tests with different scheduels for 5 applications -TEST_F(HighLevelScheduleTest, GPUPageRankScheduleTest) { - using namespace fir::gpu_schedule; - istringstream is (std::string(pr_char)); - 
fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - SimpleGPUSchedule s1; - s1.configDirection(PULL); - s1.configLoadBalance(EDGE_ONLY, BLOCKED, 0x42000); - program->applyGPUSchedule("s1", s1); - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); -} - -TEST_F(HightLevelScheduleTest, GPUBFSPowerLawScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (bfs_str_gpu_); - fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - SimpleGPUSchedule s1; - s1.configDeduplication(DISABLED); - s1.configLoadBalance(ETWC); - s1.configDirection(PUSH); - s1.configFrontierCreation(FUSED); - SimpleGPUSchedule s2 = s1; - s2.configLoadBalance(VERTEX_BASED); - s2.configDirection(PULL, BITMAP); - s2.configDeduplication(DISABLED); - s2.configFrontierCreation(UNFUSED_BITMAP); - HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); - program->applyGPUSchedule("s0:s1", h1); - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); - -} - -TEST_F(HightLevelScheduleTest, GPUBFSRoadScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (bfs_str_gpu_); - fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - - SimpleGPUSchedule s1; - s1.configDeduplication(DISABLED); - s1.configLoadBalance(ETWC); - s1.configDirection(PUSH); - s1.configFrontierCreation(FUSED); - program->applyGPUSchedule("s0:s1", s1); - SimpleGPUSchedule s0; - s0.configKernelFusion(ENABLED); - program->applyGPUSchedule("s0", s0); - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); -} - -TEST_F(HightLevelScheduleTest, GPUDSPowerLawScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (std::string(gpu_delta_stepping_char)); - fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - - SimpleGPUSchedule s1; - s1.configLoadBalance(ETWC); - s1.configFrontierCreation(UNFUSED_BOOLMAP); - s1.configDelta("argv[3]"); - program->applyGPUSchedule("s0:s1", s1); - - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); - -} - -TEST_F(HightLevelScheduleTest, GPUDSRoadScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (std::string(gpu_delta_stepping_char)); - fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - - SimpleGPUSchedule s1; - s1.configLoadBalance(ETWC); - s1.configFrontierCreation(FUSED); - s1.configDelta("argv[3]"); - program->applyGPUSchedule("s0:s1", s1); - SimpleGPUSchedule s0; - s0.configKernelFusion(ENABLED); - program->applyGPUSchedule("s0", s0); - - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); - -} - -TEST_F(HightLevelScheduleTest, GPUCCScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (std::string(gpu_cc_char)); - fe_->parseStream(is, context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - - SimpleGPUSchedule s1; - s1.configLoadBalance(ETWC); - s1.configDeduplication(ENABLED); - s1.configFrontierCreation(UNFUSED_BITMAP); - program->applyGPUSchedule("s0:s1", s1); - - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); - -} - -TEST_F(HightLevelScheduleTest, GPUBCPowerLawScheduleTest) { - using namespace fir::gpu_schedule; - - istringstream is (std::string(gpu_bc_char)); - fe_->parseStream(is, 
context_, errors_); - fir::high_level_schedule::ProgramScheduleNode::Ptr program - = std::make_shared(context_); - SimpleGPUSchedule s1; - s1.configLoadBalance(ETWC); - s1.configFrontierCreation(FUSED); - s1.configDeduplication(ENABLED, FUSED); - SimpleGPUSchedule s2; - s2.configLoadBalance(ETWC); - s2.configDirection(PULL, BITMAP); - s2.configFrontierCreation(UNFUSED_BITMAP); - HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); - program->applyGPUSchedule("s1", s1); - program->applyGPUSchedule("s2", s1); - - - EXPECT_EQ(0, basicTestWithGPUSchedule(program)); - -} From 6cc3eca58ca12ff57dcb7c516aaf58118f2d3cb5 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Wed, 10 Mar 2021 14:21:57 -0500 Subject: [PATCH 86/88] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a6f0d1fd..accd4540 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ GraphIt Domain Specific Langauge and Compiler [![Build Status](https://travis-ci.org/GraphIt-DSL/graphit.svg?branch=master)](https://travis-ci.org/GraphIt-DSL/graphit) ========== -GraphIt is a high-performance Graph DSL. [Website](http://graphit-lang.org) +GraphIt is a high-performance Graph DSL. +[Website](http://graphit-lang.org) Dependencies =========== From f8646f1a76efaf1399f1f60928c9f83ddb1bea54 Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Fri, 19 Mar 2021 03:23:27 -0400 Subject: [PATCH 87/88] Added CGO21 AE to the repo, all tests passing --- graphit_eval/g2_cgo2021_eval/.gitignore | 5 + graphit_eval/g2_cgo2021_eval/README.md | 159 ++++++++ graphit_eval/g2_cgo2021_eval/dataset/Makefile | 13 + graphit_eval/g2_cgo2021_eval/dataset/local.sh | 6 + .../g2_cgo2021_eval/fig3_inputs/fig3_a.gt | 51 +++ .../g2_cgo2021_eval/fig3_inputs/fig3_b.gt | 51 +++ .../g2_cgo2021_eval/fig3_inputs/fig3_c.gt | 56 +++ graphit_eval/g2_cgo2021_eval/gen_fig3.py | 72 ++++ graphit_eval/g2_cgo2021_eval/gen_table7.py | 338 ++++++++++++++++++ .../g2_cgo2021_eval/table7_inputs/bc_road.gt | 125 +++++++ .../table7_inputs/bc_social.gt | 126 +++++++ .../g2_cgo2021_eval/table7_inputs/bfs_road.gt | 55 +++ .../table7_inputs/bfs_social.gt | 59 +++ .../g2_cgo2021_eval/table7_inputs/cc.gt | 63 ++++ .../g2_cgo2021_eval/table7_inputs/ds_road.gt | 49 +++ .../table7_inputs/ds_social.gt | 45 +++ .../table7_inputs/obtain_gpu_cc.cu | 31 ++ .../g2_cgo2021_eval/table7_inputs/pr.gt | 60 ++++ .../table7_inputs/simple_graph_load.gt | 13 + include/graphit/midend/atomics_op_lower.h | 6 +- src/backend/codegen_gpu/codegen_gpu.cpp | 13 +- src/midend/apply_expr_lower.cpp | 15 +- src/midend/atomics_op_lower.cpp | 13 +- src/midend/mir_lower.cpp | 2 +- 24 files changed, 1414 insertions(+), 12 deletions(-) create mode 100644 graphit_eval/g2_cgo2021_eval/.gitignore create mode 100644 graphit_eval/g2_cgo2021_eval/README.md create mode 100644 graphit_eval/g2_cgo2021_eval/dataset/Makefile create mode 100644 graphit_eval/g2_cgo2021_eval/dataset/local.sh create mode 100644 graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt create mode 100644 graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt create mode 100644 graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt create mode 100644 graphit_eval/g2_cgo2021_eval/gen_fig3.py create mode 100644 graphit_eval/g2_cgo2021_eval/gen_table7.py create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt create mode 100644 
graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt create mode 100644 graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt diff --git a/graphit_eval/g2_cgo2021_eval/.gitignore b/graphit_eval/g2_cgo2021_eval/.gitignore new file mode 100644 index 00000000..1f593ec1 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/.gitignore @@ -0,0 +1,5 @@ +fig3_outputs/* +table7_outputs/* +dataset/* +!dataset/Makefile +!dataset/local.sh diff --git a/graphit_eval/g2_cgo2021_eval/README.md b/graphit_eval/g2_cgo2021_eval/README.md new file mode 100644 index 00000000..87790a17 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/README.md @@ -0,0 +1,159 @@ +# G2_artifact_eval + +## Introduction +This repository is the guide for evaluating our CGO2021 paper, "Techniques for Compiling Graphs Algorithms for GPUs". This guide has steps for cloning, compiling and executing the implementation of the compiler framework G2 which is built on top of the [GraphIt DSL compiler](https://graphit-lang.org/). +This guide has two parts - + - Part 1: Reproducing Figure 3. in the paper to demonstrate that the compiler can generate very different optimized code for different schedules + - Part 2: Reproducing the G2 columns for all the applications and graphs from Table 7 to demonstrate the performance of the code generated from the G2 compiler + +Since Table 7 shows the performance numbers when run on the NVIDIA-Tesla V-100 GPU, the exact execution times you will get in Part 2 will depend on the actual GPU you use. If you do not have access to the same GPU, we have provided access to our system with this GPU in our artifact evaluation submission. +If you use any other GPU the schedules might have to be tuned to get the best performance for the GPU. + +## Requirements +We expect you to run the artifact evaluation on a Linux system with at least 40GBs of space. Following are the software requirements for each of the parts + +### Part 1 +Since part 1 only demonstrates the different code generated for different schedules, this part does *NOT* require an NVIDIA GPU or CUDA to be installed. The only software requirements are - + + - cmake (>= 3.5.1) + - CXX compiler (like g++ >= 5.4.0) + - python3 + - make + - bash + - git + +### Part 2 +Part 2 demonstrates the performance of these applications on the actual GPU. Ideally we require an NVIDIA Tesla V-100 for best results, but other NVIDIA GPUs would also work (the actual performance numbers would be different in that case). Following are the requirements besides all the requirements from Part 1 - + + - NVIDIA GPU (Pascal generation or better, preferred NVIDIA Tesla V-100 32 GB, access to our machine provided in the artifact evaluation submission). + - CUDA SDK (>= 9.0) + + +## How to run + +### Cloning +We will start by cloning this repository on the evaluation system using the following command - + + git clone --recursive https://github.com/AjayBrahmakshatriya/G2_artifact_eval.git + +If you have already cloned this repository without the `--recursive` command you can get the submodules by running the following commands. Otherwise you can directly proceed to Building G2. 
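+(The two commands below fetch the pinned `graphit` submodule in place; on a reasonably recent git, `git submodule update --init` is an equivalent one-liner.)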
+ + git submodule init + git submodule update + +### Building G2 +Start by navigating to the `G2_artifact_eval` directory. We will first build the G2 compiler by running the following commands from the repo's top level directory - + + cd graphit + mkdir build + cd build + cmake .. + make -j$(nproc) + +If no errors are reported, the G2 compiler is built correctly and you can navigate back to the repository's top level directory and proceed to "Running Part 1" + +### Running Part 1 +With the G2 compiler built, you can run the Part 1 to generate the code as shown in Figure 3. You can start by returning to the top level directory of the repository with the command `cd ../../` and then running the command - + + python3 gen_fig3.py + +When running this command, the program will prompt for a few options like paths to where the G2 compiler is built and the output directory path. If you have followed the above steps, you can simply press enter and choose the default options shown in `[]`. + +This command should take about 5 mins to run and if it doesn't report any errors, the appropriate files have been generated. Notice the above commands also prints all the commands that were executed to generate the output files. + +The source files for the three schedules in Figure 3 are in the `fig3_inputs/` directory - `fig3_inputs/fig3_a.gt`, `fig3_inputs/fig3_b.gt` and `fig3_inputs/fig3_c.gt`. You can open and read them in your favorite text editor. All the three programs have the same algorithm input but different schedules a the bottom under the `schedule:` section. You can match this schedule with the one in the paper (barring some syntactic changes in the paper for brevity). + +If you choose the default options while running the above programs, the outputs should be generated in the `fig3_outputs/` directory - `fig3_outputs/fig3_a.gt.cu`, `fig3_outputs/fig3_b.gt.cu` and `fig3_outputs/fig3_c.gt.cu`. Again, you can open and read them in your favorite text editor or simple `cat` them. + +You can match the body of the `main` and the user defined function `updateEdges`. Again, we have changed the syntax a little in the paper for brevity. + +### Obtaining the datasets +The datasets are only required for Part 2. If you are not planning to run Part 2, you do not need to obtain the datasets. + +We have created two datasets for your convenience - *small* and *all*. The small dataset contains just two graphs (one with bounded degree distribution and one with power law degree distribution). Obtaining and running the small dataset should take less than 15 mins and quickly tests all the variants for all algorithms. The all dataset contains all the 9 graphs from the paper and would take much longer to run (upwards of 1.5 hours on our system). + +There are two ways of obtaining the datasets. If you are running this artifact evaluation on the system we have provided access to, you can quickly fetch all the data set files by running the following commands in the top level directory - + + cd dataset + make local + +If everything succeeds, the dataset should be soft-linked into this directory and you can verify that by running the `ls` command. You can now navigate to the top level directory using the command `cd ../` and proceed to the next step ("Running Part 2"). If the command reports the error + +> You are not running this command on the right host. 
Please use `make dataset` instead + +it means that you are not running the artifact evaluation on our system and you should use the other method for downloading the datasets + +If you are running the artifact evaluation on your own system, the script will have to download a tar ball and extact the files. We have a separate command for *small* and *all* datasets. So if you are planning to run the evaluation only for the 2 graphs, please download only the small dataset to save time. + +For downloading the *all* dataset run the following commands from the top-level directory - + + cd datasets + make dataset + +To download just the *small* dataset run the following commands from the top-level directory - + + cd datasets + make small + + +This step will take some time, because it downloads and uncompresses all the datasets. After the command succeeds, you can verify that the files are downloaded by running the `ls` command. The small dataset is part of the all dataset and if you accidently downloaded the all dataset, you can still run the small part of the experiment. + +Navigate to the top level directory in any case using the command - `cd ../` + +### Running Part 2 +This part evaluates the generated code for all the applications and inputs to reproduce Table 7 in the paper. A reminder that if you are running the experiments on a system with any other GPU than the NVIDIA Tesla V-100 (32 GB), the results might be different. The system we have provided with the artifact evaluation has the correct GPU. + +Before we actually run the evaluation, we will list all the GPUs in the system and find one that is completely free. We need a free GPU because the performance might be hampered if other processes are running on the same GPU. + +Start by running the command - + + nvidia-smi + +This will list all the GPUs attached to the system numbered from 0. At the bottom of the table, there is a Processes section which shows what processes are running on which GPU. Find a GPU which doesn't have any processes running on it and note down its ID. Suppose for the purpose of this evaluation, the 4th GPU (ID: 3) is free and we want to use that. + +We do not recommend running the evaluation on a GPU that is being used by other processes since it might affect the evaluation results (and correctness) a lot. + +Before running the actual command for running all the experiments, make sure you have successfully built G2 and fetched the datasets. If you are planning to run the *all* dataset make sure you have downloaded the entire dataset. + +To run only the small data set navigate to the top level directory of the repository and run the command - + + python3 gen_table7.py small + +To run the all data set navigate to the top level directory of the repository and run the command - + + python3 gen_table7.py + +Again, like Part 1 the program will prompt for various options like path to the CUDA compiler, CXX compiler, path to G2 build directory and the GPU to use. Following is the description of each of the options - + +- Output directory to use: This is the directory where the output of this section will be generated. Please select the default option by pressing enter (notice that the outputs from previous runs will be wiped. So if are planning to run multiple times and want to preserve old results, copy the results somewhere else). +- GraphIt build directory: This is the path to the `build/` directory where G2 is compiled. If you have followed the exact steps above, just choose the default by pressing enter. 
+- Dataset path: This is the directory where the datasets are fetched. If you have followed the exact steps mentioned above, just select the default by pressing enter. +- NVCC path: This is the path to the `nvcc` compiler from the CUDA SDK. Typically this binary is located at `/usr/local/cuda/bin/nvcc`. If you have installed it elsewhere, please provide the path here. If you have the binary in your `$PATH` variable (you can verify this by running `nvcc --version`), you can simply type `nvcc` and press enter. If you are using the system that we have provided, just press the enter key. +- CXX_COMPILER path: This is the path to the CXX compiler that you want to use. The default option is `/usr/bin/g++`. If you are using a different compiler, please provide the path here. If you are using the system that we have provided, just press the enter key. +- GPU ID to use: This is the GPU ID that you want to use to run the experiments on. We have obtained the ID of a GPU that is free in the above step. Enter that here. If the 4th GPU (ID: 3) is free, type `3` and press enter. The default option is `0`, but `0` might not be free. + +Once you enter all the options, the experiments will run one after the other. The program will print which application it is currently running and how many graphs it is done evaluating on. Sit back because running all the applications can take a while (~20 mins for small dataset and >1.5 hrs for all dataset) + +If the program completes execution without any errors, all the experiments are done and you can view the final results in the output directory. If you chose the default option, the output file should be under `table7_outputs/table7.txt`. The program should also print the table on successful completion. + + +## Evaluating related works +Table 7 in the paper also compares against other related works to compare the speedups we obtain with our compiler. Unfortunately the source code of some of the related works is not directly usable (we found some bugs in the systems and had to fix them ourselves). But the related work Gunrock has a system that is easy to build and evaluate - + +The source code for Gunrock is available in the repository - + + https://github.com/gunrock/gunrock + +Building gunrock is very easy and you can simply follow the instructions under "Quick Start Guide". Gunrock also provides access to docker containers that should have all the dependences packaged. You can use those if it is more convenient. + +If you successfully build gunrock, you can run the experiments with the same dataset that you obtained for G2 evaluation (mtx file format). + +For example to run the pagerank application with Gunrock, the binary is located in the `build/bin/` directory. You can run it using the command - + + ./pr market /path/to/dataset/roadNet-CA.mtx --num-runs=10 --quick --device=3 --advance-mode=TWC + +Here `--device=3` is the GPU to run the experiments on. Change the ID to the proper GPU you want to run on. For other applications like BFS and BC some extra parameters like `do-bfs-a` and `do-bfs-b` are required. 
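+Note that `--num-runs=10` mirrors the 10 timed trials the G2 applications run per graph, and gen_table7.py reports the minimum over those trials, so compare against the corresponding statistic in Gunrock's output.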
The details of all applications, their command line arguments and reference results can be found at - + + https://gunrock.github.io/docs/#/analysis/engines_topc/engines_topc_table + + diff --git a/graphit_eval/g2_cgo2021_eval/dataset/Makefile b/graphit_eval/g2_cgo2021_eval/dataset/Makefile new file mode 100644 index 00000000..a800bf73 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/dataset/Makefile @@ -0,0 +1,13 @@ +small: + wget "https://www.dropbox.com/s/s92m1rqjvwvkf4m/graph-dataset-small.tar.gz?dl=0" -O graph-dataset-small.tar.gz + tar xvf graph-dataset-small.tar.gz + rm graph-dataset-small.tar.gz + +dataset: + wget "https://www.dropbox.com/s/ysk7sk8yor2o71g/graph-dataset.tar.gz?dl=0" -O graph-dataset.tar.gz + tar xvf graph-dataset.tar.gz + rm graph-dataset.tar.gz +local: + bash local.sh + + diff --git a/graphit_eval/g2_cgo2021_eval/dataset/local.sh b/graphit_eval/g2_cgo2021_eval/dataset/local.sh new file mode 100644 index 00000000..c409a803 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/dataset/local.sh @@ -0,0 +1,6 @@ +if [[ $(hostname) == "lanka-dgx0.csail.mit.edu" ]]; then + echo OK + ln -s /local/ajaybr/graph-dataset/clean_general/*.mtx . +else + echo You are not running this command on the right host. Please use `make dataset` instead +fi diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt new file mode 100644 index 00000000..ef741291 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_a.gt @@ -0,0 +1,51 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configDirection(PUSH); + s1.configLoadBalance(VERTEX_BASED); + s1.configFrontierCreation(FUSED); + program->applyGPUSchedule("s0:s1", s1); + + diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt new file mode 100644 index 00000000..443ad1fd --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_b.gt @@ -0,0 +1,51 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + 
vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); + diff --git a/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt new file mode 100644 index 00000000..43f2a687 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/fig3_inputs/fig3_c.gt @@ -0,0 +1,56 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDirection(PUSH); + s1.configLoadBalance(TWCE); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/g2_cgo2021_eval/gen_fig3.py b/graphit_eval/g2_cgo2021_eval/gen_fig3.py new file mode 100644 index 00000000..0cf6952b --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/gen_fig3.py @@ -0,0 +1,72 @@ +import os +import subprocess +DIR_PATH=os.path.dirname(os.path.realpath(__file__)).rstrip("/") + +SCRATCH_PATH="" +GRAPHIT_BUILD_PATH="" +APPS_DIRECTORY="" + + + +def read_default_path(message, default): + print(message + " [" + default + "]: ", end="") + val = input().strip().rstrip("/") + if val == "": + val = default + return val + +def get_command_output(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + exitcode = proc.wait() + if exitcode != 0: + print(command) + assert(exitcode == 0) + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + proc.stdout.close() + return output + +def compile_application(gtfile): + 
get_command_output("python3 " + GRAPHIT_BUILD_PATH + "/bin/graphitc.py -f " + APPS_DIRECTORY + "/" + gtfile + " -o " + gtfile + ".cu") + + +def run_tests(): + compile_application("fig3_a.gt") + compile_application("fig3_b.gt") + compile_application("fig3_c.gt") + + os.system("rm compile.cpp compile.o") + + +def main(): + global SCRATCH_PATH + global GRAPHIT_BUILD_PATH + global APPS_DIRECTORY + + print("Starting artifact evaluation in directory: ", DIR_PATH) + SCRATCH_PATH = read_default_path("Please choose a output directory to use", DIR_PATH + "/fig3_outputs") + GRAPHIT_BUILD_PATH = read_default_path("Please choose GraphIt build directory", DIR_PATH + "/../../build") + APPS_DIRECTORY = DIR_PATH+"/fig3_inputs" + + if os.path.exists(SCRATCH_PATH): + os.system("rm -rf " + SCRATCH_PATH) + os.makedirs(SCRATCH_PATH) + + os.chdir(SCRATCH_PATH) + + + run_tests() + + + + + + +if __name__ == "__main__": + main() diff --git a/graphit_eval/g2_cgo2021_eval/gen_table7.py b/graphit_eval/g2_cgo2021_eval/gen_table7.py new file mode 100644 index 00000000..d1777001 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/gen_table7.py @@ -0,0 +1,338 @@ +import os +import subprocess +import sys +DIR_PATH=os.path.dirname(os.path.realpath(__file__)).rstrip("/") + +SCRATCH_PATH="" +GRAPHIT_BUILD_PATH="" +DATASET_PATH="" +APPS_DIRECTORY="" +GPU_ID="" +NVCC_PATH="" +CXX_COMPILER="" +NVCC_COMMAND="" +GPU_PREFIX="" + + +ORKUT="" +TWITTER="" +LIVEJOURNAL="" +SINAWEIBO="" +HOLLYWOOD="" +INDOCHINA="" +RUSA="" +RCA="" +RCENTRAL="" +GRAPH_ALL=[] +GRAPH_SOCIAL=[] +GRAPH_ROAD=[] + +def find_dataset_files(): + global ORKUT + global TWITTER + global LIVEJOURNAL + global SINAWEIBO + global HOLLYWOOD + global INDOCHINA + global RUSA + global RCA + global RCENTRAL + global GRAPH_ALL + global GRAPH_ROAD + global GRAPH_SOCIAL + + ORKUT=DATASET_PATH+"/soc-orkut.mtx" + TWITTER=DATASET_PATH+"/soc-twitter-2010.mtx" + LIVEJOURNAL=DATASET_PATH+"/soc-LiveJournal1.mtx" + SINAWEIBO=DATASET_PATH+"/soc-sinaweibo.mtx" + HOLLYWOOD=DATASET_PATH+"/hollywood-2009.weighted.mtx" + INDOCHINA=DATASET_PATH+"/indochina-2004.weighted.mtx" + RUSA=DATASET_PATH+"/road_usa.weighted.mtx" + RCA=DATASET_PATH+"/roadNet-CA.weighted.mtx" + RCENTRAL=DATASET_PATH+"/road_central.weighted.mtx" + + if len(sys.argv) >= 2 and sys.argv[1] == "small": + GRAPH_SOCIAL=[('livejournal', LIVEJOURNAL)] + GRAPH_ROAD=[('rca', RCA)] + else: + GRAPH_SOCIAL=[('orkut', ORKUT), ('twitter', TWITTER), ('livejournal', LIVEJOURNAL), ('sinaweibo', SINAWEIBO), ('indochina', INDOCHINA), ('hollywood', HOLLYWOOD)] + GRAPH_ROAD=[('rca', RCA), ('rusa', RUSA), ('rcentral', RCENTRAL)] + + GRAPH_ALL = GRAPH_SOCIAL + GRAPH_ROAD + + + +def read_default_path(message, default): + print(message + " [" + default + "]: ", end="") + val = input().strip().rstrip("/") + if val == "": + val = default + return val + +def get_gpu_count(): + gpus = os.popen("nvidia-smi -L").read().strip() + return len(gpus.split("\n")) + +def get_command_output(command): + output = "" + if isinstance(command, list): + proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + exitcode = proc.wait() + if exitcode != 0: + print(command) + assert(exitcode == 0) + for line in proc.stdout.readlines(): + if isinstance(line, bytes): + line = line.decode() + output += line.rstrip() + "\n" + proc.stdout.close() + return output + +def set_NVCC_COMMAND(MAX_REG=64): + global NVCC_COMMAND + + NVCC_COMMAND = NVCC_PATH + " 
-ccbin " + CXX_COMPILER + " " + + get_command_output(NVCC_COMMAND + APPS_DIRECTORY + "/obtain_gpu_cc.cu -o obtain_gpu_cc") + output = get_command_output(GPU_PREFIX+"./obtain_gpu_cc").split() + + if len(output) != 2: + print ("Cannot obtain GPU information") + exit(-1) + compute_capability = output[0] + num_of_sm = output[1] + + if MAX_REG == 64: + NVCC_COMMAND += " -rdc=true -DNUM_CTA=" + str(int(num_of_sm)*2) + " -DCTA_SIZE=512 -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + elif MAX_REG == 512: + CTA_STYLE = (int(int(num_of_sm)/2), int(512/2)) + NVCC_COMMAND += " -rdc=true -DNUM_CTA=" + str(CTA_STYLE[0]) + " -DCTA_SIZE=" + str(CTA_STYLE[1]) + " -gencode arch=compute_" + compute_capability + ",code=sm_" + compute_capability + else: + print("Invalid MAX_REG configuration, not supported\n") + exit(-1) + + NVCC_COMMAND += " -std=c++11 -O3 -I " + DIR_PATH+"/../.." + "/src/runtime_lib/ -Xcompiler \"-w\" -Wno-deprecated-gpu-targets --use_fast_math -Xptxas \" -dlcm=ca --maxrregcount=" + str(MAX_REG) + "\" " + + +def compile_application(gtfile, binname): + if os.path.exists(binname): + return + get_command_output("python3 " + GRAPHIT_BUILD_PATH + "/bin/graphitc.py -f " + APPS_DIRECTORY + "/" + gtfile + " -o " + gtfile + ".cu") + get_command_output(NVCC_COMMAND + gtfile + ".cu -o " + binname) + + +def run_sanity_check(): + compile_application("simple_graph_load.gt", "load") + get_command_output(GPU_PREFIX+"./load " + RCA) + + +def compile_and_run(gtfile, binname, run_args, outputf): + compile_application(gtfile, binname) + output = get_command_output(GPU_PREFIX+"./"+binname + " " + run_args) + f = open(outputf, "w") + f.write(output) + f.close() + + +def run_pr(): + set_NVCC_COMMAND() + print("Running eval for Pagerank") + PR = "pr.gt" + for i, (name, graph) in enumerate(GRAPH_ALL): + compile_and_run(PR, "pr", graph, "pr_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + +def run_cc(): + set_NVCC_COMMAND() + print("Running eval for Connected Components") + CC = "cc.gt" + for i, (name, graph) in enumerate(GRAPH_ALL): + compile_and_run(CC, "cc", graph, "cc_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + +def run_ds(): + delta = {} + delta["orkut"] = 22 + delta["livejournal"] = 120 + delta["twitter"] = 15 + delta["sinaweibo"] = 15 + delta["hollywood"] = 15 + delta["indochina"] = 10000 + delta["rusa"] = 80000 + delta["rcentral"] = 30000 + delta["rca"] = 20000 + + print ("Running eval for Delta Stepping") + DS_SOCIAL = "ds_social.gt" + DS_ROAD = "ds_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(DS_SOCIAL, "ds_social", graph + " 0 " + str(delta[name]), "ds_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(DS_ROAD, "ds_road", graph + " 0 " + str(delta[name]), "ds_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + +def run_bc(): + threshold = {} + threshold["orkut"] = 0.010 + threshold["livejournal"] = 0.006 + threshold["twitter"] = 0.023 + threshold["sinaweibo"] = 0.008 + threshold["hollywood"] = 0.026 + threshold["indochina"] = 0.99 + + print ("Running eval for Betweenness Centrality") + BC_SOCIAL = "bc_social.gt" + BC_ROAD = "bc_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(BC_SOCIAL, "bc_social", graph + " 0 " + str(threshold[name]), "bc_" + name + ".out") + print(str(i+1) + "/" + 
str(len(GRAPH_ALL))) + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(BC_ROAD, "bc_road", graph + " 0", "bc_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + + +def run_bfs(): + threshold = {} + threshold["orkut"] = 0.010 + threshold["livejournal"] = 0.006 + threshold["twitter"] = 0.023 + threshold["sinaweibo"] = 0.008 + threshold["hollywood"] = 0.026 + threshold["indochina"] = 0.99 + + print ("Running eval for Breadth First Search") + BFS_SOCIAL = "bfs_social.gt" + BFS_ROAD = "bfs_road.gt" + set_NVCC_COMMAND() + for i, (name, graph) in enumerate(GRAPH_SOCIAL): + compile_and_run(BFS_SOCIAL, "bfs_social", graph + " 0 " + str(threshold[name]), "bfs_" + name + ".out") + print(str(i+1) + "/" + str(len(GRAPH_ALL))) + set_NVCC_COMMAND(512) + for i, (name, graph) in enumerate(GRAPH_ROAD): + compile_and_run(BFS_ROAD, "bfs_road", graph + " 0", "bfs_" + name + ".out") + print(str(i+1+len(GRAPH_SOCIAL)) + "/" + str(len(GRAPH_ALL))) + + +def read_execution_time(filename): + try: + f = open(SCRATCH_PATH + "/" + filename, "r") + values = f.read().strip().split("\n") + values = [float(val) for val in values] + min_val = min(values) + min_val = int(min_val * 100000) / 100.0 + return min_val + except: + return -1 + + +def run_tests(): + # get the GPU properties first + set_NVCC_COMMAND() + run_sanity_check() + run_pr() + run_cc() + run_ds() + run_bc() + run_bfs() + + +def print_cell(f, val): + spaces = 9 - len(str(val)) + f.write(" " * spaces + str(val) + " |") + +def gen_table7(): + short_names = {} + short_names["orkut"] = "OK" + short_names["twitter"] = "TW" + short_names["livejournal"] = "LJ" + short_names["sinaweibo"] = "SW" + short_names["hollywood"] = "HW" + short_names["indochina"] = "IC" + short_names["rusa"] = "RU" + short_names["rca"] = "RN" + short_names["rcentral"] = "RC" + + filepath = SCRATCH_PATH + "/table7.txt" + f = open(filepath, "w") + + f.write("-" * 67) + f.write("\n") + f.write("|") + print_cell(f, "Graph") + print_cell(f, "PR") + print_cell(f, "CC") + print_cell(f, "BFS") + print_cell(f, "BC") + print_cell(f, "SSSP") + f.write("\n") + f.write("-" * 67) + f.write("\n") + + for graph, _ in GRAPH_ALL: + f.write("|") + print_cell(f, short_names[graph]) + for app in ["pr", "cc", "bfs", "bc", "ds"]: + fname = app + "_" + graph + ".out" + val = read_execution_time(fname) + print_cell(f, val) + f.write("\n") + + f.write("-" * 67) + f.write("\n") + + f.close() + print(open(filepath, "r").read()) + print("# This table is generated at: ", filepath) + + +def main(): + global SCRATCH_PATH + global GRAPHIT_BUILD_PATH + global DATASET_PATH + global APPS_DIRECTORY + global GPU_ID + global NVCC_PATH + global CXX_COMPILER + global GPU_PREFIX + + print("Starting artifact evaluation in directory: ", DIR_PATH) + SCRATCH_PATH = read_default_path("Please choose a output directory to use", DIR_PATH + "/table7_outputs") + GRAPHIT_BUILD_PATH = read_default_path("Please choose GraphIt build directory", DIR_PATH + "/../../build") + DATASET_PATH = read_default_path("Please choose dataset path", DIR_PATH + "/dataset") + APPS_DIRECTORY = DIR_PATH+"/table7_inputs" + NVCC_PATH = read_default_path("Please choose NVCC path", "/usr/local/cuda/bin/nvcc") + CXX_COMPILER = read_default_path("Please choose CXX_COMPILER", "/usr/bin/g++") + + if os.path.exists(SCRATCH_PATH): + os.system("rm -rf " + SCRATCH_PATH) + os.makedirs(SCRATCH_PATH) + + os.chdir(SCRATCH_PATH) + + + total_devices = get_gpu_count() + GPU_ID = read_default_path("Choose GPU id 
to use (0-" + str(total_devices-1) + ")", str(0)) + GPU_PREFIX="CUDA_VISIBLE_DEVICES="+GPU_ID+" " + + + find_dataset_files() + run_tests() + gen_table7() + + + + + + +if __name__ == "__main__": + main() diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt new file mode 100644 index 00000000..fc84b683 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_road.gt @@ -0,0 +1,125 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(double) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; + visited[v] = false; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:10 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + #s0# while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + #s2# while (round > 0) + #s3# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + s1.configDeduplication(ENABLED, FUSED); + + program->applyGPUSchedule("s0:s1", s1); + program->applyGPUSchedule("s2:s3", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + + program->applyGPUSchedule("s0", s0); + program->applyGPUSchedule("s2", s0); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt 
b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt new file mode 100644 index 00000000..ae797d24 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bc_social.gt @@ -0,0 +1,126 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); + +const num_paths : vector{Vertex}(double) = 0; +const dependences : vector{Vertex}(float) = 0; +const visited : vector{Vertex}(bool) = false; + +func forward_update(src : Vertex, dst : Vertex) + num_paths[dst] += num_paths[src]; +end + +func visited_vertex_filter(v : Vertex) -> output : bool + output = (visited[v] == false); +end + +func mark_visited(v : Vertex) + visited[v] = true; +end + +func mark_unvisited(v : Vertex) + visited[v] = false; +end + +func backward_vertex_f(v : Vertex) + visited[v] = true; + dependences[v] += 1.0 / num_paths[v]; +end + +func backward_update(src : Vertex, dst : Vertex) + dependences[dst] += dependences[src]; +end + +func final_vertex_f(v : Vertex) + if num_paths[v] != 0 + dependences[v] = (dependences[v] - 1.0 / num_paths[v]) * num_paths[v]; + else + dependences[v] = 0; + end +end + +func reset(v : Vertex) + dependences[v] = 0; + num_paths[v] = 0; + visited[v] = false; +end + + + + +func main() + + % transposing the edges + var transposed_edges : edgeset{Edge}(Vertex, Vertex) = edges.transpose(); + for trail in 0:10 + startTimer(); + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + var start_vertex : int = atoi(argv[2]); + + frontier.addVertex(start_vertex); + num_paths[start_vertex] = 1; + visited[start_vertex] = true; + var round : int = 0; + var frontier_list : list{vertexset{Vertex}} = new list{vertexset{Vertex}}(); + + frontier_list.insert(frontier); + + % foward pass to propagate num_paths + while (frontier.getVertexSetSize() != 0) + round = round + 1; + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(visited_vertex_filter).applyModified(forward_update, num_paths); + delete frontier; + output.apply(mark_visited); + frontier_list.insert(output); + frontier = output; + end + + + % resetting the visited information for the backward pass + vertices.apply(mark_unvisited); + + % pop off the empty frontier + frontier_list.retrieve(frontier); + + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + + % backward pass to accumulate the dependencies + while (round > 0) + #s2# transposed_edges.from(frontier).to(visited_vertex_filter).apply(backward_update); + frontier_list.retrieve(frontier); + frontier.apply(backward_vertex_f); + round = round - 1; + end + delete frontier; + + vertices.apply(final_vertex_f); + var elapsed_time : float = stopTimer(); + print elapsed_time; + vertices.apply(reset); + end + + + +end + + +schedule: + + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2; + s2.configLoadBalance(TWCE); + s2.configDirection(PULL, BITMAP); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + + program->applyGPUSchedule("s1", h1); + program->applyGPUSchedule("s2", h1); + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt new file mode 100644 index 00000000..af81d4d3 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_road.gt @@ -0,0 +1,55 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = 
load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + + program->applyGPUSchedule("s0:s1", s1); + + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt new file mode 100644 index 00000000..7634b0ff --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/bfs_social.gt @@ -0,0 +1,59 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const parent : vector{Vertex}(int) = -1; + + +func updateEdge(src : Vertex, dst : Vertex) + parent[dst] = src; +end + +func toFilter(v : Vertex) -> output : bool + output = parent[v] == -1; +end + +func reset(v: Vertex) + parent[v] = -1; +end + +func main() + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(0); + startTimer(); + vertices.apply(reset); + var start_vertex : int = atoi(argv[2]); + frontier.addVertex(start_vertex); + parent[start_vertex] = start_vertex; + + #s0# while (frontier.getVertexSetSize() != 0) + #s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true); + delete frontier; + frontier = output; + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + + s1.configDeduplication(DISABLED); + s1.configLoadBalance(TWCE); + s1.configDirection(PUSH); + s1.configFrontierCreation(FUSED); + + SimpleGPUSchedule s2 = s1; + s2.configLoadBalance(VERTEX_BASED); + s2.configDirection(PULL, BITMAP); + s2.configDeduplication(DISABLED); + s2.configFrontierCreation(UNFUSED_BITMAP); + + HybridGPUSchedule h1 (INPUT_VERTEXSET_SIZE, "argv[3]", s1, s2); + program->applyGPUSchedule("s0:s1", h1); + + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt new file mode 100644 index 00000000..65af4db0 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/cc.gt @@ -0,0 +1,63 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); + +const vertices : vertexset{Vertex} = edges.getVertices(); +const IDs : vector{Vertex}(int) = 1; + +const update: vector[1](int); + +func updateEdge(src : Vertex, dst : Vertex) + var src_id: 
Vertex = IDs[src]; + var dst_id: Vertex = IDs[dst]; + + IDs[dst_id] min= IDs[src_id]; + IDs[src_id] min= IDs[dst_id]; +end + +func init(v : Vertex) + IDs[v] = v; +end + +func pjump(v: Vertex) + var y: Vertex = IDs[v]; + var x: Vertex = IDs[y]; + if x != y + IDs[v] = x; + update[0] = 1; + end +end + +func main() + var n : int = edges.getVertices(); + for trail in 0:10 + var frontier : vertexset{Vertex} = new vertexset{Vertex}(n); + startTimer(); + vertices.apply(init); + while (frontier.getVertexSetSize() != 0) + #s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs); + delete frontier; + frontier = output; + update[0] = 1; + #s0# while update[0] != 0 + update[0] = 0; + vertices.apply(pjump); + end + end + var elapsed_time : float = stopTimer(); + delete frontier; + print elapsed_time; + end +end + + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + s1.configDeduplication(ENABLED); + s1.configFrontierCreation(UNFUSED_BITMAP); + program->applyGPUSchedule("s1", s1); + + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt new file mode 100644 index 00000000..7e048331 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_road.gt @@ -0,0 +1,49 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int = atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(CM); + s1.configFrontierCreation(FUSED); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); + + SimpleGPUSchedule s0; + s0.configKernelFusion(ENABLED); + program->applyGPUSchedule("s0", s0); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt new file mode 100644 index 00000000..c9947024 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/ds_social.gt @@ -0,0 +1,45 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex, int) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const dist : vector{Vertex}(int) = 2147483647; %should be INT_MAX +const pq: priority_queue{Vertex}(int); + +func updateEdge(src : Vertex, dst : Vertex, weight : int) + var new_dist : int = dist[src] + weight; + pq.updatePriorityMin(dst, dist[dst], new_dist); +end + +func printDist(v : Vertex) + print dist[v]; +end + +func reset(v: Vertex) + dist[v] = 2147483647; +end + +func main() + for trail in 0:10 + var start_vertex : int 
= atoi(argv[2]); + pq = new priority_queue{Vertex}(int)(false, false, dist, 1, 2, false, start_vertex); + startTimer(); + vertices.apply(reset); + dist[start_vertex] = 0; + #s0# while (pq.finished() == false) + var frontier : vertexset{Vertex} = pq.dequeue_ready_set(); % dequeue lowest priority nodes + #s1# edges.from(frontier).applyUpdatePriority(updateEdge); + delete frontier; + end + var elapsed_time : float = stopTimer(); + print elapsed_time; + delete pq; + end +end + + +schedule: + SimpleGPUSchedule s1; + s1.configLoadBalance(TWCE); + s1.configFrontierCreation(UNFUSED_BOOLMAP); + s1.configDelta("argv[3]"); + program->applyGPUSchedule("s0:s1", s1); diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu b/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu new file mode 100644 index 00000000..bdec4266 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/obtain_gpu_cc.cu @@ -0,0 +1,31 @@ +#include +#include +#include + +int main(int argc, char *argv[]) { + cudaDeviceProp prop; + cudaError_t status; + int device_count; + int device_index = 0; + if (argc > 1) { + device_index = atoi(argv[1]); + } + + status = cudaGetDeviceCount(&device_count); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status)); + return -1; + } + if (device_index >= device_count) { + fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", device_index, device_count); + return -1; + } + status = cudaGetDeviceProperties(&prop, device_index); + if (status != cudaSuccess) { + fprintf(stderr,"cudaGetDeviceProperties() for device device_index failed: %s\n", cudaGetErrorString(status)); + return -1; + } + int v = prop.major * 10 + prop.minor; + printf("%d\n", v); + printf("%d\n", prop.multiProcessorCount); +} diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt new file mode 100644 index 00000000..0e16007f --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/pr.gt @@ -0,0 +1,60 @@ +element Vertex end +element Edge end +const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]); +const vertices : vertexset{Vertex} = edges.getVertices(); +const old_rank : vector{Vertex}(float) = 1.0/vertices.size(); +const new_rank : vector{Vertex}(float) = 0.0; +const out_degree : vector {Vertex}(int) = edges.getOutDegrees(); +const contrib : vector{Vertex}(float) = 0.0; +const error : vector{Vertex}(float) = 0.0; +const damp : float = 0.85; +const beta_score : float = (1.0 - damp) / vertices.size(); + +func computeContrib(v : Vertex) + contrib[v] = old_rank[v] / out_degree[v]; +end + +func updateEdge(src : Vertex, dst : Vertex) + new_rank[dst] += contrib[src]; +end + +func updateVertex(v : Vertex) + var old_score : float = old_rank[v]; + new_rank[v] = beta_score + damp*(new_rank[v]); + error[v] = fabs(new_rank[v] - old_rank[v]); + old_rank[v] = new_rank[v]; + new_rank[v] = 0.0; + +end + +func printRank(v : Vertex) + print old_rank[v]; +end + +func reset(v: Vertex) + old_rank[v] = 1.0/vertices.size(); + new_rank[v] = 0.0; +end + +func main() + for trail in 0:10 + startTimer(); + vertices.apply(reset); + for round in 0:20 + vertices.apply(computeContrib); + #s1# edges.apply(updateEdge); + vertices.apply(updateVertex); + end + var elapsed_time : float = stopTimer(); + print elapsed_time/20.0; + end +end + +% specify schedules here or use a separate schedule file +schedule: + SimpleGPUSchedule s1; + s1.configDirection(PULL); + 
s1.configLoadBalance(EDGE_ONLY, BLOCKED, 0x42000); + + program->applyGPUSchedule("s1", s1); + diff --git a/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt b/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt new file mode 100644 index 00000000..bc50ce27 --- /dev/null +++ b/graphit_eval/g2_cgo2021_eval/table7_inputs/simple_graph_load.gt @@ -0,0 +1,13 @@ +element Vertex end +element Edge end + +const edges : edgeset{Edge}(Vertex, Vertex, int) = load (argv[1]); + +func main() + #s1# print edges.getVertices(); +end + + +schedule: + SimpleGPUSchedule s1; + program->applyGPUSchedule("s1", s1); diff --git a/include/graphit/midend/atomics_op_lower.h b/include/graphit/midend/atomics_op_lower.h index 50f18d61..acc9e56f 100644 --- a/include/graphit/midend/atomics_op_lower.h +++ b/include/graphit/midend/atomics_op_lower.h @@ -17,7 +17,7 @@ namespace graphit { class AtomicsOpLower { public: - AtomicsOpLower(MIRContext *mir_context) : mir_context_(mir_context) {}; + AtomicsOpLower(MIRContext *mir_context, Schedule * s) : mir_context_(mir_context), schedule_(s){}; struct ApplyExprVisitor : public mir::MIRVisitor { ApplyExprVisitor(MIRContext *mir_context) : @@ -47,7 +47,7 @@ namespace graphit { }; struct ReduceStmtLower : public mir::MIRVisitor { - ReduceStmtLower(MIRContext* mir_context) : mir_context_(mir_context){ + ReduceStmtLower(MIRContext* mir_context, Schedule* s) : mir_context_(mir_context), schedule_(s){ } @@ -55,6 +55,7 @@ namespace graphit { private: MIRContext *mir_context_ = nullptr; + Schedule *schedule_ = nullptr; }; @@ -63,6 +64,7 @@ namespace graphit { private: MIRContext *mir_context_ = nullptr; + Schedule *schedule_ = nullptr; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index d23b4c19..dabf8dc7 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -843,7 +843,11 @@ void CodeGenGPU::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, mir::Expr: } } - std::string to_func = esae->to_func->function_name->name; + std::string to_func ; + if (esae->to_func) + to_func = esae->to_func->function_name->name; + else + to_func = ""; if (to_func != "") { printIndent(); oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_host<" << to_func << ">("; @@ -995,7 +999,12 @@ void CodeGenGPUFusedKernel::genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr esae, oss << var_name(esae->from_func->function_name->name); oss << ");" << std::endl; } - std::string to_func = esae->to_func->function_name->name; + std::string to_func; + if (esae->to_func) + to_func = esae->to_func->function_name->name; + else + to_func = ""; + if (to_func != "") { printIndent(); oss << "gpu_runtime::vertex_set_create_reverse_sparse_queue_device<" << to_func << ">("; diff --git a/src/midend/apply_expr_lower.cpp b/src/midend/apply_expr_lower.cpp index ccba6e25..a7e27e67 100644 --- a/src/midend/apply_expr_lower.cpp +++ b/src/midend/apply_expr_lower.cpp @@ -108,7 +108,13 @@ namespace graphit { mir::AssignStmt::Ptr stmt2 = std::make_shared(); stmt2->lhs = assign_stmt->lhs; stmt2->expr = assign_stmt->expr; - mir::to(stmt2->expr)->input_function->function_name->name = func_decl_v2->name; + + mir::FuncExpr::Ptr new_func_expr = std::make_shared(); + new_func_expr->function_name = std::make_shared(); + new_func_expr->function_name->name = func_decl_v2->name; + + + mir::to(stmt2->expr)->input_function= new_func_expr; stmt2->stmt_label = "hybrid2"; stmt_block_2->insertStmtEnd(stmt2); 
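Both hunks in this apply_expr_lower.cpp change (the one just above and the analogous one that follows) stop renaming the apply expression's existing FuncExpr in place and instead attach a freshly built node carrying the v2 function name. A plausible reading is that the old FuncExpr and its identifier can be shared with the statement the hybrid lowering cloned, so an in-place rename would leak into the other variant. The sketch below is a minimal, self-contained illustration of that aliasing hazard; the structs are hypothetical stand-ins, not the real MIR classes.

```cpp
#include <iostream>
#include <memory>
#include <string>

// Hypothetical stand-ins for the MIR node types; only the aliasing behavior matters.
struct Identifier { std::string name; };
struct FuncExpr   { std::shared_ptr<Identifier> function_name; };

int main() {
    // One FuncExpr node referenced from two places, e.g. the original apply
    // expression and the statement cloned from it for the second hybrid variant.
    auto shared_func = std::make_shared<FuncExpr>();
    shared_func->function_name = std::make_shared<Identifier>();
    shared_func->function_name->name = "apply_func_v1";

    auto variant1_func = shared_func;
    auto variant2_func = shared_func;

    // An in-place rename (the old code path) would silently change variant 1 as well:
    //   variant2_func->function_name->name = "apply_func_v2";

    // Building a new node, as the patch now does, keeps the two variants independent.
    auto fresh = std::make_shared<FuncExpr>();
    fresh->function_name = std::make_shared<Identifier>();
    fresh->function_name->name = "apply_func_v2";
    variant2_func = fresh;

    std::cout << variant1_func->function_name->name << "\n";  // apply_func_v1
    std::cout << variant2_func->function_name->name << "\n";  // apply_func_v2
    return 0;
}
```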
fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); @@ -212,7 +218,12 @@ namespace graphit { mir::StmtBlock::Ptr stmt_block_2 = std::make_shared(); mir::ExprStmt::Ptr stmt2 = std::make_shared(); stmt2->expr = expr_stmt->expr; - mir::to(stmt2->expr)->input_function->function_name->name = func_decl_v2->name; + + mir::FuncExpr::Ptr new_func_expr = std::make_shared(); + new_func_expr->function_name = std::make_shared(); + new_func_expr->function_name->name = func_decl_v2->name; + + mir::to(stmt2->expr)->input_function = new_func_expr; stmt2->stmt_label = "hybrid2"; stmt_block_2->insertStmtEnd(stmt2); fir::gpu_schedule::SimpleGPUSchedule * schedule2 = new fir::gpu_schedule::SimpleGPUSchedule(); diff --git a/src/midend/atomics_op_lower.cpp b/src/midend/atomics_op_lower.cpp index 01277db9..5924d23a 100644 --- a/src/midend/atomics_op_lower.cpp +++ b/src/midend/atomics_op_lower.cpp @@ -23,7 +23,7 @@ void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::UpdatePriori void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::HybridDenseEdgeSetApplyExpr::Ptr apply_expr) { if (apply_expr->is_parallel){ - ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_); + ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_, schedule_); auto pull_func_name = apply_expr->input_function->function_name->name; mir::FuncDecl::Ptr pull_func_decl = mir_context_->getFunction(pull_func_name); auto push_func_name = apply_expr->push_function_->function_name->name; @@ -40,7 +40,7 @@ void graphit::AtomicsOpLower::ApplyExprVisitor::visit(graphit::mir::HybridDenseE void graphit::AtomicsOpLower::ApplyExprVisitor::singleFunctionEdgeSetApplyExprAtomicsLower(graphit::mir::EdgeSetApplyExpr::Ptr apply_expr){ if (apply_expr->is_parallel){ - ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_); + ReduceStmtLower reduce_stmt_lower = ReduceStmtLower(mir_context_, schedule_); auto apply_func_decl_name = apply_expr->input_function->function_name->name; mir::FuncDecl::Ptr apply_func_decl = mir_context_->getFunction(apply_func_decl_name); apply_func_decl->accept(&reduce_stmt_lower); @@ -289,13 +289,16 @@ void graphit::AtomicsOpLower::ReduceStmtLower::visit(graphit::mir::ReduceStmt::P break; default: std::cout << "not supported for atomics" << std::endl; - exit(0); + assert(false); } } } //If it is local vector, we still need to add atomic - else if(mir::isa(local_vector_field_type)) { + // This is definitely a bug. Not all local vectors require an atomic access. + // The mechanism that checks whether the access is atomic also seems to be broken.
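For orientation, the guard added in the hunk that continues below keeps the local-vector atomic lowering only on the CPU path: as soon as any GPU schedule is attached to the program, the branch is skipped and synchronization is left to the GPU backend. A rough, self-contained restatement of that condition follows; the Schedule struct here is a stand-in for the real class in include/graphit/frontend/schedule.h.

```cpp
#include <string>
#include <vector>

// Stand-in for the real Schedule type; only the shape of the check matters.
struct Schedule {
    std::vector<std::string> apply_gpu_schedules;
};

// The local-vector atomic lowering only runs when no GPU schedule is attached.
static bool should_lower_local_vector_atomics(const Schedule *schedule,
                                              bool is_local_vector_type) {
    const bool has_gpu_schedule =
        schedule != nullptr && !schedule->apply_gpu_schedules.empty();
    return !has_gpu_schedule && is_local_vector_type;
}
```

In other words, the over-approximation described in the comment above is bypassed entirely for GPU-scheduled programs rather than fixed outright.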
+ // This will just add atomics everywhere + else if(!(schedule_ != nullptr && !schedule_->apply_gpu_schedules.empty()) && mir::isa(local_vector_field_type)) { mir::VectorType::Ptr vector_type = mir::to(local_vector_field_type); mir::Type::Ptr local_field_type = vector_type->vector_element_type; @@ -316,7 +319,7 @@ void graphit::AtomicsOpLower::ReduceStmtLower::visit(graphit::mir::ReduceStmt::P break; default: std::cout << "not supported for atomics" << std::endl; - exit(0); + assert(false); } } diff --git a/src/midend/mir_lower.cpp b/src/midend/mir_lower.cpp index c3e9ad45..f7a85888 100644 --- a/src/midend/mir_lower.cpp +++ b/src/midend/mir_lower.cpp @@ -84,7 +84,7 @@ namespace graphit { // This pass inserts atomic operations, including CAS, writeMin, writeAdd // This pass does not need the schedule - AtomicsOpLower(mir_context).lower(); + AtomicsOpLower(mir_context, schedule).lower(); // This pass generates code for tracking if a field has been modified // during the execution of the edgeset apply functions. From 7acdceff49752d1e4c28157519cfbb0ea80f1cec Mon Sep 17 00:00:00 2001 From: Ajay Brahmakshatriya Date: Thu, 18 Nov 2021 01:21:04 -0500 Subject: [PATCH 88/88] Fixed minor Codegen bug for AE --- include/graphit/backend/codegen_gpu/codegen_gpu.h | 4 ++-- src/backend/codegen_gpu/codegen_gpu.cpp | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/graphit/backend/codegen_gpu/codegen_gpu.h b/include/graphit/backend/codegen_gpu/codegen_gpu.h index cc30bca3..a5c97125 100644 --- a/include/graphit/backend/codegen_gpu/codegen_gpu.h +++ b/include/graphit/backend/codegen_gpu/codegen_gpu.h @@ -149,6 +149,8 @@ class CodeGenGPU: public mir::MIRVisitor{ virtual void visit(mir::ListAllocExpr::Ptr) override; void genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr); + void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); + void generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare); }; class CodeGenGPUHost: public CodeGenGPU { @@ -167,8 +169,6 @@ class CodeGenGPUHost: public CodeGenGPU { virtual void visit(mir::VarExpr::Ptr) override; - void generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare); - void generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare); }; diff --git a/src/backend/codegen_gpu/codegen_gpu.cpp b/src/backend/codegen_gpu/codegen_gpu.cpp index d7ea387b..d591814d 100644 --- a/src/backend/codegen_gpu/codegen_gpu.cpp +++ b/src/backend/codegen_gpu/codegen_gpu.cpp @@ -1467,12 +1467,22 @@ void CodeGenGPU::visit(mir::WhileStmt::Ptr while_stmt) { } return; } + + ExtractReadWriteSet extractor(mir_context_); + while_stmt->cond->accept(&extractor); + printIndent(); oss << "while ("; while_stmt->cond->accept(this); oss << ") {" << std::endl; indent(); + for (auto tare: extractor.write_set) { + generateHostToDeviceCopy(tare); + } while_stmt->body->accept(this); + for (auto tare: extractor.read_set) { + generateDeviceToHostCopy(tare); + } dedent(); printIndent(); oss << "}" << std::endl; @@ -1690,7 +1700,7 @@ void CodeGenGPU::visit(mir::VertexSetAllocExpr::Ptr vsae) { vsae->size_expr->accept(this); oss << ")"; } -void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { +void CodeGenGPU::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare) { printIndent(); mir::Var target = mir::to(tare->target)->var; std::string var_name = target.getName(); @@ -1703,7 +1713,7 @@ void CodeGenGPUHost::generateDeviceToHostCopy(mir::TensorArrayReadExpr::Ptr tare oss << "), cudaMemcpyDeviceToHost);" << 
std::endl; } -void CodeGenGPUHost::generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare) { +void CodeGenGPU::generateHostToDeviceCopy(mir::TensorArrayReadExpr::Ptr tare) { printIndent(); mir::Var target = mir::to(tare->target)->var; std::string var_name = target.getName();
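To make the effect of this final patch concrete: generateDeviceToHostCopy and generateHostToDeviceCopy move from CodeGenGPUHost into the shared CodeGenGPU visitor, and the while-statement visitor now runs the loop condition through ExtractReadWriteSet so that tensor elements the condition touches are synchronized on every iteration. The sketch below shows roughly the shape of the host code this produces for a loop like `#s0# while update[0] != 0` in cc.gt; all identifiers and the allocation boilerplate are illustrative, not the literal generated names.

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Host mirror and device copy of the one-element "update" vector from cc.gt.
// The real generated code allocates and names these through the GPU runtime
// library; plain globals are used here to keep the sketch short.
int32_t update_host[1];
int32_t *update_dev;

int main() {
    cudaMalloc((void **)&update_dev, sizeof(int32_t));
    update_host[0] = 1;
    cudaMemcpy(update_dev, update_host, sizeof(int32_t), cudaMemcpyHostToDevice);

    // Shape of what CodeGenGPU::visit(mir::WhileStmt::Ptr) emits after this patch:
    while (update_host[0] != 0) {
        // (a) elements the condition writes would be copied host -> device here
        // (b) the generated kernels for the loop body would be launched here;
        //     cudaMemset stands in for a body that ends up clearing the flag
        cudaMemset(update_dev, 0, sizeof(int32_t));
        // (c) elements the condition reads are copied device -> host so the next
        //     evaluation of the condition sees what the body produced
        cudaMemcpy(&update_host[0], &update_dev[0], sizeof(update_host[0]),
                   cudaMemcpyDeviceToHost);
    }
    printf("done\n");
    cudaFree(update_dev);
    return 0;
}
```

For this particular condition only the device-to-host copy is emitted, since `update[0] != 0` reads but never writes; a condition that also wrote an element would additionally get a host-to-device copy at the top of the loop body.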