diff --git a/gibbon-compiler/src/Gibbon/Compiler.hs b/gibbon-compiler/src/Gibbon/Compiler.hs index 3014d1d38..ee5f200f8 100644 --- a/gibbon-compiler/src/Gibbon/Compiler.hs +++ b/gibbon-compiler/src/Gibbon/Compiler.hs @@ -377,6 +377,7 @@ compileRTS Config{verbosity,optc,dynflags} = do ++ (if pointer then " POINTER=1 " else "") ++ (if parallel then " PARALLEL=1 " else "") ++ (if bumpAlloc then " BUMPALLOC=1 " else "") + ++ (if papi then " PAPI=1 " else "") ++ (" USER_CFLAGS=\"" ++ optc ++ "\"") ++ (" VERBOSITY=" ++ show verbosity) execCmd @@ -392,6 +393,7 @@ compileRTS Config{verbosity,optc,dynflags} = do rts_debug = gopt Opt_RtsDebug dynflags print_gc_stats = gopt Opt_PrintGcStats dynflags genGC = gopt Opt_GenGc dynflags + papi = gopt Opt_PapiInstrumentation dynflags -- | Compile and run the generated code if appropriate @@ -426,6 +428,10 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp links = if pointer then " -lgc -lm " else " -lm " + papi = gopt Opt_PapiInstrumentation (dynflags cfg) + links' = if papi + then links ++ "-l:libpapi.a " + else links compile_program = do compileRTS cfg lib_dir <- getRTSBuildDir @@ -436,7 +442,7 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp ++" -L" ++ lib_dir ++ " -Wl,-rpath=" ++ lib_dir ++ " " ++ outfile ++ " " ++ rts_o_path - ++ links ++ " -lgibbon_rts_ng" + ++ links' ++ " -lgibbon_rts_ng" execCmd Nothing @@ -524,6 +530,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 " ++ (if not genGC then " -D_GIBBON_GENGC=0 " else " -D_GIBBON_GENGC=1 ") ++ (if simpleWriteBarrier then " -D_GIBBON_SIMPLE_WRITE_BARRIER=1 " else " -D_GIBBON_SIMPLE_WRITE_BARRIER=0 ") ++ (if lazyPromote then " -D_GIBBON_EAGER_PROMOTION=0 " else " -D_GIBBON_EAGER_PROMOTION=1 ") + ++ (if papi then " -D_GIBBON_ENABLE_PAPI " else "") where dflags = dynflags config bumpAlloc = gopt Opt_BumpAlloc dflags pointer = gopt Opt_Pointer dflags @@ -534,6 +541,7 @@ compilationCmd C config = (cc 
config) ++" -std=gnu11 " genGC = gopt Opt_GenGc dflags simpleWriteBarrier = gopt Opt_SimpleWriteBarrier dflags lazyPromote = gopt Opt_NoEagerPromote dflags + papi = gopt Opt_PapiInstrumentation dflags -- | isBench :: Mode -> Bool diff --git a/gibbon-compiler/src/Gibbon/DynFlags.hs b/gibbon-compiler/src/Gibbon/DynFlags.hs index f887db342..5bfc2c5e8 100644 --- a/gibbon-compiler/src/Gibbon/DynFlags.hs +++ b/gibbon-compiler/src/Gibbon/DynFlags.hs @@ -14,34 +14,36 @@ import Data.Set as S import Options.Applicative data GeneralFlag - = Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions - | Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions - | Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections - | Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies - | Opt_InfiniteRegions -- ^ Use infinite regions - | Opt_BigInfiniteRegions -- ^ Use big infinite regions - | Opt_BenchPrint -- ^ Should the benchamrked function have its output printed? - | Opt_Packed -- ^ Use packed representation - | Opt_Pointer -- ^ Use pointer representation - | Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend - | Opt_Warnc -- ^ Show warnings from the C compiler - | Opt_DisableGC -- ^ Don't run the the garbage collector (used by Codegen). - | Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization) - | Opt_Fusion -- ^ Enable fusion. - | Opt_Parallel -- ^ Fork/join parallelism. - | Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal. - | Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon. - | Opt_RelativeOffsets -- ^ Enable relative offsets. - | Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism. - | Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated. - | Opt_RtsDebug -- ^ Compile the RTS in debugging mode. - | Opt_PrintGcStats -- ^ Record and print GC statistics. 
- | Opt_GenGc -- ^ Use the new non-generational GC. - | Opt_NoEagerPromote -- ^ Disable eager promotion. - | Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization. - | Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally - | Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally - | Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types. + = Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions + | Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions + | Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections + | Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies + | Opt_InfiniteRegions -- ^ Use infinite regions + | Opt_BigInfiniteRegions -- ^ Use big infinite regions + | Opt_BenchPrint -- ^ Should the benchmarked function have its output printed? + | Opt_Packed -- ^ Use packed representation + | Opt_Pointer -- ^ Use pointer representation + | Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend + | Opt_Warnc -- ^ Show warnings from the C compiler + | Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen). + | Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization) + | Opt_Fusion -- ^ Enable fusion. + | Opt_Parallel -- ^ Fork/join parallelism. + | Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal. + | Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon. + | Opt_RelativeOffsets -- ^ Enable relative offsets. + | Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism. + | Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated. + | Opt_RtsDebug -- ^ Compile the RTS in debugging mode. + | Opt_PrintGcStats -- ^ Record and print GC statistics. + | Opt_GenGc -- ^ Use the new non-generational GC. + | Opt_NoEagerPromote -- ^ Disable eager promotion.
+ | Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization. + | Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally + | Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally + | Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types. + | Opt_PapiInstrumentation -- ^ Enable PAPI instrumentation while compiling the gibbon binary. + deriving (Show,Read,Eq,Ord) -- | Exactly like GHC's ddump flags. @@ -120,7 +122,8 @@ dynflagsParser = DynFlags <$> (S.fromList <$> many gflagsParser) <*> (S.fromList flag' Opt_SimpleWriteBarrier (long "simple-write-barrier" <> help "Disables eliminate-indirection-chains optimization.") <|> flag' Opt_Layout_Local (long "opt-layout-local" <> help "Optimizes the Layout of Algebraic data types locally") <|> flag' Opt_Layout_Global (long "opt-layout-global" <> help "Optimizes the Layout of Algebraic data types globally") <|> - flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic") + flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic") <|> + flag' Opt_PapiInstrumentation (long "enable-papi" <> help "Enable instrumentation using papi, extends the iterate timing function." 
) dflagsParser :: Parser DebugFlag diff --git a/gibbon-compiler/src/Gibbon/Passes/Codegen.hs b/gibbon-compiler/src/Gibbon/Passes/Codegen.hs index 98df71dc0..6e42d4b5f 100644 --- a/gibbon-compiler/src/Gibbon/Passes/Codegen.hs +++ b/gibbon-compiler/src/Gibbon/Passes/Codegen.hs @@ -326,6 +326,9 @@ codegenProg cfg prg@(Prog info_tbl sym_tbl funs mtal) = \#include \n\ \#include \n\ \#endif\n\n\ + \#ifdef _GIBBON_ENABLE_PAPI\n\ + \#include \n\ + \#endif\n\n\ \/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\ \ * Program starts here\n\ \ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\ @@ -629,6 +632,8 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps = selftimed <- gensym "selftimed" times <- gensym "times" tmp <- gensym "tmp" + papi_retval <- gensym "papi_retval" + papi_region <- gensym "papi_region" let ident = case bnds of ((v,_):_) -> v _ -> (toVar "") @@ -659,13 +664,36 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps = , C.BlockStm [cstm| printf("itertime: %lf\n", $id:itertime); |] , C.BlockStm [cstm| gib_vector_inplace_update($id:times, $id:iters, &($id:itertime)); |] ] - in [ C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body } |] - , C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |] - , C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |] - , C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |] - , C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |] - , C.BlockStm [cstm| gib_print_timing_array($id:times); |] - , C.BlockStm [cstm| gib_vector_free($id:times); |] + -- TODO: Find a better way to get a name for the region id. 
+ ifdef = "#ifdef _GIBBON_ENABLE_PAPI" + endif = "#endif" + body' = [ C.BlockStm [cstm| $escstm:ifdef |] + , C.BlockStm [cstm| sprintf($id:papi_region, "%d", get_papi_region_id());|] + , C.BlockDecl [cdecl| int $id:papi_retval = PAPI_hl_region_begin($id:papi_region);|] + , C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) { + exit(1); + } |] + , C.BlockStm [cstm| $escstm:endif |] + ] ++ + body ++ + [ C.BlockStm [cstm| $escstm:ifdef |] + , C.BlockStm [cstm| $id:papi_retval = PAPI_hl_region_end($id:papi_region);|] + , C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) { + exit(1); + } |] + , C.BlockStm [cstm| increment_papi_region_id(); |] + , C.BlockStm [cstm| $escstm:endif |] + ] + in [ C.BlockStm [cstm| $escstm:ifdef |] + , C.BlockDecl [cdecl| char $id:papi_region[128];|] + , C.BlockStm [cstm| $escstm:endif |] + , C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body' } |] + , C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |] + , C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |] + , C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |] + , C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |] + , C.BlockStm [cstm| gib_print_timing_array($id:times); |] + , C.BlockStm [cstm| gib_vector_free($id:times); |] ]) -- else diff --git a/gibbon-rts/Makefile b/gibbon-rts/Makefile index a26b07160..83b1a7cb4 100644 --- a/gibbon-rts/Makefile +++ b/gibbon-rts/Makefile @@ -13,6 +13,7 @@ # POINTER # PARALLEL # BUMPALLOC +# PAPI # # # GC toggles: @@ -69,6 +70,10 @@ ifeq ($(POINTER), 1) CFLAGS += -D_GIBBON_POINTER endif +ifeq ($(PAPI), 1) + CFLAGS += -D_GIBBON_ENABLE_PAPI +endif + ifeq ($(PARALLEL), 1) CFLAGS += -fcilkplus -D_GIBBON_PARALLEL endif @@ -111,7 +116,6 @@ RUST_RTS_SO := libgibbon_rts_ng.so RUST_RTS_PATH := $(RUST_RTS_DIR)/target/$(MODE)/$(RUST_RTS_SO) RUST_SOURCES := $(shell find $(RUST_RTS_DIR) -type 
f -name *.rs) - all: rts rts: c_rts rs_rts @@ -146,7 +150,7 @@ $(C_RTS_DIR)/%.o: $(C_RTS_DIR)/%.c $(BUILD_DIR)/%.h: $(C_RTS_DIR)/%.h mkdir -p $(BUILD_DIR) && \ - ln -s $^ $@ + ln -s -f $^ $@ $(BUILD_DIR): mkdir -p $(BUILD_DIR) diff --git a/gibbon-rts/rts-c/gibbon_rts.c b/gibbon-rts/rts-c/gibbon_rts.c index 4afed9190..01cc1abc7 100644 --- a/gibbon-rts/rts-c/gibbon_rts.c +++ b/gibbon-rts/rts-c/gibbon_rts.c @@ -35,8 +35,9 @@ #include #endif - - +#ifdef _GIBBON_ENABLE_PAPI +#include +#endif /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * Globals and their accessors @@ -61,6 +62,8 @@ static int64_t gib_global_region_count = 0; // Invariant: should always be equal to max(sym_table_keys). static GibSym gib_global_gensym_counter = 0; +//PAPI: specify the region to instrument +static uint64_t papi_region_id = 0; size_t gib_get_biginf_init_chunk_size(void) @@ -128,6 +131,16 @@ GibSym gib_read_gensym_counter(void) return gib_global_gensym_counter; } +uint64_t get_papi_region_id(void) +{ + return papi_region_id; +} + +void increment_papi_region_id(void) +{ + papi_region_id++; +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * Allocators diff --git a/gibbon-rts/rts-c/gibbon_rts.h b/gibbon-rts/rts-c/gibbon_rts.h index 4c9565c56..cd98b0fcb 100644 --- a/gibbon-rts/rts-c/gibbon_rts.h +++ b/gibbon-rts/rts-c/gibbon_rts.h @@ -14,18 +14,19 @@ * CPP macros used in the RTS: * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * - * _GIBBON_VERBOSITY=int verbosity level for debug output - * _GIBBON_DEBUG enables various assertions if present - * _GIBBON_GCSTATS collect GC statistics if present - * _GIBBON_PRINT_GCSTATS print GC statistics if present - * _GIBBON_GENGC only use old reference counted GC set to 0 - * _GIBBON_BOUNDSCHECK boundscheck vector accesses - * _GIBBON_BUMPALLOC_LISTS bump allocated linked lists - * _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc - * _GIBBON_POINTER pointer mode gib_alloc - * _GIBBON_PARALLEL parallel 
mode - * _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0 - * _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization + * _GIBBON_VERBOSITY=int verbosity level for debug output + * _GIBBON_DEBUG enables various assertions if present + * _GIBBON_GCSTATS collect GC statistics if present + * _GIBBON_PRINT_GCSTATS print GC statistics if present + * _GIBBON_GENGC only use old reference counted GC if set to 0 + * _GIBBON_BOUNDSCHECK boundscheck vector accesses + * _GIBBON_BUMPALLOC_LISTS bump allocated linked lists + * _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc + * _GIBBON_POINTER pointer mode gib_alloc + * _GIBBON_PARALLEL parallel mode + * _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0 + * _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization + * _GIBBON_ENABLE_PAPI enable instrumentation via PAPI + * */ @@ -116,6 +117,8 @@ char *gib_read_bench_prog_param(void); char *gib_read_benchfile_param(void); char *gib_read_arrayfile_param(void); uint64_t gib_read_arrayfile_length_param(void); +uint64_t get_papi_region_id(void); +void increment_papi_region_id(void); // Number of regions allocated. int64_t gib_read_region_count(void);