Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Papi high level stats within the iterate construct #229

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion gibbon-compiler/src/Gibbon/Compiler.hs
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
++ (if pointer then " POINTER=1 " else "")
++ (if parallel then " PARALLEL=1 " else "")
++ (if bumpAlloc then " BUMPALLOC=1 " else "")
++ (if papi then " PAPI=1 " else "")
++ (" USER_CFLAGS=\"" ++ optc ++ "\"")
++ (" VERBOSITY=" ++ show verbosity)
execCmd
Expand All @@ -392,6 +393,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
rts_debug = gopt Opt_RtsDebug dynflags
print_gc_stats = gopt Opt_PrintGcStats dynflags
genGC = gopt Opt_GenGc dynflags
papi = gopt Opt_PapiInstrumentation dynflags


-- | Compile and run the generated code if appropriate
Expand Down Expand Up @@ -426,6 +428,10 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
links = if pointer
then " -lgc -lm "
else " -lm "
papi = gopt Opt_PapiInstrumentation (dynflags cfg)
links' = if papi
then links ++ "-l:libpapi.a "
else links
compile_program = do
compileRTS cfg
lib_dir <- getRTSBuildDir
Expand All @@ -436,7 +442,7 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
++" -L" ++ lib_dir
++ " -Wl,-rpath=" ++ lib_dir ++ " "
++ outfile ++ " " ++ rts_o_path
++ links ++ " -lgibbon_rts_ng"
++ links' ++ " -lgibbon_rts_ng"

execCmd
Nothing
Expand Down Expand Up @@ -524,6 +530,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
++ (if not genGC then " -D_GIBBON_GENGC=0 " else " -D_GIBBON_GENGC=1 ")
++ (if simpleWriteBarrier then " -D_GIBBON_SIMPLE_WRITE_BARRIER=1 " else " -D_GIBBON_SIMPLE_WRITE_BARRIER=0 ")
++ (if lazyPromote then " -D_GIBBON_EAGER_PROMOTION=0 " else " -D_GIBBON_EAGER_PROMOTION=1 ")
++ (if papi then " -D_GIBBON_ENABLE_PAPI " else "")
where dflags = dynflags config
bumpAlloc = gopt Opt_BumpAlloc dflags
pointer = gopt Opt_Pointer dflags
Expand All @@ -534,6 +541,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
genGC = gopt Opt_GenGc dflags
simpleWriteBarrier = gopt Opt_SimpleWriteBarrier dflags
lazyPromote = gopt Opt_NoEagerPromote dflags
papi = gopt Opt_PapiInstrumentation dflags

-- |
isBench :: Mode -> Bool
Expand Down
61 changes: 32 additions & 29 deletions gibbon-compiler/src/Gibbon/DynFlags.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,36 @@ import Data.Set as S
import Options.Applicative

-- | On/off compiler options. Each constructor is switched on by a
-- command-line flag (see the @flag'@ entries in the flags parser) and is
-- tested with @gopt@ against the active dynflags.
data GeneralFlag
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
| Opt_InfiniteRegions -- ^ Use infinite regions
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
| Opt_BenchPrint -- ^ Should the benchmarked function have its output printed?
| Opt_Packed -- ^ Use packed representation
| Opt_Pointer -- ^ Use pointer representation
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
| Opt_Warnc -- ^ Show warnings from the C compiler
| Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen).
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
| Opt_Fusion -- ^ Enable fusion.
| Opt_Parallel -- ^ Fork/join parallelism.
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
| Opt_RelativeOffsets -- ^ Enable relative offsets.
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
| Opt_PrintGcStats -- ^ Record and print GC statistics.
| Opt_GenGc -- ^ Use the new non-generational GC.
| Opt_NoEagerPromote -- ^ Disable eager promotion.
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
| Opt_InfiniteRegions -- ^ Use infinite regions
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
| Opt_BenchPrint -- ^ Should the benchmarked function have its output printed?
| Opt_Packed -- ^ Use packed representation
| Opt_Pointer -- ^ Use pointer representation
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
| Opt_Warnc -- ^ Show warnings from the C compiler
| Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen).
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
| Opt_Fusion -- ^ Enable fusion.
| Opt_Parallel -- ^ Fork/join parallelism.
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
| Opt_RelativeOffsets -- ^ Enable relative offsets.
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
| Opt_PrintGcStats -- ^ Record and print GC statistics.
| Opt_GenGc -- ^ Use the new non-generational GC.
| Opt_NoEagerPromote -- ^ Disable eager promotion.
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
| Opt_PapiInstrumentation -- ^ Enable PAPI instrumentation while compiling the gibbon binary.

deriving (Show,Read,Eq,Ord)

-- | Exactly like GHC's ddump flags.
Expand Down Expand Up @@ -120,7 +122,8 @@ dynflagsParser = DynFlags <$> (S.fromList <$> many gflagsParser) <*> (S.fromList
flag' Opt_SimpleWriteBarrier (long "simple-write-barrier" <> help "Disables eliminate-indirection-chains optimization.") <|>
flag' Opt_Layout_Local (long "opt-layout-local" <> help "Optimizes the Layout of Algebraic data types locally") <|>
flag' Opt_Layout_Global (long "opt-layout-global" <> help "Optimizes the Layout of Algebraic data types globally") <|>
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic")
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic") <|>
flag' Opt_PapiInstrumentation (long "enable-papi" <> help "Enable instrumentation using papi, extends the iterate timing function." )


dflagsParser :: Parser DebugFlag
Expand Down
42 changes: 35 additions & 7 deletions gibbon-compiler/src/Gibbon/Passes/Codegen.hs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ codegenProg cfg prg@(Prog info_tbl sym_tbl funs mtal) =
\#include <cilk/cilk.h>\n\
\#include <cilk/cilk_api.h>\n\
\#endif\n\n\
\#ifdef _GIBBON_ENABLE_PAPI\n\
\#include <papi.h>\n\
\#endif\n\n\
\/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
\ * Program starts here\n\
\ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
Expand Down Expand Up @@ -629,6 +632,8 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
selftimed <- gensym "selftimed"
times <- gensym "times"
tmp <- gensym "tmp"
papi_retval <- gensym "papi_retval"
papi_region <- gensym "papi_region"
let ident = case bnds of
((v,_):_) -> v
_ -> (toVar "")
Expand Down Expand Up @@ -659,13 +664,36 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
, C.BlockStm [cstm| printf("itertime: %lf\n", $id:itertime); |]
, C.BlockStm [cstm| gib_vector_inplace_update($id:times, $id:iters, &($id:itertime)); |]
]
in [ C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body } |]
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
, C.BlockStm [cstm| gib_vector_free($id:times); |]
-- TODO: Find a better way to get a name for the region id.
-- Raw preprocessor lines, spliced verbatim into the generated C so that the
-- PAPI calls are only compiled when _GIBBON_ENABLE_PAPI is defined.
ifdef = "#ifdef _GIBBON_ENABLE_PAPI"
endif = "#endif"
-- The timed loop body wrapped in a PAPI high-level region: name the region
-- after the current region id, begin the region, run the original body, end
-- the region, and finally bump the region id for the next iteration.
body' = [ C.BlockStm [cstm| $escstm:ifdef |]
        -- get_papi_region_id() returns uint64_t; "%d" would be a mismatched
        -- conversion, so cast explicitly and print with %llu.
        , C.BlockStm [cstm| sprintf($id:papi_region, "%llu", (unsigned long long) get_papi_region_id());|]
        , C.BlockDecl [cdecl| int $id:papi_retval = PAPI_hl_region_begin($id:papi_region);|]
        -- Report the PAPI status before bailing out instead of exiting silently.
        , C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
                              fprintf(stderr, "PAPI_hl_region_begin failed: %d\n", $id:papi_retval);
                              exit(1);
                            } |]
        , C.BlockStm [cstm| $escstm:endif |]
        ] ++
        body ++
        [ C.BlockStm [cstm| $escstm:ifdef |]
        , C.BlockStm [cstm| $id:papi_retval = PAPI_hl_region_end($id:papi_region);|]
        , C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
                              fprintf(stderr, "PAPI_hl_region_end failed: %d\n", $id:papi_retval);
                              exit(1);
                            } |]
        , C.BlockStm [cstm| increment_papi_region_id(); |]
        , C.BlockStm [cstm| $escstm:endif |]
        ]
in [ C.BlockStm [cstm| $escstm:ifdef |]
, C.BlockDecl [cdecl| char $id:papi_region[128];|]
, C.BlockStm [cstm| $escstm:endif |]
, C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body' } |]
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
, C.BlockStm [cstm| gib_vector_free($id:times); |]
])

-- else
Expand Down
8 changes: 6 additions & 2 deletions gibbon-rts/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# POINTER
# PARALLEL
# BUMPALLOC
# PAPI
#
#
# GC toggles:
Expand Down Expand Up @@ -69,6 +70,10 @@ ifeq ($(POINTER), 1)
CFLAGS += -D_GIBBON_POINTER
endif

ifeq ($(PAPI), 1)
CFLAGS += -D_GIBBON_ENABLE_PAPI
endif

ifeq ($(PARALLEL), 1)
CFLAGS += -fcilkplus -D_GIBBON_PARALLEL
endif
Expand Down Expand Up @@ -111,7 +116,6 @@ RUST_RTS_SO := libgibbon_rts_ng.so
RUST_RTS_PATH := $(RUST_RTS_DIR)/target/$(MODE)/$(RUST_RTS_SO)
RUST_SOURCES := $(shell find $(RUST_RTS_DIR) -type f -name *.rs)


all: rts

rts: c_rts rs_rts
Expand Down Expand Up @@ -146,7 +150,7 @@ $(C_RTS_DIR)/%.o: $(C_RTS_DIR)/%.c

$(BUILD_DIR)/%.h: $(C_RTS_DIR)/%.h
mkdir -p $(BUILD_DIR) && \
ln -s $^ $@
ln -s -f $^ $@

$(BUILD_DIR):
mkdir -p $(BUILD_DIR)
Expand Down
17 changes: 15 additions & 2 deletions gibbon-rts/rts-c/gibbon_rts.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
#include <cilk/cilk_api.h>
#endif



#ifdef _GIBBON_ENABLE_PAPI
#include <papi.h>
#endif

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Globals and their accessors
Expand All @@ -61,6 +62,8 @@ static int64_t gib_global_region_count = 0;
// Invariant: should always be equal to max(sym_table_keys).
static GibSym gib_global_gensym_counter = 0;

// PAPI: id of the region to instrument. Starts at 0; read through
// get_papi_region_id() and advanced by increment_papi_region_id().
static uint64_t papi_region_id = 0;


size_t gib_get_biginf_init_chunk_size(void)
Expand Down Expand Up @@ -128,6 +131,16 @@ GibSym gib_read_gensym_counter(void)
return gib_global_gensym_counter;
}

/* Return the current value of the PAPI region id counter. */
uint64_t get_papi_region_id(void)
{
    const uint64_t current_id = papi_region_id;
    return current_id;
}

/* Advance the PAPI region id counter by one. */
void increment_papi_region_id(void)
{
    papi_region_id += 1;
}


/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Allocators
Expand Down
27 changes: 15 additions & 12 deletions gibbon-rts/rts-c/gibbon_rts.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,19 @@
* CPP macros used in the RTS:
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* _GIBBON_VERBOSITY=int verbosity level for debug output
* _GIBBON_DEBUG enables various assertions if present
* _GIBBON_GCSTATS collect GC statistics if present
* _GIBBON_PRINT_GCSTATS print GC statistics if present
* _GIBBON_GENGC only use old reference counted GC if set to 0
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
* _GIBBON_POINTER pointer mode gib_alloc
* _GIBBON_PARALLEL parallel mode
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
* _GIBBON_VERBOSITY=int verbosity level for debug output
* _GIBBON_DEBUG enables various assertions if present
* _GIBBON_GCSTATS collect GC statistics if present
* _GIBBON_PRINT_GCSTATS print GC statistics if present
* _GIBBON_GENGC only use old reference counted GC if set to 0
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
* _GIBBON_POINTER pointer mode gib_alloc
* _GIBBON_PARALLEL parallel mode
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
* _GIBBON_ENABLE_PAPI enable instrumentation via papi
*
*/

Expand Down Expand Up @@ -116,6 +117,8 @@ char *gib_read_bench_prog_param(void);
char *gib_read_benchfile_param(void);
char *gib_read_arrayfile_param(void);
uint64_t gib_read_arrayfile_length_param(void);
// PAPI: read the current region id.
uint64_t get_papi_region_id(void);
// PAPI: advance the region id by one.
void increment_papi_region_id(void);

// Number of regions allocated.
int64_t gib_read_region_count(void);
Expand Down
Loading