diff --git a/share/CMakeLists.txt b/share/CMakeLists.txt index d572f33..3af2bf6 100644 --- a/share/CMakeLists.txt +++ b/share/CMakeLists.txt @@ -1,13 +1,15 @@ ### Fortran compiler flags -if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") - add_compile_definitions( - CPRGNU - NAMING=_ADD_UNDERSCORE) -elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") - add_compile_definitions( - CPRINTEL - NAMING=_ADD_UNDERSCORE) -endif() +#if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") +# add_compile_definitions( +# CPRGNU +# NAMING=_ADD_UNDERSCORE +# FORTRANUNDERSCORE) +#elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") +# add_compile_definitions( +# CPRINTEL +# NAMING=_ADD_UNDERSCORE +# FORTRANUNDERSCORE) +#endif() # The following files are generated with a script acting on templates # We include pre-generated files that are committed to the repository @@ -43,10 +45,23 @@ list(APPEND cesm_share_src_files CESM_share/src/water_isotopes/water_isotopes.F90 ) -# The following files are mostly stubs. +# GPTL timing library srcs +list(APPEND timing_src_files + timing/f_wrappers.c + timing/gptl.c + timing/GPTLget_memusage.c + timing/gptl.h + timing/gptl.inc + timing/gptl_papi.c + timing/GPTLprint_memusage.c + timing/GPTLutil.c + timing/perf_mod.F90 + timing/perf_utils.F90 +) + +# The following file is a stub. list(APPEND stubs_src_files - src/perf_mod.F90 - src/mct_mod.F90 + stubs/mct_mod.F90 ) if(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") @@ -56,12 +71,13 @@ endif() # Collect source files for library -add_library(share STATIC ${stubs_src_files} ${cesm_share_generated_files} ${cesm_share_src_files}) +add_library(share STATIC ${timing_src_files} ${stubs_src_files} ${cesm_share_generated_files} ${cesm_share_src_files}) +add_compile_definitions(NUOPC_INTERFACE HAVE_MPI) if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") - target_compile_definitions(share PRIVATE CPRGNU NAMING=_ADD_UNDERSCORE) + target_compile_definitions(share PRIVATE CPRGNU NAMING=_ADD_UNDERSCORE FORTRANUNDERSCORE) elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel") - target_compile_definitions(share PRIVATE CPRINTEL NAMING=_ADD_UNDERSCORE) + target_compile_definitions(share PRIVATE CPRINTEL NAMING=_ADD_UNDERSCORE FORTRANUNDERSCORE) endif() set_target_properties(share PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mod) diff --git a/share/src/perf_mod.F90 b/share/src/perf_mod.F90 deleted file mode 100644 index 431ec4c..0000000 --- a/share/src/perf_mod.F90 +++ /dev/null @@ -1,67 +0,0 @@ -module perf_mod - use shr_kind_mod - use ESMF - implicit none - -contains - - subroutine t_setLogUnit(LogUnit) - integer(SHR_KIND_IN), intent(IN) :: LogUnit ! Unit number for log output - - end subroutine t_setLogUnit - - subroutine t_prf(filename, mpicom, num_outpe, stride_outpe, single_file, global_stats, output_thispe) - character(len=*), intent(in), optional :: filename - integer, intent(in), optional :: mpicom - integer, intent(in), optional :: num_outpe - integer, intent(in), optional :: stride_outpe - logical, intent(in), optional :: single_file - logical, intent(in), optional :: global_stats - logical, intent(in), optional :: output_thispe - - end subroutine t_prf - - subroutine t_initf(NLFilename, LogPrint, LogUnit, mpicom, MasterTask, MaxThreads) - character(len=*), intent(IN) :: NLFilename - logical, optional, intent(IN) :: LogPrint - integer, optional, intent(IN) :: LogUnit - integer, optional, intent(IN) :: mpicom - logical, optional, intent(IN) :: MasterTask - integer, optional, intent(IN) :: MaxThreads - - end subroutine t_initf - - subroutine t_finalizef() - - end subroutine t_finalizef - - subroutine t_startf(event, handle) - character(len=*), intent(in) :: event - integer, optional :: handle - call ESMF_TraceRegionEnter(event) - end subroutine t_startf - - subroutine t_stopf(event, handle) - character(len=*), intent(in) :: event - integer, optional :: handle - call ESMF_TraceRegionExit(event) - end subroutine t_stopf - - subroutine t_barrierf(event, mpicom) - integer, intent(in), optional :: mpicom - character(len=*), intent(in), optional :: event - end subroutine t_barrierf - -end module perf_mod - -! The following functions are called from shr_mem_mod without an interface. We -! add some stubs here. -integer function GPTLget_memusage(size, rss, share, text, datastack) - integer :: size, rss, share, text, datastack - GPTLget_memusage = 0 -end function GPTLget_memusage - -integer function GPTLprint_memusage(str) - character(len=*) :: str - GPTLprint_memusage = 0 -end function GPTLprint_memusage diff --git a/share/src/mct_mod.F90 b/share/stubs/mct_mod.F90 similarity index 100% rename from share/src/mct_mod.F90 rename to share/stubs/mct_mod.F90 diff --git a/share/timing/COPYING b/share/timing/COPYING new file mode 100644 index 0000000..324ce86 --- /dev/null +++ b/share/timing/COPYING @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the “Software”), to deal +in the Software for any noncommercial purposes without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to permit +persons to whom the Software is furnished to do so, subject to the following +conditions: The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. Any +commercial use (including sale) of the software, and derivative development +towards commercial use, requires written permission of the copyright +holder. THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES +OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/share/timing/ChangeLog b/share/timing/ChangeLog new file mode 100644 index 0000000..619565f --- /dev/null +++ b/share/timing/ChangeLog @@ -0,0 +1,238 @@ +timing_180912: Moved prefix support from perf_mod.F90 to gptl.c + and also added support for setting prefixes in + threaded regions. + [Patrick Worley] +timing_180911: Moved detail to end of timer name when specify + profile_add_detail (so that will not interfere with + planned move of the prefix logic into gptl.c) + [Patrick Worley] +timing_180910: Removed addition of double quotes to timer names in + perf_mod.F90 and added this as an output option in + gptl.c (so internally the names do not have the quotes) + [Patrick Worley] +timing_180822: Fixed perf_mod.F90 bug that prevents PAPI derived events + from being recognized. + [Patrick Worley] +timing_180731: Refactored implementation of append/write modes; + collected and output "on" events for global statistics + [Patrick Worley] +timing_180730: Added support for setting GPTLmaxthreads. Cleaned up white space. + Added SEQUENTIAL to fortran open, to avoid problems on some systems. + Added timing overhead measurement to perf_mod. Fixed errors in + f_wrappers.c in definition of gptlpr_query_append and + gptlpr_XXX_write. + [Patrick Worley (some from Jim Rosinksi)] +timing_180403: Added GPTLstartstop_val(f) to gptl.h, to provide explicit + typing and eliminate compile-time warning for some compilers. + Also do not define the CPP tokens HAVE_COMM_F2C and + HAVE_GETTIMEOFDAY in private.h if they have already been + defined, also eliminating compile-time warnings. + [Patrick Worley] +timing_171028: Backported GPTLstartstop_val from a more recent version + of GPTL, added a callcount parameter, and renamed it + GPTLstartstop_vals. Also added a version for non-null + terminated timing event names (GPTLstartstop_valsf). + Added t_startstop_valsf to perf_mod. Also replaced + all strncpy calls in f_wrapper.c, just to be safe. + [Patrick Worley (but primarily Jim Rosinksi)] +timing_171027: Mitigated against nonmonotonic timing calls by + setting negative deltas to zero in gptl.c . + [Patrick Worley] +timing_160816: Added quotes to timing event names in t_startf and t_stopf + before calling GPTL routines. + [Patrick Worley and Sean Patrick Santos] +timing_161207: Replaced strncpy and snprintf where applied to + non-null-terminated strings, to avoid memory issues + when strncpy and snprintf implementations use strnlen + to check validity of passed in string length parameter. + (Why this causes problems is still a mystery.) + [Patrick Worley and Gautam Bisht] +timing_160320: Added routines t_set_prefixf and t_unset_prefixf. + Setting the prefix adds this to the beginning of all subsequent + timer event names (defined in t_startf/t_stopf). + Also doubling default hash index tablesize to 2048. + [Patrick Worley] +timing_160124: Added option to prefix timer names with detail level. This is + controlled by a new namelist variable (profile_add_detail). + The default is to not enable this option. It is meant to be + used for inspecting the profile detail definitions and + associated logic. + [Patrick Worley] +timing_150903: Changed API to be compatible with NCAR CIME version of timing + library: (a) changed handle argument from integer8 to integer + in t_startf and t_stopf, and disabled use of handles in these + routines until gptl.c can be updated; (b) added MaxThreads + optional argument to t_initf, though it does not do anything + yet. + [Patrick Worley] +timing_150518: Disabled abort when calling GPTL routines before GPTLinitialize + (so can use with Chombo library, for example); changed top + level default from nanotimer to MPI_WTIME. + [Patrick Worley] +timing_150327: Added option to more accurately measure measurement overhead + (incurring additional overhead, so not on by default). + [Patrick Worley] +timing_150217: Added support for enabling/disabling start/stop timers from + perf_mod even when calling GPTL routines directly from C/C++ + libraries; increased maximum timer name length. + [Patrick Worley] +timing_141119: Enabled cmake build of timing library. + [Jayesh Krishna] +timing_140805: Disabled GPTL autoinstrumentation, as this conflicts with the + VampirTrace tool. (We do not use the autoinstrumentation + capability.) [Patrick Worley, from B. Jamroz] +timing_140416: Changed Makefile so that .mods and the static library are copied to + LIBROOT +timing_140317: Modified Makefile to deal with shared mpi-serial builds +timing_131108: Added memory retrieval commands for BG/Q [S Mickelson] +timing_131023: Added explicit include path for gptl.h [J Edwards] +timing_130827: added routines supporting non-null terminated timer labels, for use with + with C++ std:string (and more efficient usage with Fortran); also CMake + logic fixes + [Patrick Worley] +timing_130506: Copy all modules to the include directory on install +timing_130417: Made nano time the default timer if available. +timing_130316: Changed declarations of functions used in qsort in gptl.c, to eliminate + error with Cray compiler (and warnings from other compilers) + [Patrick Worley] +timing_130214: NAG port: Put mpif.h include before "save", and don't use + "abort" and "flush" extensions for NAG. [Sean Patrick Santos] +timing_120921: Add code for cmake build, should not have any affect otherwise +timing_120731: Correction in Makefile for serial build [Jim Edwards] +timing_120728: Replace process subset optional parameter in t_prf with + outpe_thispe optional parameter. Change def_perf_outpe_num to 0. + [Patrick Worley] +timing_120717: Retain timestamp on cp in Makefile [Jim Edwards] +timing_120710: Correct issue in Makefile [Jim Edwards] +timing_120709: Change for BGP to measure on compute nodes rather than IO nodes only, + minor Change in Makefile so that gptl can build seperate from csm_share + in cesm [Jim Edwards] +timing_120512: Bug fix in global statistics logic for when a thread has no events + to contribute to the merge (mods to gptl.c) + [Patrick Worley] +timing_120419: Minor changes for mpi-serial compile (jedwards) +timing_120408: Make HAVE_COMM_F2C default to true. (jedwards) +timing_120110: Update to GPTL 4.1 source (mods to gptl.c and GPTLprint_memusage) + [Jim Rosinski (GPTL 4.1), Patrick Worley] +timing_120109: Bug fix (adding shr_kind_i8 to shr_kind_mod list) +timing_111205: Update to gptl 4.0 (introducing CESM customizations); + support for handles in t_startf/t_stopf; + support for restricting output to explicitly named process subsets + [Jim Rosinski (gptl 4.0), Patrick Worley] +timing_111101: Workaround for mpi_rsend issue on cray/gemini +timing_110928: Add a Makefile and build as a library usable by mct and pio +timing_101215: No changes from previous tag other than updating Changelog +timing_101210: Fix interface to cesm build system, add workaround for xlf bug +timing_101202: updated get_memusage and print_memusage from GPTL version 3.7; adds + improved support for MacOS and SLASHPROC + [Jim Rosinski, Chuck Bardeen (integrated by P. Worley)] +timing_091021: update to GPTL version 3.5; rewrite of GPTLpr_summary: much faster, merging + events from all processes and all threads (not just process 0/thread 0); + miscellaneous fixes + [Jim Rosinski (gptl 3.5), Joseph Singh, Patrick Worley] +timing_090929: added explicit support for the GPTL-native token HAVE_MPI (indicating + presence of MPI library) + [Patrick Worley] +timing_081221: restore default assumption that gettimeofday available +timing_081028: bug fix in include order in gptl_papi.c +timing_081026: change in output format to make postprocessing simpler +timing_081024: support for up to one million processes and writing timing files to + subdirectories +timing_081017: updated to gptl version 3_4_2. Changed some defaults. + [Jim Rosinski, Patrick Worley] +timing_080629: added optional parameters perf_outpe_num and perf_outpe_stride to t_prf. + These are used to override the user specified values for timing data + written out before the end of a simulation. + [Patrick Worley] +timing_071213: changed default to disable inline keyword; changed global statistics + logic to avoid problems at scale; moved shr and CAM routine equivalencies + to a new module (in perf_utils.F90); added t_getLogUnit/t_setLogUnit + routines to control Log output in same way as shr_file_get/setLogUnit; + modified GPTLpr logic to support output of timing data during a run + [Patrick Worley] +timing_071023: updated to gptl version 2.16, added support for output of global + statistics; removed dependencies on shr and CAM routines; renamed + gptlutil.c to GPTLutil.c + [Patrick Worley, Jim Rosinski] +timing_071019: modified namelist logic to abort if try to set unknown namelist parameters; + changed default number of reporting processes to 1; + reversed meaning and changed names of CPP tokens to NO_C99_INLINE and NO_VPRINTF + [Patrick Worley] +timing_071010: modified gptl.c to remove the 'inline' specification unless the + CPP token C99 is defined. + [Patrick Worley] +timing_070810: added ChangeLog + updated to latest version of GPTL (from Jim Rosinski) + modified perf_mod.F90: + - added perf_outpe_num and perf_outpe_stride to perf_inparm + namelist to control which processes output timing data + - added perf_papi_enable to perf_inparm namelist to enable + PAPI counters + - added papi_inparm namelist and papi_ctr1,2,3,4 namelist + parameters to specify PAPI counters + [Patrick Worley, Jim Rosinski] +timing_070525: bug fix in gptl.c + - unitialized pointer, testing for null pter + before traversing + [Patrick Worley] +timing_070328: modified perf_mod.F90 + - deleted HIDE_MPI cpp token + [Erik Kluzek] +timing_070327: bug fixes in gptl.c + - testing for null pters before traversing + links; added missing type declaration to GPTLallocate for sum + bug fixes in perf_mod.F90 + - fixed OMP-related logic, modified settings reporting, + modified to work when namelist input is + missing; moved timer depth logic back into gptl.c + [Patrick Worley] +timing_070308: added perf_mod.F90 + - defines all t_xxx entry points - calling gptlxxx directly + and removing all external gptlxxx dependencies, + added detail option as an alternative way to disable + event timing, added runtime selection of timing_disable, + perf_timer, timer_depth_limit, timing_detail_limit, + timing_barrier, perf_single_file via namelist parameters + modified f_wrappers.c + - replaced all t_xxx entry points with gptlxxx entry points, + added new gptlxxx entry points, deleted _fcd support + modified gptl.c + - deleted DISABLE_TIMERS cpp token, modified GPTLpr call + and logic to move some of support for concatenating timing + output into a single file to perf_mod.F90 + modified gptl.h + - exposed gptlxxx entry points and to add support for choice + of GPTL timer + modified gptl.inc + - removed t_xxx entry points and expose gptlxxx entry points + [Patrick Worley] +timing_061207: modified gptl.c + - improved event output ordering + [Jim Edwards] +timing_061124: modified gptl.c + - modified GPTLpr to add option to concatenate + all timing data in a single output file, added GPTL_enable + and GPTL_disable as runtime control of event timing, + process 0-only reporting of timing options - unless DEBUG + cpp token defined + modified gptl.h + - redefined GPTLpr parameters + modified f_wrappers.c + - added t_enablef and t_disablef to call GPTL_enable and + GPTL_disable, added t_pr_onef, added string.h include + bug fix in f_wrappers.c + - changed character string size declaration from int to size_t + bug fix in gptl_papi.c + - modified error message - from Jim Edwards + modified private.h + - increased maximum event name length + [Patrick Worley] +timing_061028: modified f_wrappers.c + - deleted dependency on cfort.h + [Patrick Worley] +timing_060524: modified f_wrappers.c + - added support for CRAY cpp token and fixed routine + type declarations + [Patrick Worley] +timing_051212: original subversion version + - see CAM ChangeLog for earlier history diff --git a/share/timing/GPTLget_memusage.c b/share/timing/GPTLget_memusage.c new file mode 100644 index 0000000..db52740 --- /dev/null +++ b/share/timing/GPTLget_memusage.c @@ -0,0 +1,196 @@ +/* +** $Id: get_memusage.c,v 1.10 2010-11-09 19:08:53 rosinski Exp $ +** +** Author: Jim Rosinski +** Credit to Chuck Bardeen for MACOS section (__APPLE__ ifdef) +** +** get_memusage: +** +** Designed to be called from Fortran, returns information about memory +** usage in each of 5 input int* args. On Linux read from the /proc +** filesystem because getrusage() returns placebos (zeros). Return -1 for +** values which are unavailable or ambiguous on a particular architecture. +** Reported numbers are in kilobytes. +** +** Return value: 0 = success +** -1 = failure +*/ + +#include +#include "gptl.h" /* additional cpp defs and function prototypes */ +#include + +/* _AIX is automatically defined when using the AIX C compilers */ +#ifdef _AIX +#include +#endif + +#ifdef IRIX64 +#include +#endif + +#ifdef HAVE_SLASHPROC + +#include +#include +#include + +#elif (defined __APPLE__) + +#include +#include +#include + +#endif + +#ifdef BGP + +#include +#include +#include +#include +#define Personality _BGP_Personality_t + +#endif + +#ifdef __bgq__ + +#include +#include + +#endif + +#define PRINT_MEMUSAGE 0 + +int GPTLget_memusage (int *size, int *rss, int *share, int *text, int *datastack) +{ +#if defined (BGP) + long long alloc, total; + int node_config; + struct mallinfo m; + Personality pers; + + /* memory available */ + Kernel_GetPersonality(&pers, sizeof(pers)); + total = BGP_Personality_DDRSizeMB(&pers); + + node_config = BGP_Personality_processConfig(&pers); + if (node_config == _BGP_PERS_PROCESSCONFIG_VNM) total /= 4; + else if (node_config == _BGP_PERS_PROCESSCONFIG_2x2) total /= 2; + total *= 1024; // in KB + + /* total memory used - heap only (not static memory)*/ + *size = total; + + m = mallinfo(); + alloc = m.hblkhd + m.uordblks; + + *rss = alloc; + *share = -1; + *text = -1; + *datastack = -1; + return 0; + +#elif (defined __bgq__) + uint64_t heap, shared, stack; + + Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &heap); + Kernel_GetMemorySize(KERNEL_MEMSIZE_SHARED, &shared); + Kernel_GetMemorySize(KERNEL_MEMSIZE_STACK, &stack); + + *size = heap/1024; + *rss = heap/1024; + *share = shared/1024; + *text = -1; + *datastack = stack/1024; + return 0; + +#elif (defined HAVE_SLASHPROC) + FILE *fd; /* file descriptor for fopen */ + int pid; /* process id */ + char file[19]; /* full path to file in /proc */ + int dum; /* placeholder for unused return arguments */ + int ret; /* function return value */ + static int pg_sz = -1; /* page size */ + + /* + ** The file we want to open is /proc//statm + */ + + pid = (int) getpid (); + if (pid <= 0) { + fprintf (stderr, "get_memusage: pid %d is non-positive\n", pid); + return -1; + } + + sprintf (file, "/proc/%d/statm", pid); + if ((fd = fopen (file, "r")) < 0) { + fprintf (stderr, "get_memusage: bad attempt to open %s\n", file); + return -1; + } + + /* + ** Read the desired data from the /proc filesystem directly into the output + ** arguments, close the file and return. + */ + + ret = fscanf (fd, "%d %d %d %d %d %d %d", + size, rss, share, text, datastack, &dum, &dum); + ret = fclose (fd); + + // read page size once + if (pg_sz == -1) { + pg_sz = sysconf(_SC_PAGESIZE) / 1024; + } + + // convert from pages to KBs + *size = *size * pg_sz; + *rss = *rss * pg_sz; + *share = *share * pg_sz; + *text = *text * pg_sz; + *datastack = *datastack * pg_sz; +#if PRINT_MEMUSAGE + fprintf (stderr, "get_memusage: size=%d KB, rss=%d KB, share=%d KB, text=%d KB, datastack=%d KB, page_size=%d KB\n", + *size, *rss, *share, *text, *datastack, pg_sz); +#endif + + return 0; + +#elif (defined __APPLE__) + + FILE *fd; + char cmd[60]; + int pid = (int) getpid (); + + // returned values are in KBs + sprintf (cmd, "ps -o vsz -o rss -o tsiz -p %d | grep -v RSS", pid); + fd = popen (cmd, "r"); + + if (fd) { + fscanf (fd, "%d %d %d", size, rss, text); + *share = -1; + *datastack = -1; + (void) pclose (fd); + } + + return 0; + +#else + + struct rusage usage; /* structure filled in by getrusage */ + + if (getrusage (RUSAGE_SELF, &usage) < 0) + return -1; + + *size = -1; + *rss = usage.ru_maxrss; // in KBs + *share = -1; + *text = -1; + *datastack = -1; +#ifdef IRIX64 + *datastack = usage.ru_idrss + usage.ru_isrss; +#endif + return 0; + +#endif +} diff --git a/share/timing/GPTLprint_memusage.c b/share/timing/GPTLprint_memusage.c new file mode 100644 index 0000000..be5e706 --- /dev/null +++ b/share/timing/GPTLprint_memusage.c @@ -0,0 +1,155 @@ +/* +** $Id: print_memusage.c,v 1.13 2010-11-09 19:08:54 rosinski Exp $ +** +** Author: Jim Rosinski +** +** print_memusage: +** +** Prints info about memory usage of this process by calling get_memusage. +** +** Return value: 0 = success +** -1 = failure +*/ + +#include "gptl.h" +#include +#include +#include +#include +#ifdef __bgq__ +#include +#endif + +static int nearest_powerof2 (const int); +static int convert_to_mb = 1; /* true */ + +int GPTLprint_memusage (const char *str) +{ +#ifdef __bgq__ + uint64_t shared, persist, heapavail, stackavail, stack, heap, guard, mmap; + + Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &heap); + Kernel_GetMemorySize(KERNEL_MEMSIZE_SHARED, &shared); + Kernel_GetMemorySize(KERNEL_MEMSIZE_STACK, &stack); + Kernel_GetMemorySize(KERNEL_MEMSIZE_PERSIST, &persist); + Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAPAVAIL, &heapavail); + Kernel_GetMemorySize(KERNEL_MEMSIZE_STACKAVAIL, &stackavail); + Kernel_GetMemorySize(KERNEL_MEMSIZE_GUARD, &guard); + Kernel_GetMemorySize(KERNEL_MEMSIZE_MMAP, &mmap); + + printf("%s Memory(MB): heap-alloc: %.2f, heap-avail: %.2f," + "stack-alloc: %.2f, stack-avail: %.2f," + "shared: %.2f, persist: %.2f, guard: %.2f, mmap: %.2f\n", str, + (double)heap/(1024*1024), (double)heapavail/(1024*1024), + (double)stack/(1024*1024), (double)stackavail/(1024*1024), + (double)shared/(1024*1024), (double)persist/(1024*1024), + (double)guard/(1024*1024), (double)mmap/(1024*1024)); + return 0; + +#else + int size, size2; /* process size (returned from OS) */ + int rss, rss2; /* resident set size (returned from OS) */ + int share, share2; /* shared data segment size (returned from OS) */ + int text, text2; /* text segment size (returned from OS) */ + int datastack, datastack2; /* data/stack size (returned from OS) */ + static int kbytesperblock = -1; /* convert to Kbytes (init to invalid) */ + static const int nbytes =1024*1024*1024;/* allocate 1 GB */ + void *space; /* allocated space */ + + setbuf(stdout, NULL); // don't buffer stdout, flush + if (GPTLget_memusage (&size, &rss, &share, &text, &datastack) < 0) { + printf ("GPTLprint_memusage: GPTLget_memusage failed.\n"); + return -1; + } + +#if (defined HAVE_SLASHPROC || defined __APPLE__) + if (kbytesperblock == -1) { + kbytesperblock = sysconf(_SC_PAGESIZE) / 1024; + printf ("GPTLprint_memusage: Using Kbytesperpage=%d\n", kbytesperblock); + } + + /* + ** Determine size in bytes of memory usage info presented by the OS. Method: allocate a + ** known amount of memory and see how much bigger the process becomes. + */ + + if (convert_to_mb && kbytesperblock == -1 && (space = malloc (nbytes))) { + memset (space, 0, nbytes); /* ensure the space is really allocated */ + if (GPTLget_memusage (&size2, &rss2, &share2, &text2, &datastack2) == 0) { + if (size2 > size) { + /* + ** Estimate bytes per block, then refine to nearest power of 2. + ** The assumption is that the OS presents memory usage info in + ** units that are a power of 2. + */ + kbytesperblock = (int) ((nbytes / (double) (size2 - size)) + 0.5); + kbytesperblock = nearest_powerof2 (kbytesperblock); + printf ("GPTLprint_memusage: Using Kbytesperblock=%d\n", kbytesperblock); + } else { + printf ("GPTLprint_memusage: highwater did not increase.\n"); + } + } else { + printf ("GPTLprint_memusage: call GPTLget_memusage failed.\n"); + } + free (space); + } + + if (kbytesperblock > 0) { + printf ("%s sysmem size=%.1f MB rss=%.1f MB share=%.1f MB text=%.1f MB datastack=%.1f MB\n", + str, size/1024., rss/1024., share/1024., text/1024., datastack/1024.); + } else { + printf ("%s sysmem size=%d rss=%d share=%d text=%d datastack=%d\n", + str, size, rss, share, text, datastack); + } + +#else + + /* + ** Use max rss as returned by getrusage. If someone knows how to + ** get the process size under AIX please tell me. + */ + + if (convert_to_mb) + printf ("%s max rss=%.1f MB\n", str, rss*1024.); + else + printf ("%s max rss=%d\n", str, rss); +#endif + + return 0; +#endif +} + +/* +** nearest_powerof2: +** Determine nearest integer which is a power of 2. +** Note: algorithm can't use anything that requires -lm because this is a library, +** and we don't want to burden the user with having to add extra libraries to the +** link line. +** +** Input arguments: +** val: input integer +** +** Return value: nearest integer to val which is a power of 2 +*/ + +static int nearest_powerof2 (const int val) +{ + int lower; /* power of 2 which is just less than val */ + int higher; /* power of 2 which is just more than val */ + int delta1; /* difference between val and lower */ + int delta2; /* difference between val and higher */ + + if (val < 2) + return 0; + + for (higher = 1; higher < val; higher *= 2) + lower = higher; + + delta1 = val - lower; + delta2 = higher - val; + + if (delta1 < delta2) + return lower; + else + return higher; +} diff --git a/share/timing/GPTLutil.c b/share/timing/GPTLutil.c new file mode 100644 index 0000000..d9a1a93 --- /dev/null +++ b/share/timing/GPTLutil.c @@ -0,0 +1,81 @@ +/* +** $Id: util.c,v 1.13 2010-01-01 01:34:07 rosinski Exp $ +*/ + +#include +#include +#include + +#include "private.h" + +static bool abort_on_error = false; /* flag says to abort on any error */ +static int max_error = 500; /* max number of error print msgs */ + +/* +** GPTLerror: error return routine to print a message and return a failure +** value. +** +** Input arguments: +** fmt: format string +** variable list of additional arguments for vfprintf +** +** Return value: -1 (failure) +*/ + +int GPTLerror (const char *fmt, ...) +{ + va_list args; + + va_start (args, fmt); + static int num_error = 0; + + if (fmt != NULL && num_error < max_error) { +#ifndef NO_VPRINTF + (void) vfprintf (stderr, fmt, args); +#else + (void) fprintf (stderr, "GPTLerror: no vfprintf: fmt is %s\n", fmt); +#endif + if (num_error == max_error) + (void) fprintf (stderr, "Truncating further error print now after %d msgs", + num_error); + ++num_error; + } + + va_end (args); + + if (abort_on_error) + exit (-1); + + return (-1); +} + +/* +** GPTLset_abort_on_error: User-visible routine to set abort_on_error flag +** +** Input arguments: +** val: true (abort on error) or false (don't) +*/ + +void GPTLset_abort_on_error (bool val) +{ + abort_on_error = val; +} + +/* +** GPTLallocate: wrapper utility for malloc +** +** Input arguments: +** nbytes: size to allocate +** +** Return value: pointer to the new space (or NULL) +*/ + +void *GPTLallocate (const int nbytes) +{ + void *ptr; + + if ( nbytes <= 0 || ! (ptr = malloc (nbytes))) + (void) GPTLerror ("GPTLallocate: malloc failed for %d bytes\n", nbytes); + + return ptr; +} diff --git a/share/timing/README b/share/timing/README new file mode 100644 index 0000000..f8f3f7f --- /dev/null +++ b/share/timing/README @@ -0,0 +1,143 @@ +This file contains information about using GPTL. For information on building +and installing GPTL, see the file INSTALL. + +GPTL is the "General Purpose Timing Library". It can be used to manually +instrument application codes with an arbitrary set of "regions" (or "timers") +over which statistics such as wallclock time and CPU time are gathered and +subsequently printed. If the target application is built with the GNU +compilers (gcc or gfortran) or Pathscale (pathcc or pathf95), GPTL can also be +used to automatically instrument regions which are defined by function entry +and exit points. This is an easy way to generate a dynamic call tree. See +Auto-Instrumentation below for a description of how to use this feature. + +If the PAPI library is installed (http://icl.cs.utk.edu/papi), GPTL +also provides a convenient mechanism to access all available PAPI events. In +addtion to PAPI preset and native events, GPTL defines derived events which +are based on PAPI counters. See gptl.h for a list of available derived events. +Of course these events can only be enabled if the PAPI counters they require +are available on the target architecture. + + +Using GPTL +---------- + +C codes making GPTL library calls should #include . Fortran codes +should #include or Fortran include 'gptl.inc'. The C and Fortran interfaces +are identical, except that the C interface uses mixed case. All +user-accessible functions return either 0 (success) or -1 (failure). Example +codes that use the library can be found in subdirectories ctests/ and ftests/. + +Code instrumentation to utilize GPTL involves zero or more calls to +GPTLsetoption(), then a single call to GPTLinitialize(), then an arbitrary +sequence of calls to GPTLstart() and GPTLstop(), and finally a call to +GPTLpr() or GPTLpr_file(). See "Example" below for a sample calling +sequence. Calls to GPTLstart() and GPTLstop() are thread-safe, with per-thread +statistics printed by GPTLpr() or GPTLpr_file(). + +The purpose of GPTLsetoption() is to enable or disable various library +options. For example, to enable the PAPI counter for total cycles, do this: + +ret = GPTLsetoption (PAPI_TOT_CYC, 1); + +The "1" says "enable". Use "0" for "disable". See the man pages for complete +documentation on function usage and arguments. The list of available GPTL +options is contained in gptl.h, and a complete list of available PAPI-based +events can be found by running "ctests/avail". + +GPTLinitialize() initializes the GPTL library. + +There can be an arbitrary number of start/stop pairs before GPTLpr() or +GPTLpr_file() is called to print the results. And an arbitrary amount of +nesting of regions is also allowed. The printed results will be indented to +indicate the level of nesting for each region. + +GPTLpr() prints the results to a file named timing., where the single +argument is an integer. For MPI jobs, it is most convenient to use +the MPI rank of the invoking task for . Equivalently, function +GPTLpr_file() can be called. Its input argument is a character string +indicating the output file name to be written. It is up to the user to ensure +that these print functions write to uniquely-named files, in order to avoid +name-space collisions. + +GPTLfinalize() can be called to clean up the GPTL environment. All space +malloc'ed by the GPTL library will be freed by this call. + + +Example +------- + +From "man GPTLstart", a simple example calling sequence to time a couple of +code regions and print the results is: + +(void) GPTLsetoption (GPTLcpu, 1); /* enable cpu timings */ +(void) GPTLsetoption (GPTLwall, 0); /* disable wallclock timings */ +(void) GPTLsetoption (PAPI_TOT_CYC, 1); /* enable counting of total cycles */ +... +(void) GPTLinitialize(); /* initialize the GPTL library */ +(void) GPTLstart ("total"); /* start a timer */ +... +(void) GPTLstart ("do_work"); /* start another timer */ + +do_work(); /* do some work */ + +(void) GPTLstop ("do_work"); /* stop a timer */ +(void) GPTLstop ("total"); /* stop a timer */ +... +(void) GPTLpr (mympitaskid); /* print the results to timing. */ + + +Auto-instrumentation +-------------------- + +If the regions to be timed are defined by function entry and exit points, and +the application to be profiled is built with either the GNU or Pathscale +compilers, you might find it convenient to use the auto-instrumentation +feature of GPTL. Here's how: + +1) Add the flag -finstrument-functions when compiling the routines you'd like +to profile. + +2) Add calls to GPTLsetoption() (if desired), and GPTLinitialize() to the main +program before any other routines are invoked. + +3) Add a call to GPTLpr() or GPTLpr_file() wherever appropriate prior to where +the code terminates. + +4) Link with -lgptl (and -lpapi if PAPI is enabled). + +5) Run the code. + +6) Run "hex2name.pl | less", where + is the name of the executable, and is the name of the +timing file to be converted. + +The result should be a dynamic call tree with timings and (if enabled) PAPI +counts and derived event statistics for each region, where regions are defined +by function entry and exit points. + +Here's what's happening under the covers: + +The -finstrument-functions flag tells the compiler to insert calls to +__cyg_profile_func_enter (void *this_fn, void *call_site) at function start, +and __cyg_profile_func_exit (void *this_fn, void *call_site) at function +exit. GPTL defines these functions as calls to (effectively) GPTLstart() and +GPTLstop(), where the address of the function is used as the input sentinel to +these routines. + +Running hex2name.pl converts the function addresses back to human-readable +function names. It uses the UNIX "nm" utility to do this. + + +Multi-processor instrumented codes +---------------------------------- + +For instrumented codes which make use of threading and/or MPI, a +post-processing script is provided to analyze GPTL output files and gather +max/min/average stats on a per-region basis. The script is parsegptlout.pl. It +might be invoked as, for example: + +parsegptlout.pl sub1 + +The script will look through all files in the current directory named timing.* +for regions named "sub1", then gather and print various statistics. Numerous +options are available. See "man parsegptlout" for more in-depth information. diff --git a/share/timing/f_wrappers.c b/share/timing/f_wrappers.c new file mode 100644 index 0000000..0e33e62 --- /dev/null +++ b/share/timing/f_wrappers.c @@ -0,0 +1,614 @@ +/* +** $Id: f_wrappers.c,v 1.56 2010-12-29 18:46:42 rosinski Exp $ +** +** Author: Jim Rosinski +** +** Fortran wrappers for timing library routines +*/ + +#include +#include +#include "private.h" /* MAX_CHARS, bool */ +#include "gptl.h" /* function prototypes and HAVE_MPI logic*/ + +#if ( defined FORTRANCAPS ) + +#define gptlinitialize GPTLINITIALIZE +#define gptlfinalize GPTLFINALIZE +#define gptlprint_mode_query GPTLPRINT_MODE_QUERY +#define gptlprint_mode_set GPTLPRINT_MODE_SET +#define gptlpr GPTLPR +#define gptlpr_file GPTLPR_FILE +#define gptlpr_summary GPTLPR_SUMMARY +#define gptlpr_summary_FILE GPTLPR_SUMMARY_FILE +#define gptlbarrier GPTLBARRIER +#define gptlprefix_set GPTLPREFIX_SET +#define gptlprefix_unset GPTLPREFIX_UNSET +#define gptlreset GPTLRESET +#define gptlstamp GPTLSTAMP +#define gptlstart GPTLSTART +#define gptlstart_handle GPTLSTART_HANDLE +#define gptlstop GPTLSTOP +#define gptlstop_handle GPTLSTOP_HANDLE +#define gptlstartstop_vals GPTLSTARTSTOP_VALS +#define gptlsetoption GPTLSETOPTION +#define gptlenable GPTLENABLE +#define gptldisable GPTLDISABLE +#define gptlsetutr GPTLSETUTR +#define gptlquery GPTLQUERY +#define gptlquerycounters GPTLQUERYCOUNTERS +#define gptlget_wallclock GPTLGET_WALLCLOCK +#define gptlget_eventvalue GPTLGET_EVENTVALUE +#define gptlget_nregions GPTLGET_NREGIONS +#define gptlget_regionname GPTLGET_REGIONNAME +#define gptlget_memusage GPTLGET_MEMUSAGE +#define gptlprint_memusage GPTLPRINT_MEMUSAGE +#define gptl_papilibraryinit GPTL_PAPILIBRARYINIT +#define gptlevent_name_to_code GPTLEVENT_NAME_TO_CODE +#define gptlevent_code_to_name GPTLEVENT_CODE_TO_NAME + +#elif ( defined INCLUDE_CMAKE_FCI ) + +#define gptlinitialize FCI_GLOBAL(gptlinitialize,GPTLINITIALIZE) +#define gptlfinalize FCI_GLOBAL(gptlfinalize,GPTLFINALIZE) +#define gptlprint_mode_query FCI_GLOBAL(gptlprint_mode_query,GPTLPRINT_MODE_QUERY) +#define gptlprint_mode_set FCI_GLOBAL(gptlprint_mode_set,GPTLPRINT_MODE_SET) +#define gptlpr FCI_GLOBAL(gptlpr,GPTLPR) +#define gptlpr_file FCI_GLOBAL(gptlpr_file,GPTLPR_FILE) +#define gptlpr_summary FCI_GLOBAL(gptlpr_summary,GPTLPR_SUMMARY) +#define gptlpr_summary_file FCI_GLOBAL(gptlpr_summary_file,GPTLPR_SUMMARY_FILE) +#define gptlbarrier FCI_GLOBAL(gptlbarrier,GPTLBARRIER) +#define gptlprefix_set FCI_GLOBAL(gptlprefix_set,GPTLPREFIX_SET) +#define gptlprefix_unset FCI_GLOBAL(gptlprefix_unset,GPTLPREFIX_UNSET) +#define gptlreset FCI_GLOBAL(gptlreset,GPTLRESET) +#define gptlstamp FCI_GLOBAL(gptlstamp,GPTLSTAMP) +#define gptlstart FCI_GLOBAL(gptlstart,GPTLSTART) +#define gptlstart_handle FCI_GLOBAL(gptlstart_handle,GPTLSTART_HANDLE) +#define gptlstop FCI_GLOBAL(gptlstop,GPTLSTOP) +#define gptlstop_handle FCI_GLOBAL(gptlstop_handle,GPTLSTOP_HANDLE) +#define gptlstartstop_vals FCI_GLOBAL(gptlstartstop_vals,GPTLSTARTSTOP_VALS) +#define gptlsetoption FCI_GLOBAL(gptlsetoption,GPTLSETOPTION) +#define gptlenable FCI_GLOBAL(gptlenable,GPTLENABLE) +#define gptldisable FCI_GLOBAL(gptldisable,GPTLDISABLE) +#define gptlsetutr FCI_GLOBAL(gptlsetutr,GPTLSETUTR) +#define gptlquery FCI_GLOBAL(gptlquery,GPTLQUERY) +#define gptlquerycounters FCI_GLOBAL(gptlquerycounters,GPTLQUERYCOUNTERS) +#define gptlget_wallclock FCI_GLOBAL(gptlget_wallclock,GPTLGET_WALLCLOCK) +#define gptlget_eventvalue FCI_GLOBAL(gptlget_eventvalue,GPTLGET_EVENTVALUE) +#define gptlget_nregions FCI_GLOBAL(gptlget_nregions,GPTLGET_NREGIONS) +#define gptlget_regionname FCI_GLOBAL(gptlget_regionname,GPTLGET_REGIONNAME) +#define gptlget_memusage FCI_GLOBAL(gptlget_memusage,GPTLGET_MEMUSAGE) +#define gptlprint_memusage FCI_GLOBAL(gptlprint_memusage,GPTLPRINT_MEMUSAGE) +#define gptl_papilibraryinit FCI_GLOBAL(gptl_papilibraryinit,GPTL_PAPILIBRARYINIT) +#define gptlevent_name_to_code FCI_GLOBAL(gptlevent_name_to_code,GPTLEVENT_NAME_TO_CODE) +#define gptlevent_code_to_name FCI_GLOBAL(gptlevent_code_to_name,GPTLEVENT_CODE_TO_NAME) + +#elif ( defined FORTRANUNDERSCORE ) + +#define gptlinitialize gptlinitialize_ +#define gptlfinalize gptlfinalize_ +#define gptlprint_mode_query gptlprint_mode_query_ +#define gptlprint_mode_set gptlprint_mode_set_ +#define gptlpr gptlpr_ +#define gptlpr_file gptlpr_file_ +#define gptlpr_summary gptlpr_summary_ +#define gptlpr_summary_file gptlpr_summary_file_ +#define gptlbarrier gptlbarrier_ +#define gptlprefix_set gptlprefix_set_ +#define gptlprefix_unset gptlprefix_unset_ +#define gptlreset gptlreset_ +#define gptlstamp gptlstamp_ +#define gptlstart gptlstart_ +#define gptlstart_handle gptlstart_handle_ +#define gptlstop gptlstop_ +#define gptlstop_handle gptlstop_handle_ +#define gptlstartstop_vals gptlstartstop_vals_ +#define gptlsetoption gptlsetoption_ +#define gptlenable gptlenable_ +#define gptldisable gptldisable_ +#define gptlsetutr gptlsetutr_ +#define gptlquery gptlquery_ +#define gptlquerycounters gptlquerycounters_ +#define gptlget_wallclock gptlget_wallclock_ +#define gptlget_eventvalue gptlget_eventvalue_ +#define gptlget_nregions gptlget_nregions_ +#define gptlget_regionname gptlget_regionname_ +#define gptlget_memusage gptlget_memusage_ +#define gptlprint_memusage gptlprint_memusage_ +#define gptl_papilibraryinit gptl_papilibraryinit_ +#define gptlevent_name_to_code gptlevent_name_to_code_ +#define gptlevent_code_to_name gptlevent_code_to_name_ + +#elif ( defined FORTRANDOUBLEUNDERSCORE ) + +#define gptlinitialize gptlinitialize__ +#define gptlfinalize gptlfinalize__ +#define gptlprint_mode_query gptlprint_mode_query__ +#define gptlprint_mode_set gptlprint_mode_set__ +#define gptlpr gptlpr__ +#define gptlpr_file gptlpr_file__ +#define gptlpr_summary gptlpr_summary__ +#define gptlpr_summary_file gptlpr_summary_file__ +#define gptlbarrier gptlbarrier__ +#define gptlprefix_set gptlprefix_set__ +#define gptlprefix_unset gptlprefix_unset__ +#define gptlreset gptlreset__ +#define gptlstamp gptlstamp__ +#define gptlstart gptlstart__ +#define gptlstart_handle gptlstart_handle__ +#define gptlstop gptlstop__ +#define gptlstop_handle gptlstop_handle__ +#define gptlstartstop_vals gptlstartstop_vals__ +#define gptlsetoption gptlsetoption__ +#define gptlenable gptlenable__ +#define gptldisable gptldisable__ +#define gptlsetutr gptlsetutr__ +#define gptlquery gptlquery__ +#define gptlquerycounters gptlquerycounters__ +#define gptlget_wallclock gptlget_wallclock__ +#define gptlget_eventvalue gptlget_eventvalue__ +#define gptlget_nregions gptlget_nregions__ +#define gptlget_regionname gptlget_regionname__ +#define gptlget_memusage gptlget_memusage__ +#define gptlprint_memusage gptlprint_memusage__ +#define gptl_papilibraryinit gptl_papilibraryinit__ +#define gptlevent_name_to_code gptlevent_name_to_code__ +#define gptlevent_code_to_name gptlevent_code_to_name__ + +#endif + +/* +** Local function prototypes +*/ + +int gptlinitialize (void); +int gptlfinalize (void); +int gptlprint_mode_query (void); +int gptlprint_mode_set (int *pr_mode); +int gptlpr (int *procid); +int gptlpr_file (char *file, int nc1); +int gptlpr_summary (int *fcomm); +int gptlpr_summary_file (int *fcomm, char *name, int nc1); +int gptlbarrier (int *fcomm, char *name, int nc1); +int gptlprefix_set (char *name, int nc1); +int gptlprefix_unset (void); +int gptlreset (void); +int gptlstamp (double *wall, double *usr, double *sys); +int gptlstart (char *name, int nc1); +int gptlstart_handle (char *name, void **, int nc1); +int gptlstop (char *name, int nc1); +int gptlstop_handle (char *name, void **, int nc1); +int gptlstartstop_vals (char *name, double *val, int *cnt, int nc1); +int gptlsetoption (int *option, int *val); +int gptlenable (void); +int gptldisable (void); +int gptlsetutr (int *option); +int gptlquery (const char *name, int *t, int *count, int *onflg, double *wallclock, + double *usr, double *sys, long long *papicounters_out, int *maxcounters, + int nc); +int gptlquerycounters (const char *name, int *t, long long *papicounters_out, int nc); +int gptlget_wallclock (const char *name, int *t, double *value, int nc); +int gptlget_eventvalue (const char *timername, const char *eventname, int *t, double *value, + int nc1, int nc2); +int gptlget_nregions (int *t, int *nregions); +int gptlget_regionname (int *t, int *region, char *name, int nc); +int gptlget_memusage (int *size, int *rss, int *share, int *text, int *datastack); +int gptlprint_memusage (const char *str, int nc); +#ifdef HAVE_PAPI +int gptl_papilibraryinit (void); +int gptlevent_name_to_code (const char *str, int *code, int nc); +int gptlevent_code_to_name (int *code, char *str, int nc); +#endif + +/* +** Fortran wrapper functions start here +*/ + +int gptlinitialize (void) +{ + return GPTLinitialize (); +} + +int gptlfinalize (void) +{ + return GPTLfinalize (); +} + +int gptlprint_mode_query (void) +{ + return GPTLprint_mode_query (); +} + +int gptlprint_mode_set (int *pr_mode) +{ + return GPTLprint_mode_set (*pr_mode); +} + +int gptlpr (int *procid) +{ + return GPTLpr (*procid); +} + +int gptlpr_file (char *file, int nc1) +{ + char *locfile; + int c; + int ret; + + if ( ! (locfile = (char *) malloc (nc1+1))) + return GPTLerror ("gptlpr_file: malloc error\n"); + + //pw snprintf (locfile, nc1+1, "%s", file); + for (c = 0; c < nc1; c++) { + locfile[c] = file[c]; + } + locfile[c] = '\0'; + + ret = GPTLpr_file (locfile); + free (locfile); + return ret; +} + +int gptlpr_summary (int *fcomm) +{ +#ifdef HAVE_MPI + MPI_Comm ccomm; +#ifdef HAVE_COMM_F2C + ccomm = MPI_Comm_f2c (*fcomm); +#else + /* Punt and try just casting the Fortran communicator */ + ccomm = (MPI_Comm) *fcomm; +#endif +#else + int ccomm = 0; +#endif + + return GPTLpr_summary (ccomm); +} + +int gptlpr_summary_file (int *fcomm, char *file, int nc1) +{ + char *locfile; + int c; + int ret; + +#ifdef HAVE_MPI + MPI_Comm ccomm; +#ifdef HAVE_COMM_F2C + ccomm = MPI_Comm_f2c (*fcomm); +#else + /* Punt and try just casting the Fortran communicator */ + ccomm = (MPI_Comm) *fcomm; +#endif +#else + int ccomm = 0; +#endif + + if ( ! (locfile = (char *) malloc (nc1+1))) + return GPTLerror ("gptlpr_summary_file: malloc error\n"); + + //pw snprintf (locfile, nc1+1, "%s", file); + for (c = 0; c < nc1; c++) { + locfile[c] = file[c]; + } + locfile[c] = '\0'; + + ret = GPTLpr_summary_file (ccomm, locfile); + free (locfile); + return ret; +} + +int gptlbarrier (int *fcomm, char *name, int nc1) +{ + char cname[MAX_CHARS+1]; + int c; + int numchars; +#ifdef HAVE_MPI + MPI_Comm ccomm; +#ifdef HAVE_COMM_F2C + ccomm = MPI_Comm_f2c (*fcomm); +#else + /* Punt and try just casting the Fortran communicator */ + ccomm = (MPI_Comm) *fcomm; +#endif +#else + int ccomm = 0; +#endif + + numchars = MIN (nc1, MAX_CHARS); + //pw strncpy (cname, name, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = name[c]; + } + cname[numchars] = '\0'; + return GPTLbarrier (ccomm, cname); +} + +int gptlprefix_set (char *name, int nc1) +{ + /* char cname[MAX_CHARS+1]; */ + /* int numchars; */ + + /* numchars = MIN (nc1, MAX_CHARS);*/ + /* strncpy (cname, name, numchars);*/ + /* cname[numchars] = '\0';*/ + /* return GPTLprefix_set (cname);*/ + return GPTLprefix_setf (name, nc1); +} + +int gptlprefix_unset (void) +{ + return GPTLprefix_unset (); +} + +int gptlreset (void) +{ + return GPTLreset(); +} + +int gptlstamp (double *wall, double *usr, double *sys) +{ + return GPTLstamp (wall, usr, sys); +} + +int gptlstart (char *name, int nc1) +{ + /* char cname[MAX_CHARS+1]; */ + /* int numchars; */ + + /* numchars = MIN (nc1, MAX_CHARS);*/ + /* strncpy (cname, name, numchars);*/ + /* cname[numchars] = '\0';*/ + /* return GPTLstart (cname);*/ + return GPTLstartf (name, nc1); +} + +int gptlstart_handle (char *name, void **handle, int nc1) +{ + /* char cname[MAX_CHARS+1];*/ + /* int numchars;*/ + + /* if (*handle) {*/ + /* cname[0] = '\0';*/ + /* } else {*/ + /* numchars = MIN (nc1, MAX_CHARS);*/ + /* strncpy (cname, name, numchars);*/ + /* cname[numchars] = '\0';*/ + /* }*/ + /* return GPTLstart_handle (cname, handle);*/ + return GPTLstartf_handle (name, nc1, handle); +} + +int gptlstop (char *name, int nc1) +{ + /* char cname[MAX_CHARS+1];*/ + /* int numchars;*/ + + /* numchars = MIN (nc1, MAX_CHARS);*/ + /* strncpy (cname, name, numchars);*/ + /* cname[numchars] = '\0';*/ + /* return GPTLstop (cname);*/ + return GPTLstopf (name, nc1); +} + +int gptlstop_handle (char *name, void **handle, int nc1) +{ + /* char cname[MAX_CHARS+1];*/ + /* int numchars;*/ + + /* if (*handle) {*/ + /* cname[0] = '\0';*/ + /* } else {*/ + /* numchars = MIN (nc1, MAX_CHARS);*/ + /* strncpy (cname, name, numchars);*/ + /* cname[numchars] = '\0';*/ + /* }*/ + /* return GPTLstop_handle (cname, handle);*/ + return GPTLstopf_handle (name, nc1, handle); +} + +int gptlstartstop_vals (char *name, double *val, int *cnt, int nc1) +{ + /* char cname[MAX_CHARS+1];*/ + /* int c;*/ + /* int numchars;*/ + + /* numchars = MIN (nc1, MAX_CHARS);*/ + //pw strncpy (cname, name, numchars); + /* for (c = 0; c < numchars; c++) {*/ + /* cname[c] = name[c];*/ + /* }*/ + /* cname[numchars] = '\0';*/ + /* return GPTLstartstop_vals (cname, *val, *cnt);*/ + return GPTLstartstop_valsf (name, nc1, *val, *cnt); +} + +int gptlsetoption (int *option, int *val) +{ + return GPTLsetoption (*option, *val); +} + +int gptlenable (void) +{ + return GPTLenable (); +} + +int gptldisable (void) +{ + return GPTLdisable (); +} + +int gptlsetutr (int *option) +{ + return GPTLsetutr (*option); +} + +int gptlquery (const char *name, int *t, int *count, int *onflg, double *wallclock, + double *usr, double *sys, long long *papicounters_out, int *maxcounters, + int nc) +{ + char cname[MAX_CHARS+1]; + int c; + int numchars; + + numchars = MIN (nc, MAX_CHARS); + //pw strncpy (cname, name, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = name[c]; + } + cname[numchars] = '\0'; + return GPTLquery (cname, *t, count, onflg, wallclock, usr, sys, papicounters_out, *maxcounters); +} + +int gptlquerycounters (const char *name, int *t, long long *papicounters_out, int nc) +{ + char cname[MAX_CHARS+1]; + int c; + int numchars; + + numchars = MIN (nc, MAX_CHARS); + //pw strncpy (cname, name, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = name[c]; + } + cname[numchars] = '\0'; + return GPTLquerycounters (cname, *t, papicounters_out); +} + +int gptlget_wallclock (const char *name, int *t, double *value, int nc) +{ + char cname[MAX_CHARS+1]; + int c; + int numchars; + + numchars = MIN (nc, MAX_CHARS); + //pw strncpy (cname, name, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = name[c]; + } + cname[numchars] = '\0'; + + return GPTLget_wallclock (cname, *t, value); +} + +int gptlget_eventvalue (const char *timername, const char *eventname, int *t, double *value, + int nc1, int nc2) +{ + char ctimername[MAX_CHARS+1]; + char ceventname[MAX_CHARS+1]; + int c; + int numchars; + + numchars = MIN (nc1, MAX_CHARS); + //pw strncpy (ctimername, timername, numchars); + for (c = 0; c < numchars; c++) { + ctimername[c] = timername[c]; + } + ctimername[numchars] = '\0'; + + numchars = MIN (nc2, MAX_CHARS); + //pw strncpy (ceventname, eventname, numchars); + for (c = 0; c < numchars; c++) { + ceventname[c] = eventname[c]; + } + ceventname[numchars] = '\0'; + + return GPTLget_eventvalue (ctimername, ceventname, *t, value); +} + +int gptlget_nregions (int *t, int *nregions) +{ + return GPTLget_nregions (*t, nregions); +} + +int gptlget_regionname (int *t, int *region, char *name, int nc) +{ + int n; + int ret; + + ret = GPTLget_regionname (*t, *region, name, nc); + /* Turn nulls into spaces for fortran */ + for (n = 0; n < nc; ++n) + if (name[n] == '\0') + name[n] = ' '; + return ret; +} + +int gptlget_memusage (int *size, int *rss, int *share, int *text, int *datastack) +{ + return GPTLget_memusage (size, rss, share, text, datastack); +} + +int gptlprint_memusage (const char *str, int nc) +{ + char cname[128+1]; + int c; + int numchars = MIN (nc, 128); + + //pw strncpy (cname, str, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = str[c]; + } + cname[numchars] = '\0'; + return GPTLprint_memusage (cname); +} + +#ifdef HAVE_PAPI +#include + +int gptl_papilibraryinit (void) +{ + return GPTL_PAPIlibraryinit (); +} + +int gptlevent_name_to_code (const char *str, int *code, int nc) +{ + char cname[PAPI_MAX_STR_LEN+1]; + int c; + int numchars = MIN (nc, PAPI_MAX_STR_LEN); + + //pw strncpy (cname, str, numchars); + for (c = 0; c < numchars; c++) { + cname[c] = str[c]; + } + cname[numchars] = '\0'; + + /* "code" is an int* and is an output variable */ + + return GPTLevent_name_to_code (cname, code); +} + +int gptlevent_code_to_name (int *code, char *str, int nc) +{ + int i; + + if (nc < PAPI_MAX_STR_LEN) + return GPTLerror ("gptl_event_code_to_name: output name must hold at least %d characters\n", + PAPI_MAX_STR_LEN); + + if (GPTLevent_code_to_name (*code, str) == 0) { + for (i = strlen(str); i < nc; ++i) + str[i] = ' '; + } else { + return GPTLerror (""); + } + return 0; +} +#else + +int gptl_papilibraryinit (void) +{ + return GPTL_PAPIlibraryinit (); +} + +int gptlevent_name_to_code (const char *str, int *code, int nc) +{ + return GPTLevent_name_to_code (str, code); +} + +int gptlevent_code_to_name (const int *code, char *str, int nc) +{ + return GPTLevent_code_to_name (*code, str); +} + +#endif diff --git a/share/timing/gptl.c b/share/timing/gptl.c new file mode 100644 index 0000000..1eeaccc --- /dev/null +++ b/share/timing/gptl.c @@ -0,0 +1,6026 @@ +/* +** $Id: gptl.c,v 1.157 2011-03-28 20:55:18 rosinski Exp $ +** +** Author: Jim Rosinski +** +** Main file contains most user-accessible GPTL functions +*/ + +#include /* malloc */ +#include /* gettimeofday */ +#include /* times */ +#include /* gettimeofday, syscall */ +#include +#include /* memset, strcmp (via STRMATCH), strncmp (via STRNMATCH) */ +#include /* isdigit */ +#include /* u_int8_t, u_int16_t */ +#include /* FLT_MAX */ +#include + +#ifndef HAVE_C99_INLINE +#define inline +#endif + +#ifdef HAVE_PAPI +#include /* PAPI_get_real_usec */ +#endif + +#ifdef HAVE_LIBRT +#include +#endif + +#ifdef _AIX +#include +#endif + +#include "private.h" +#include "gptl.h" + +static Timer **timers = 0; /* linked list of timers */ +static Timer **last = 0; /* last element in list */ +static int *max_depth; /* maximum indentation level encountered */ +static int *max_name_len; /* max length of timer name */ +static volatile int nthreads = -1; /* num threads. Init to bad value */ +static volatile int maxthreads = -1; /* max threads (=nthreads for OMP). Init to bad value */ +static int depthlimit = 99999; /* max depth for timers (99999 is effectively infinite) */ +static volatile bool disabled = false; /* Timers disabled? */ +static volatile bool initialized = false; /* GPTLinitialize has been called */ +static volatile bool pr_has_been_called = false; /* GPTLpr_file has been called */ +static Entry eventlist[MAX_AUX]; /* list of PAPI-based events to be counted */ +static int nevents = 0; /* number of PAPI events (init to 0) */ +static bool dousepapi = false; /* saves a function call if stays false */ +static bool verbose = false; /* output verbosity */ +static bool percent = false; /* print wallclock also as percent of 1st timers[0] */ +static bool dopr_preamble = true; /* whether to print preamble info */ +static bool dopr_threadsort = true; /* whether to print sorted thread stats */ +static bool dopr_multparent = true; /* whether to print multiple parent info */ +static bool dopr_collision = true; /* whether to print hash collision info */ +static bool dopr_quotes = false; /* whether to surround timer names with double quotes */ + +static time_t ref_gettimeofday = -1; /* ref start point for gettimeofday */ +static time_t ref_clock_gettime = -1;/* ref start point for clock_gettime */ +#ifdef _AIX +static time_t ref_read_real_time = -1; /* ref start point for read_real_time */ +#endif +static long long ref_papitime = -1; /* ref start point for PAPI_get_real_usec */ + +#if ( defined THREADED_OMP ) + +#include +static volatile int *threadid_omp = 0; /* array of thread ids */ + +#elif ( defined THREADED_PTHREADS ) + +#include + +#define MUTEX_API +#ifdef MUTEX_API +static volatile pthread_mutex_t t_mutex; +#else +static volatile pthread_mutex_t t_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif +static volatile pthread_t *threadid = 0; /* array of thread ids */ +static int lock_mutex (void); /* lock a mutex for entry into a critical region */ +static int unlock_mutex (void); /* unlock a mutex for exit from a critical region */ + +#else + +/* Unthreaded case */ +static int threadid = -1; + +#endif + +typedef struct { + const Option option; /* wall, cpu, etc. */ + const char *str; /* descriptive string for printing */ + bool enabled; /* flag */ +} Settings; + +/* For Summary stats */ + +typedef struct { + double wallmax; + double wallmin; + double walltotal; + int onflgs; + int processes; + int threads; +#ifdef HAVE_PAPI + double papimax[MAX_AUX]; + double papimin[MAX_AUX]; + double papitotal[MAX_AUX]; +#endif + unsigned long count; + int wallmax_p; /* over processes */ + int wallmax_t; /* over threads */ + int wallmin_p; + int wallmin_t; +#ifdef HAVE_PAPI + int papimax_p[MAX_AUX]; /* over processes */ + int papimax_t[MAX_AUX]; /* over threads */ + int papimin_p[MAX_AUX]; + int papimin_t[MAX_AUX]; +#endif +} Summarystats; + +/* Options, print strings, and default enable flags */ + +static Settings cpustats = {GPTLcpu, "Usr sys usr+sys ", false}; +static Settings wallstats = {GPTLwall, " Wallclock max min", true }; +static Settings overheadstats = {GPTLoverhead, " UTR Overhead " , true }; +static Settings profileovhd = {GPTLprofile_ovhd, "", false }; + +static Hashentry **hashtable; /* table of entries */ +static long ticks_per_sec; /* clock ticks per second */ +static char **timerlist; /* list of all timers */ + +typedef struct { + int val; /* depth in calling tree */ + int padding[31]; /* padding is to mitigate false cache sharing */ +} Nofalse; +static Timer ***callstack; /* call stack */ +static Nofalse *stackidx; /* index into callstack: */ + +static int prefix_len_nt; /* length of timer name prefix set outside parallel region */ +static char *prefix_nt; /* timer name prefix set outside of parallel region */ +static int *prefix_len; /* length of timer name prefix for each thread */ +static char **prefix; /* timer name prefix for each thread */ + +static Method method = GPTLmost_frequent; /* default parent/child printing mechanism */ +static PRMode print_mode = GPTLprint_write; /* default output mode */ + +/* Local function prototypes */ + +static void printstats (const Timer *, FILE *, const int, const int, const bool, double); +static void add (Timer *, const Timer *); + +static void get_threadstats (const int, const char *, Summarystats *); +static void get_summarystats (Summarystats *, const Summarystats *); +#ifdef HAVE_MPI +static int collect_data( const int, MPI_Comm, int *, Summarystats ** ); +#else +static int collect_data( const int, const int, int *, Summarystats ** ); +#endif +static int merge_thread_data(); + +static void print_multparentinfo (FILE *, Timer *); +static inline int get_cpustamp (long *, long *); +static int newchild (Timer *, Timer *); +static int get_max_depth (const Timer *, const int); +static int num_descendants (Timer *); +static int is_descendant (const Timer *, const Timer *); +static int show_descendant (const int, const Timer *, const Timer *); +static char *methodstr (Method); +static char *modestr (PRMode); + +/* Prototypes from previously separate file threadutil.c */ + +static int threadinit (void); /* initialize threading environment */ +static void threadfinalize (void); /* finalize threading environment */ +static void print_threadmapping (FILE *); /* print mapping of thread ids */ +static inline int get_thread_num (void); /* get 0-based thread number */ +static int serial_region (void); /* check whether in a serial region */ + +/* These are the (possibly) supported underlying wallclock timers */ + +static inline double utr_nanotime (void); +static inline double utr_mpiwtime (void); +static inline double utr_clock_gettime (void); +static inline double utr_papitime (void); +static inline double utr_read_real_time (void); +static inline double utr_gettimeofday (void); + +static int init_nanotime (void); +static int init_mpiwtime (void); +static int init_clock_gettime (void); +static int init_papitime (void); +static int init_read_real_time (void); +static int init_gettimeofday (void); + +static double utr_getoverhead (void); +static inline Timer *getentry_instr (const Hashentry *, void *, unsigned int *); +static inline Timer *getentry (const Hashentry *, const char *, unsigned int *); +static inline Timer *getentryf (const Hashentry *, const char *, const int, unsigned int *); +static void printself_andchildren (const Timer *, FILE *, const int, const int, const double); +static inline int update_parent_info (Timer *, Timer **, int); +static inline int update_stats (Timer *, const double, const long, const long, const int); +static int update_ll_hash (Timer *, const int, const unsigned int); +static inline int update_ptr (Timer *, const int); +static int construct_tree (Timer *, Method); + +static int cmp (const void *, const void *); +static int ncmp (const void *, const void *); +static int get_index ( const char *, const char *); + +static int add_prefix( char *, const char *, const int, const int); + +typedef struct { + const Funcoption option; + double (*func)(void); + int (*funcinit)(void); + const char *name; +} Funcentry; + +static Funcentry funclist[] = { + {GPTLgettimeofday, utr_gettimeofday, init_gettimeofday, "gettimeofday"}, + {GPTLnanotime, utr_nanotime, init_nanotime, "nanotime"}, + {GPTLmpiwtime, utr_mpiwtime, init_mpiwtime, "MPI_Wtime"}, + {GPTLclockgettime, utr_clock_gettime, init_clock_gettime, "clock_gettime"}, + {GPTLpapitime, utr_papitime, init_papitime, "PAPI_get_real_usec"}, + {GPTLread_real_time, utr_read_real_time, init_read_real_time,"read_real_time"} /* AIX only */ +}; +static const int nfuncentries = sizeof (funclist) / sizeof (Funcentry); + +static double (*ptr2wtimefunc)() = 0; /* init to invalid */ +static int funcidx = 0; /* default timer is gettimeofday */ + +#ifdef HAVE_NANOTIME +static float cpumhz = -1.; /* init to bad value */ +static double cyc2sec = -1; /* init to bad value */ +static unsigned inline long long nanotime (void); /* read counter (assembler) */ +static float get_clockfreq (void); /* cycles/sec */ +#endif + +#define DEFAULT_TABLE_SIZE 2048 +static int tablesize = DEFAULT_TABLE_SIZE; /* per-thread size of hash table (settable parameter) */ +static char *outdir = 0; /* dir to write output files to (currently unused) */ + +static double overhead_utr = 0.0; /* timer cost estimate */ +static double overhead_est = 0.0; /* direct measurement of overhead for thread 0 */ +static double overhead_bound = 0.0; /* direct measurement of overhead for thread 0 */ + +/* VERBOSE is a debugging ifdef local to the rest of this file */ +#undef VERBOSE + +/* +** GPTLsetoption: set option value to true or false. +** +** Input arguments: +** option: option to be set +** val: value to which option should be set (nonzero=true, zero=false) +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLsetoption (const int option, /* option */ + const int val) /* value */ +{ + static const char *thisfunc = "GPTLsetoption"; + + if (initialized) + return GPTLerror ("%s: must be called BEFORE GPTLinitialize\n", thisfunc); + + if (option == GPTLabort_on_error) { + GPTLset_abort_on_error ((bool) val); + if (verbose) + printf ("%s: boolean abort_on_error = %d\n", thisfunc, val); + return 0; + } + + switch (option) { + case GPTLcpu: +#ifdef HAVE_TIMES + cpustats.enabled = (bool) val; + if (verbose) + printf ("%s: cpustats = %d\n", thisfunc, val); +#else + if (val) + return GPTLerror ("%s: times() not available\n", thisfunc); +#endif + return 0; + case GPTLwall: + wallstats.enabled = (bool) val; + if (verbose) + printf ("%s: boolean wallstats = %d\n", thisfunc, val); + return 0; + case GPTLoverhead: + overheadstats.enabled = (bool) val; + if (verbose) + printf ("%s: boolean overheadstats = %d\n", thisfunc, val); + return 0; + case GPTLprofile_ovhd: + profileovhd.enabled = (bool) val; + if (verbose) + printf ("%s: boolean profileovhd = %d\n", thisfunc, val); + return 0; + case GPTLdepthlimit: + depthlimit = val; + if (verbose) + printf ("%s: depthlimit = %d\n", thisfunc, val); + return 0; + case GPTLverbose: + verbose = (bool) val; +#ifdef HAVE_PAPI + (void) GPTL_PAPIsetoption (GPTLverbose, val); +#endif + if (verbose) + printf ("%s: boolean verbose = %d\n", thisfunc, val); + return 0; + case GPTLpercent: + percent = (bool) val; + if (verbose) + printf ("%s: boolean percent = %d\n", thisfunc, val); + return 0; + case GPTLdopr_preamble: + dopr_preamble = (bool) val; + if (verbose) + printf ("%s: boolean dopr_preamble = %d\n", thisfunc, val); + return 0; + case GPTLdopr_threadsort: + dopr_threadsort = (bool) val; + if (verbose) + printf ("%s: boolean dopr_threadsort = %d\n", thisfunc, val); + return 0; + case GPTLdopr_multparent: + dopr_multparent = (bool) val; + if (verbose) + printf ("%s: boolean dopr_multparent = %d\n", thisfunc, val); + return 0; + case GPTLdopr_collision: + dopr_collision = (bool) val; + if (verbose) + printf ("%s: boolean dopr_collision = %d\n", thisfunc, val); + return 0; + case GPTLdopr_quotes: + dopr_quotes = (bool) val; + if (verbose) + printf ("%s: boolean dopr_quotes = %d\n", thisfunc, val); + return 0; + case GPTLprint_mode: + print_mode = (PRMode) val; + if (verbose) + printf ("%s: print_mode = %s\n", thisfunc, modestr (print_mode)); + return 0; + case GPTLprint_method: + method = (Method) val; + if (verbose) + printf ("%s: print_method = %s\n", thisfunc, methodstr (method)); + return 0; + case GPTLtablesize: + if (val < 1) + return GPTLerror ("%s: tablesize must be positive. %d is invalid\n", thisfunc, val); + tablesize = val; + if (verbose) + printf ("%s: tablesize = %d\n", thisfunc, tablesize); + return 0; + case GPTLsync_mpi: +#ifdef ENABLE_PMPI + if (GPTLpmpi_setoption (option, val) != 0) + fprintf (stderr, "%s: GPTLpmpi_setoption failure\n", thisfunc); +#endif + if (verbose) + printf ("%s: boolean sync_mpi = %d\n", thisfunc, val); + return 0; + + case GPTLmaxthreads: + if (val < 1) + return GPTLerror ("%s: maxthreads must be positive. %d is invalid\n", thisfunc, val); + + maxthreads = val; + return 0; + + /* + ** Allow GPTLmultiplex to fall through because it will be handled by + ** GPTL_PAPIsetoption() + */ + + case GPTLmultiplex: + default: + break; + } + +#ifdef HAVE_PAPI + if (GPTL_PAPIsetoption (option, val) == 0) { + if (val) + dousepapi = true; + return 0; + } +#else + /* Make GPTLnarrowprint a placebo if PAPI not enabled */ + + if (option == GPTLnarrowprint) + return 0; +#endif + + return GPTLerror ("%s: faiure to enable option %d\n", thisfunc, option); +} + +/* +** GPTLsetutr: set underlying timing routine. +** +** Input arguments: +** option: index which sets function +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLsetutr (const int option) +{ + int i; /* index over number of underlying timer */ + static const char *thisfunc = "GPTLsetutr"; + + if (initialized) + return GPTLerror ("%s: must be called BEFORE GPTLinitialize\n", thisfunc); + + for (i = 0; i < nfuncentries; i++) { + if (option == (int) funclist[i].option) { + if (verbose) + printf ("%s: underlying wallclock timer = %s\n", thisfunc, funclist[i].name); + funcidx = i; + + /* + ** Return an error condition if the function is not available. + ** OK for the user code to ignore: GPTLinitialize() will reset to gettimeofday + */ + + if ((*funclist[i].funcinit)() < 0) + return GPTLerror ("%s: utr=%s not available\n", thisfunc, funclist[i].name); + else + return 0; + } + } + return GPTLerror ("%s: unknown option %d\n", thisfunc, option); +} + +/* +** GPTLinitialize (): Initialization routine must be called from single-threaded +** region before any other timing routines may be called. The need for this +** routine could be eliminated if not targetting timing library for threaded +** capability. +** +** return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLinitialize (void) +{ + int i; /* loop index */ + int t; /* thread index */ + double t1, t2; /* returned from underlying timer */ + static const char *thisfunc = "GPTLinitialize"; + + if (initialized) + return GPTLerror ("%s: has already been called\n", thisfunc); + + if (threadinit () < 0) + return GPTLerror ("%s: bad return from threadinit\n", thisfunc); + + if ((ticks_per_sec = sysconf (_SC_CLK_TCK)) == -1) + return GPTLerror ("%s: failure from sysconf (_SC_CLK_TCK)\n", thisfunc); + + /* Allocate space for global arrays */ + + callstack = (Timer ***) GPTLallocate (maxthreads * sizeof (Timer **)); + stackidx = (Nofalse *) GPTLallocate (maxthreads * sizeof (Nofalse)); + timers = (Timer **) GPTLallocate (maxthreads * sizeof (Timer *)); + last = (Timer **) GPTLallocate (maxthreads * sizeof (Timer *)); + max_depth = (int *) GPTLallocate (maxthreads * sizeof (int)); + max_name_len = (int *) GPTLallocate (maxthreads * sizeof (int)); + hashtable = (Hashentry **) GPTLallocate (maxthreads * sizeof (Hashentry *)); + prefix_len = (int *) GPTLallocate (maxthreads * sizeof (int)); + prefix = (char **) GPTLallocate (maxthreads * sizeof (char *)); + + /* Initialize array values */ + + for (t = 0; t < maxthreads; t++) { + max_depth[t] = -1; + max_name_len[t] = 0; + callstack[t] = (Timer **) GPTLallocate (MAX_STACK * sizeof (Timer *)); + hashtable[t] = (Hashentry *) GPTLallocate (tablesize * sizeof (Hashentry)); + for (i = 0; i < tablesize; i++) { + hashtable[t][i].nument = 0; + hashtable[t][i].entries = 0; + } + + /* + ** Make a timer "GPTL_ROOT" to ensure no orphans, and to simplify printing. + */ + + timers[t] = (Timer *) GPTLallocate (sizeof (Timer)); + memset (timers[t], 0, sizeof (Timer)); + strcpy (timers[t]->name, "GPTL_ROOT"); + timers[t]->onflg = true; + last[t] = timers[t]; + + stackidx[t].val = 0; + callstack[t][0] = timers[t]; + for (i = 1; i < MAX_STACK; i++) + callstack[t][i] = 0; + + prefix_len[t] = 0; + prefix[t] = (char *) GPTLallocate ((MAX_CHARS+1) * sizeof (char)); + prefix[t][0] = '\0'; + } + + prefix_len_nt = 0; + prefix_nt = (char *) GPTLallocate ((MAX_CHARS+1) * sizeof (char)); + prefix_nt[0] = '\0'; + +#ifdef HAVE_PAPI + if (GPTL_PAPIinitialize (maxthreads, verbose, &nevents, eventlist) < 0) + return GPTLerror ("%s: Failure from GPTL_PAPIinitialize\n", thisfunc); +#endif + + /* + ** Call init routine for underlying timing routine. + */ + + if ((*funclist[funcidx].funcinit)() < 0) { + fprintf (stderr, "%s: Failure initializing %s. Reverting underlying timer to %s\n", + thisfunc, funclist[funcidx].name, funclist[0].name); + funcidx = 0; + } + + ptr2wtimefunc = funclist[funcidx].func; + + if (verbose) { + t1 = (*ptr2wtimefunc) (); + t2 = (*ptr2wtimefunc) (); + if (t1 > t2) + fprintf (stderr, "%s: negative delta-t=%g\n", thisfunc, t2-t1); + + printf ("Per call overhead est. t2-t1=%g should be near zero\n", t2-t1); + printf ("Underlying wallclock timing routine is %s\n", funclist[funcidx].name); + } + + /* set global timer overhead estimate */ + if (wallstats.enabled && profileovhd.enabled){ + overhead_utr = utr_getoverhead (); + } + + initialized = true; + return 0; +} + +/* +** GPTLfinalize (): Finalization routine must be called from single-threaded +** region. Free all malloc'd space +** +** return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLfinalize (void) +{ + int t; /* thread index */ + int n; /* array index */ + Timer *ptr, *ptrnext; /* ll indices */ + static const char *thisfunc = "GPTLfinalize"; + + if ( ! initialized) + return GPTLerror ("%s: initialization was not completed\n", thisfunc); + + for (t = 0; t < maxthreads; ++t) { + for (n = 0; n < tablesize; ++n) { + if (hashtable[t][n].nument > 0) + free (hashtable[t][n].entries); + } + free (hashtable[t]); + hashtable[t] = NULL; + free (callstack[t]); + free (prefix[t]); + for (ptr = timers[t]; ptr; ptr = ptrnext) { + ptrnext = ptr->next; + if (ptr->nparent > 0) { + free (ptr->parent); + free (ptr->parent_count); + } + if (ptr->nchildren > 0) + free (ptr->children); + free (ptr); + } + } + + free (callstack); + free (stackidx); + free (timers); + free (last); + free (max_depth); + free (max_name_len); + free (hashtable); + free (prefix_len); + free (prefix); + free (prefix_nt); + + threadfinalize (); + +#ifdef HAVE_PAPI + GPTL_PAPIfinalize (maxthreads); +#endif + + /* Reset initial values */ + + timers = 0; + last = 0; + max_depth = 0; + max_name_len = 0; + nthreads = -1; + maxthreads = -1; + depthlimit = 99999; + disabled = false; + initialized = false; + pr_has_been_called = false; + dousepapi = false; + verbose = false; + percent = false; + dopr_preamble = true; + dopr_threadsort = true; + dopr_multparent = true; + dopr_collision = true; + print_mode = GPTLprint_write; + ref_gettimeofday = -1; + ref_clock_gettime = -1; +#ifdef _AIX + ref_read_real_time = -1; +#endif + ref_papitime = -1; + funcidx = 0; +#ifdef HAVE_NANOTIME + cpumhz= 0; + cyc2sec = -1; +#endif + outdir = 0; + tablesize = DEFAULT_TABLE_SIZE; + prefix_len_nt = 0; + + return 0; +} + +/* +** GPTLprefix_set: define prefix for subsequent timer names +** +** Input arguments: +** prefixname: prefix string +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLprefix_set (const char *prefixname) /* prefix string */ +{ + int t; /* thread index (of this thread) */ + int len_prefix; /* number of characters in prefix */ + char *ptr_prefix; /* pointer to prefix string */ + static const char *thisfunc = "GPTLprefix_set"; + + if (disabled) + return 0; + + if ( ! initialized){ + return 0; + } + +#if ( defined THREADED_PTHREADS ) + /* + ** prefix logic not enabled when using PTHREADS + */ + return 0; +#endif + + len_prefix = MIN (strlen (prefixname), MAX_CHARS); + + /* + ** Note: if in a parallel region with only one active thread, e.g. + ** thread 0, this will NOT be identified as a serial regions. + ** If want GPTLprefix_set to apply to all threads, will need to + ** "fire up" the idle threads in some sort of parallel loop. + ** It is not safe to just test omp_in_parallel and + ** omp_get_thread_num == 1 unless add a thread barrier, and this + ** barrier would apply to all calls, so would be a performance bottleneck. + */ + + if (serial_region()){ + + prefix_len_nt = len_prefix; + ptr_prefix = prefix_nt; + + } else { + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + prefix_len[t] = len_prefix; + ptr_prefix = prefix[t]; + + } + + strncpy (ptr_prefix, prefixname, len_prefix); + + return (0); +} + +/* +** GPTLprefix_setf: define prefix for subsequent timer names when +** the string may not be null terminated +** +** Input arguments: +** prefixname: prefix string +** prefixlen: number of characters in timer name +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLprefix_setf (const char *prefixname, const int prefixlen) /* prefix string and length*/ +{ + int t; /* thread index (of this thread) */ + int c; /* character index */ + int len_prefix; /* number of characters in prefix */ + char *ptr_prefix; /* pointer to prefix string */ + static const char *thisfunc = "GPTLprefix_setf"; + + if (disabled) + return 0; + + if ( ! initialized){ + return 0; + } + +#if ( defined THREADED_PTHREADS ) + /* + ** prefix logic not enabled when using PTHREADS + */ + return 0; +#endif + + len_prefix = MIN (prefixlen, MAX_CHARS); + + /* + ** Note: if in a parallel region with only one active thread, e.g. + ** thread 0, this will NOT be identified as a serial regions. + ** If want GPTLprefix_setf to apply to all threads, will need to + ** "fire up" the idle threads in some sort of parallel loop. + ** It is not safe to just test omp_in_parallel and + ** omp_get_thread_num == 1 unless add a thread barrier, and this + ** barrier would apply to all calls, so would be a performance bottleneck. + */ + + if (serial_region()){ + + prefix_len_nt = len_prefix; + ptr_prefix = prefix_nt; + + } else { + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + prefix_len[t] = len_prefix; + ptr_prefix = prefix[t]; + + } + + for (c = 0; c < len_prefix; c++) { + ptr_prefix[c] = prefixname[c]; + } + ptr_prefix[len_prefix] = '\0'; + + return (0); +} + +/* +** GPTLprefix_unset: undefine prefix for subsequent timer names +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLprefix_unset () +{ + int t; /* thread index (of this thread) */ + int c; /* character index */ + char *ptr_prefix; /* pointer to prefix string */ + static const char *thisfunc = "GPTLprefix_setf"; + + if (disabled) + return 0; + + if ( ! initialized){ + return 0; + } + +#if ( defined THREADED_PTHREADS ) + /* + ** prefix logic not enabled when using PTHREADS + */ + return 0; +#endif + + /* + ** Note: if in a parallel region with only one active thread, e.g. + ** thread 0, this will NOT be identified as a serial regions. + ** If want GPTLprefix_unset to apply to all threads, will need to + ** "fire up" the idle threads in some sort of parallel loop. + ** It is not safe to just test omp_in_parallel and + ** omp_get_thread_num == 1 unless add a thread barrier, and this + ** barrier would apply to all calls, so would be a performance bottleneck. + */ + + if (serial_region()){ + + prefix_len_nt = 0; + ptr_prefix = prefix_nt; + + } else { + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + prefix_len[t] = 0; + ptr_prefix = prefix[t]; + + } + + ptr_prefix[0] = '\0'; + + return (0); +} + +/* +** GPTLstart_instr: start a timer (auto-instrumented) +** +** Input arguments: +** self: function address +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstart_instr (void *self) +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread index (of this thread) */ + unsigned int indx; /* hash table index */ + static const char *thisfunc = "GPTLstart_instr"; + + if (disabled) + return 0; + + if ( ! initialized) + return GPTLerror ("%s self=%p: GPTLinitialize has not been called\n", thisfunc, self); + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** increment and return + */ + + if (stackidx[t].val >= depthlimit) { + ++stackidx[t].val; + return 0; + } + + ptr = getentry_instr (hashtable[t], self, &indx); + + /* + ** Recursion => increment depth in recursion and return. We need to return + ** because we don't want to restart the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr && ptr->onflg) { + ++ptr->recurselvl; + return 0; + } + + /* + ** Increment stackidx[t] unconditionally. This is necessary to ensure the correct + ** behavior when GPTLstop_instr decrements stackidx[t] unconditionally. + */ + + if (++stackidx[t].val > MAX_STACK-1) + return GPTLerror ("%s: stack too big\n", thisfunc); + + if ( ! ptr) { /* Add a new entry and initialize */ + ptr = (Timer *) GPTLallocate (sizeof (Timer)); + memset (ptr, 0, sizeof (Timer)); + + /* + ** Need to save the address string for later conversion back to a real + ** name by an offline tool. + */ + + snprintf (ptr->name, MAX_CHARS+1, "%lx", (unsigned long) self); + ptr->address = self; + + if (update_ll_hash (ptr, t, indx) != 0) + return GPTLerror ("%s: update_ll_hash error\n", thisfunc); + } + + if (update_parent_info (ptr, callstack[t], stackidx[t].val) != 0) + return GPTLerror ("%s: update_parent_info error\n", thisfunc); + + if (update_ptr (ptr, t) != 0) + return GPTLerror ("%s: update_ptr error\n", thisfunc); + + return (0); +} + +/* +** GPTLstart: start a timer +** +** Input arguments: +** timername: timer name +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstart (const char *timername) /* timer name */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread index (of this thread) */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + unsigned int indx; /* hash table index */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstart"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** increment and return + */ + + if (stackidx[t].val >= depthlimit) { + ++stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* first caliper timestamp */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + numchars = MIN (strlen (name), MAX_CHARS); + } + + /* + ** ptr will point to the requested timer in the current list, + ** or NULL if this is a new entry + */ + + ptr = getentry (hashtable[t], name, &indx); + + /* + ** Recursion => increment depth in recursion and return. We need to return + ** because we don't want to restart the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr && ptr->onflg) { + ++ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + return 0; + } + + /* + ** Increment stackidx[t] unconditionally. This is necessary to ensure the correct + ** behavior when GPTLstop decrements stackidx[t] unconditionally. + */ + + if (++stackidx[t].val > MAX_STACK-1) + return GPTLerror ("%s: stack too big\n", thisfunc); + + if ( ! ptr) { /* Add a new entry and initialize */ + ptr = (Timer *) GPTLallocate (sizeof (Timer)); + memset (ptr, 0, sizeof (Timer)); + + //pw numchars = MIN (strlen (name), MAX_CHARS); + strncpy (ptr->name, name, numchars); + ptr->name[numchars] = '\0'; + + if (update_ll_hash (ptr, t, indx) != 0) + return GPTLerror ("%s: update_ll_hash error\n", thisfunc); + } + + if (update_parent_info (ptr, callstack[t], stackidx[t].val) != 0) + return GPTLerror ("%s: update_parent_info error\n", thisfunc); + + if (update_ptr (ptr, t) != 0) + return GPTLerror ("%s: update_ptr error\n", thisfunc); + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return (0); +} + +/* +** GPTLstart_handle: start a timer based on a handle +** +** Input arguments: +** name: timer name (required when on input, handle=0) +** handle: pointer to timer matching "name" +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstart_handle (const char *name, /* timer name */ + void **handle) /* handle (output if input value is 0) */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread index (of this thread) */ + int numchars; /* number of characters to copy */ + unsigned int indx = (unsigned int) -1; /* hash table index: init to bad value */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + static const char *thisfunc = "GPTLstart_handle"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** increment and return + */ + + if (stackidx[t].val >= depthlimit) { + ++stackidx[t].val; + return 0; + } + + /* + ** If prefix string is defined, then call GPTLstart and + ** return a handle of 0. Otherwise a change in the prefix + ** might be ignored if the handle has already been set. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + *handle = 0; + return GPTLstart (name); + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* first caliper timestamp */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If on input, handle references a non-zero value, assume it's a previously returned Timer* + ** passed in by the user. If zero, generate the hash entry and return it to the user. + */ + + if (*handle) { + ptr = (Timer *) *handle; + } else { + ptr = getentry (hashtable[t], name, &indx); + } + + /* + ** Recursion => increment depth in recursion and return. We need to return + ** because we don't want to restart the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr && ptr->onflg) { + ++ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return 0; + } + + /* + ** Increment stackidx[t] unconditionally. This is necessary to ensure the correct + ** behavior when GPTLstop decrements stackidx[t] unconditionally. + */ + + if (++stackidx[t].val > MAX_STACK-1) + return GPTLerror ("%s: stack too big\n", thisfunc); + + if ( ! ptr) { /* Add a new entry and initialize */ + ptr = (Timer *) GPTLallocate (sizeof (Timer)); + memset (ptr, 0, sizeof (Timer)); + + numchars = MIN (strlen (name), MAX_CHARS); + strncpy (ptr->name, name, numchars); + ptr->name[numchars] = '\0'; + + if (update_ll_hash (ptr, t, indx) != 0) + return GPTLerror ("%s: update_ll_hash error\n", thisfunc); + } + + if (update_parent_info (ptr, callstack[t], stackidx[t].val) != 0) + return GPTLerror ("%s: update_parent_info error\n", thisfunc); + + if (update_ptr (ptr, t) != 0) + return GPTLerror ("%s: update_ptr error\n", thisfunc); + + /* + ** If on input, *handle was 0, return the pointer to the timer for future input + */ + + if ( ! *handle) + *handle = (void *) ptr; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return (0); +} + +/* +** GPTLstartf: start a timer when the timer name may not be null terminated +** +** Input arguments: +** timername: timer name +** namelen: number of characters in timer name +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstartf (const char *timername, const int namelen) /* timer name and length */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread index (of this thread) */ + int c; /* character index */ + int numchars; /* number of characters to copy */ + unsigned int indx; /* hash table index */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstartf"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** increment and return + */ + + if (stackidx[t].val >= depthlimit) { + ++stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* first caliper timestamp */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + numchars = MIN (namelen, MAX_CHARS); + name = timername; + } + + /* + ** ptr will point to the requested timer in the current list, + ** or NULL if this is a new entry + */ + + ptr = getentryf (hashtable[t], name, numchars, &indx); + + /* + ** Recursion => increment depth in recursion and return. We need to return + ** because we don't want to restart the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr && ptr->onflg) { + ++ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return 0; + } + + /* + ** Increment stackidx[t] unconditionally. This is necessary to ensure the correct + ** behavior when GPTLstop decrements stackidx[t] unconditionally. + */ + + if (++stackidx[t].val > MAX_STACK-1) + return GPTLerror ("%s: stack too big\n", thisfunc); + + if ( ! ptr) { /* Add a new entry and initialize */ + ptr = (Timer *) GPTLallocate (sizeof (Timer)); + memset (ptr, 0, sizeof (Timer)); + + //pw numchars = MIN (namelen, MAX_CHARS); + //pw strncpy (ptr->name, name, numchars); + for (c = 0; c < numchars; c++) { + ptr->name[c] = name[c]; + } + ptr->name[numchars] = '\0'; + + if (update_ll_hash (ptr, t, indx) != 0) + return GPTLerror ("%s: update_ll_hash error\n", thisfunc); + } + + if (update_parent_info (ptr, callstack[t], stackidx[t].val) != 0) + return GPTLerror ("%s: update_parent_info error\n", thisfunc); + + if (update_ptr (ptr, t) != 0) + return GPTLerror ("%s: update_ptr error\n", thisfunc); + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return (0); +} + +/* +** GPTLstartf_handle: start a timer based on a handle +** when the timer name may not be null terminated +** +** Input arguments: +** name: timer name (required when on input, handle=0) +** namelen: number of characters in timer name +** handle: pointer to timer matching "name" +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstartf_handle (const char *name, /* timer name */ + const int namelen, /* timer name length */ + void **handle) /* handle (output if input value is 0) */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread index (of this thread) */ + int c; /* character index */ + int numchars; /* number of characters to copy */ + unsigned int indx = (unsigned int) -1; /* hash table index: init to bad value */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + static const char *thisfunc = "GPTLstartf_handle"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** increment and return + */ + + if (stackidx[t].val >= depthlimit) { + ++stackidx[t].val; + return 0; + } + + /* + ** If prefix string is defined, then call GPTLstartf and + ** return a handle of 0. Otherwise a change in the prefix + ** might be ignored if the handle has already been set. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + *handle = 0; + return GPTLstartf (name, namelen); + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* first caliper timestamp */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If on input, handle references a non-zero value, assume it's a previously returned Timer* + ** passed in by the user. If zero, generate the hash entry and return it to the user. + */ + + if (*handle) { + ptr = (Timer *) *handle; + } else { + numchars = MIN (namelen, MAX_CHARS); + ptr = getentryf (hashtable[t], name, numchars, &indx); + } + + /* + ** Recursion => increment depth in recursion and return. We need to return + ** because we don't want to restart the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr && ptr->onflg) { + ++ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return 0; + } + + /* + ** Increment stackidx[t] unconditionally. This is necessary to ensure the correct + ** behavior when GPTLstop decrements stackidx[t] unconditionally. + */ + + if (++stackidx[t].val > MAX_STACK-1) + return GPTLerror ("%s: stack too big\n", thisfunc); + + if ( ! ptr) { /* Add a new entry and initialize */ + ptr = (Timer *) GPTLallocate (sizeof (Timer)); + memset (ptr, 0, sizeof (Timer)); + + numchars = MIN (namelen, MAX_CHARS); + //pw strncpy (ptr->name, name, numchars); + for (c = 0; c < numchars; c++) { + ptr->name[c] = name[c]; + } + ptr->name[numchars] = '\0'; + + if (update_ll_hash (ptr, t, indx) != 0) + return GPTLerror ("%s: update_ll_hash error\n", thisfunc); + } + + if (update_parent_info (ptr, callstack[t], stackidx[t].val) != 0) + return GPTLerror ("%s: update_parent_info error\n", thisfunc); + + if (update_ptr (ptr, t) != 0) + return GPTLerror ("%s: update_ptr error\n", thisfunc); + + /* + ** If on input, *handle was 0, return the pointer to the timer for future input + */ + + if ( ! *handle) + *handle = (void *) ptr; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tpa) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tpa) + 2*overhead_utr); + } + } + + return (0); +} + +/* +** add_prefix: add prefix string to timer name +** +** Input arguments: +** new_name: new name +** timername: timer name +** namelen: length of timer name +** t: thread id +** +** Return value: length of new name +*/ + +static int add_prefix (char *new_name, const char *timername, const int namelen, const int t) +{ + int numchars; /* number of characters to copy */ + int c; /* character index */ + + /* add prefix from serial region */ + numchars = MIN (prefix_len_nt, MAX_CHARS); + for (c = 0; c < numchars; c++) { + new_name[c] = prefix_nt[c]; + } + + /* add thread-specific prefix */ + numchars = MIN (prefix_len[t], MAX_CHARS-prefix_len_nt); + for (c = 0; c < numchars; c++) { + new_name[c+prefix_len_nt] = prefix[t][c]; + } + + /* add timer name */ + numchars = MIN (namelen, MAX_CHARS-prefix_len_nt-prefix_len[t]); + for (c = 0; c < numchars; c++) { + new_name[c+prefix_len_nt+prefix_len[t]] = timername[c]; + } + + /* add string terminator */ + numchars = MIN (namelen+prefix_len_nt+prefix_len[t], MAX_CHARS); + new_name[numchars] = '\0'; + + return numchars; +} + +/* +** update_ll_hash: Update linked list and hash table. +** Called by GPTLstart(f), GPTLstart_instr, +** and GPTLstart(f)_handle. +** +** Input arguments: +** ptr: pointer to timer +** t: thread index +** indx: hash index +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static int update_ll_hash (Timer *ptr, const int t, const unsigned int indx) +{ + int nchars; /* number of chars */ + int nument; /* number of entries */ + Timer **eptr; /* for realloc */ + + nchars = strlen (ptr->name); + if (nchars > max_name_len[t]) + max_name_len[t] = nchars; + + last[t]->next = ptr; + last[t] = ptr; + ++hashtable[t][indx].nument; + nument = hashtable[t][indx].nument; + + eptr = (Timer **) realloc (hashtable[t][indx].entries, nument * sizeof (Timer *)); + if ( ! eptr) + return GPTLerror ("update_ll_hash: realloc error\n"); + + hashtable[t][indx].entries = eptr; + hashtable[t][indx].entries[nument-1] = ptr; + + return 0; +} + +/* +** update_ptr: Update timer contents. +** Called by GPTLstart(f), GPTLstart_instr, and GPTLstart(f)_handle. +** +** Input arguments: +** ptr: pointer to timer +** t: thread index +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static inline int update_ptr (Timer *ptr, const int t) +{ + double tp2; /* time stamp */ + + ptr->onflg = true; + + if (cpustats.enabled && get_cpustamp (&ptr->cpu.last_utime, &ptr->cpu.last_stime) < 0) + return GPTLerror ("update_ptr: get_cpustamp error"); + + if (wallstats.enabled) { + tp2 = (*ptr2wtimefunc) (); + ptr->wall.last = tp2; + } + +#ifdef HAVE_PAPI + if (dousepapi && GPTL_PAPIstart (t, &ptr->aux) < 0) + return GPTLerror ("update_ptr: error from GPTL_PAPIstart\n"); +#endif + return 0; +} + +/* +** update_parent_info: update info about parent, and in the parent about this child +** +** Arguments: +** ptr: pointer to timer +** callstackt: callstack for this thread +** stackidxt: stack index for this thread +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static inline int update_parent_info (Timer *ptr, + Timer **callstackt, + int stackidxt) +{ + int n; /* loop index through known parents */ + Timer *pptr; /* pointer to parent in callstack */ + Timer **pptrtmp; /* for realloc parent pointer array */ + int nparent; /* number of parents */ + int *parent_count; /* number of times parent invoked this child */ + static const char *thisfunc = "update_parent_info"; + + if ( ! ptr ) + return -1; + + if (stackidxt < 0) + return GPTLerror ("%s: called with negative stackidx\n", thisfunc); + + callstackt[stackidxt] = ptr; + + /* + ** If the region has no parent, bump its orphan count + ** (should never happen since "GPTL_ROOT" added). + */ + + if (stackidxt == 0) { + ++ptr->norphan; + return 0; + } + + pptr = callstackt[stackidxt-1]; + + /* If this parent occurred before, bump its count */ + + for (n = 0; n < ptr->nparent; ++n) { + if (ptr->parent[n] == pptr) { + ++ptr->parent_count[n]; + break; + } + } + + /* If this is a new parent, update info */ + + if (n == ptr->nparent) { + ++ptr->nparent; + nparent = ptr->nparent; + pptrtmp = (Timer **) realloc (ptr->parent, nparent * sizeof (Timer *)); + if ( ! pptrtmp) + return GPTLerror ("%s: realloc error pptrtmp nparent=%d\n", thisfunc, nparent); + + ptr->parent = pptrtmp; + ptr->parent[nparent-1] = pptr; + parent_count = (int *) realloc (ptr->parent_count, nparent * sizeof (int)); + if ( ! parent_count) + return GPTLerror ("%s: realloc error parent_count nparent=%d\n", thisfunc, nparent); + + ptr->parent_count = parent_count; + ptr->parent_count[nparent-1] = 1; + } + + return 0; +} + +/* +** GPTLstop_instr: stop a timer (auto-instrumented) +** +** Input arguments: +** self: function address +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstop_instr (void *self) +{ + double tp1 = 0.0; /* time stamp */ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + unsigned int indx; /* index into hash table */ + long usr = 0; /* user time (returned from get_cpustamp) */ + long sys = 0; /* system time (returned from get_cpustamp) */ + static const char *thisfunc = "GPTLstop_instr"; + + if (disabled) + return 0; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* Get the timestamp */ + + if (wallstats.enabled) { + tp1 = (*ptr2wtimefunc) (); + } + + if (cpustats.enabled && get_cpustamp (&usr, &sys) < 0) + return GPTLerror ("%s: bad return from get_cpustamp\n", thisfunc); + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** decrement and return + */ + + if (stackidx[t].val > depthlimit) { + --stackidx[t].val; + return 0; + } + + ptr = getentry_instr (hashtable[t], self, &indx); + + if ( ! ptr) + return GPTLerror ("%s: timer for %p had not been started.\n", thisfunc, self); + + if ( ! ptr->onflg ) + return GPTLerror ("%s: timer %s was already off.\n", thisfunc, ptr->name); + + ++ptr->count; + + /* + ** Recursion => decrement depth in recursion and return. We need to return + ** because we don't want to stop the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr->recurselvl > 0) { + ++ptr->nrecurse; + --ptr->recurselvl; + return 0; + } + + if (update_stats (ptr, tp1, usr, sys, t) != 0) + return GPTLerror ("%s: error from update_stats\n", thisfunc); + + return 0; +} + +/* +** GPTLstop: stop a timer +** +** Input arguments: +** timername: timer name +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstop (const char *timername) /* timer name */ +{ + double tp1 = 0.0; /* time stamp */ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + int len_prefix; /* number of characters in prefix */ + unsigned int indx; /* index into hash table */ + long usr = 0; /* user time (returned from get_cpustamp) */ + long sys = 0; /* system time (returned from get_cpustamp) */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + char *ptr_prefix; /* pointer to prefix string */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstop"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + /* Get the timestamp */ + + if (wallstats.enabled) { + tp1 = (*ptr2wtimefunc) (); + } + + if (cpustats.enabled && get_cpustamp (&usr, &sys) < 0) + return GPTLerror ("%s: get_cpustamp error", thisfunc); + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** decrement and return + */ + + if (stackidx[t].val > depthlimit) { + --stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* dummy clock call, to capture earlier tp1 call */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + if ( ! (ptr = getentry (hashtable[t], name, &indx))) + return GPTLerror ("%s thread %d: timer for %s had not been started.\n", thisfunc, t, name); + + if ( ! ptr->onflg ) + return GPTLerror ("%s: timer %s was already off.\n", thisfunc, ptr->name); + + ++ptr->count; + + /* + ** Recursion => decrement depth in recursion and return. We need to return + ** because we don't want to stop the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr->recurselvl > 0) { + ++ptr->nrecurse; + --ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; + } + + if (update_stats (ptr, tp1, usr, sys, t) != 0) + return GPTLerror ("%s: error from update_stats\n", thisfunc); + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; +} + +/* +** GPTLstop_handle: stop a timer based on a handle +** +** Input arguments: +** name: timer name (used only for diagnostics) +** handle: pointer to timer +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstop_handle (const char *name, /* timer name */ + void **handle) /* handle (output if input value is 0) */ +{ + double tp1 = 0.0; /* time stamp */ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + unsigned int indx; /* index into hash table */ + long usr = 0; /* user time (returned from get_cpustamp) */ + long sys = 0; /* system time (returned from get_cpustamp) */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + static const char *thisfunc = "GPTLstop_handle"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If prefix string is defined, then call GPTLstop and + ** return a handle of 0. Otherwise a change in the prefix + ** might be ignored if the handle has already been set. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + *handle = 0; + return GPTLstop (name); + } + + /* Get the timestamp */ + + if (wallstats.enabled) { + tp1 = (*ptr2wtimefunc) (); + } + + if (cpustats.enabled && get_cpustamp (&usr, &sys) < 0) + return GPTLerror (0); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** decrement and return + */ + + if (stackidx[t].val > depthlimit) { + --stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* dummy clock call, to capture earlier tp1 call */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If on input, handle references a non-zero value, assume it's a previously returned Timer* + ** passed in by the user. If zero, generate the hash entry and return it to the user. + */ + + if (*handle) { + ptr = (Timer *) *handle; + } else { + if ( ! (ptr = getentry (hashtable[t], name, &indx))) + return GPTLerror ("%s thread %d: timer for %s had not been started.\n", thisfunc, t, name); + } + + if ( ! ptr->onflg ) + return GPTLerror ("%s: timer %s was already off.\n", thisfunc, ptr->name); + + ++ptr->count; + + /* + ** Recursion => decrement depth in recursion and return. We need to return + ** because we don't want to stop the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr->recurselvl > 0) { + ++ptr->nrecurse; + --ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; + } + + if (update_stats (ptr, tp1, usr, sys, t) != 0) + return GPTLerror ("%s: error from update_stats\n", thisfunc); + + /* + ** If on input, *handle was 0, return the pointer to the timer for future input + */ + + if ( ! *handle) + *handle = (void *) ptr; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; +} + +/* +** GPTLstopf: stop a timer when the timer name may not be null terminated +** +** Input arguments: +** timername: timer name +** namelen: number of characters in timer name +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstopf (const char *timername, const int namelen) /* timer name and length */ +{ + double tp1 = 0.0; /* time stamp */ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + int c; /* character index */ + int numchars; /* number of characters to copy */ + unsigned int indx; /* index into hash table */ + long usr = 0; /* user time (returned from get_cpustamp) */ + long sys = 0; /* system time (returned from get_cpustamp) */ + char strname[MAX_CHARS+1]; /* null terminated version of name */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstopf"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + /* Get the timestamp */ + + if (wallstats.enabled) { + tp1 = (*ptr2wtimefunc) (); + } + + if (cpustats.enabled && get_cpustamp (&usr, &sys) < 0) + return GPTLerror ("%s: get_cpustamp error", thisfunc); + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** decrement and return + */ + + if (stackidx[t].val > depthlimit) { + --stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* dummy clock call, to capture earlier tp1 call */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + numchars = MIN (namelen, MAX_CHARS); + name = timername; + } + + if ( ! (ptr = getentryf (hashtable[t], name, numchars, &indx))){ + //pw numchars = MIN (namelen, MAX_CHARS); + //pw strncpy (strname, name, numchars); + for (c = 0; c < numchars; c++) { + strname[c] = name[c]; + } + strname[numchars] = '\0'; + return GPTLerror ("%s thread %d: timer for %s had not been started.\n", thisfunc, t, strname); + } + + if ( ! ptr->onflg ) + return GPTLerror ("%s: timer %s was already off.\n", thisfunc, ptr->name); + + ++ptr->count; + + /* + ** Recursion => decrement depth in recursion and return. We need to return + ** because we don't want to stop the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr->recurselvl > 0) { + ++ptr->nrecurse; + --ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; + } + + if (update_stats (ptr, tp1, usr, sys, t) != 0) + return GPTLerror ("%s: error from update_stats\n", thisfunc); + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; +} + +/* +** GPTLstopf_handle: stop a timer based on a handle +** when the timer name may not be null terminated +** +** Input arguments: +** name: timer name (used only for diagnostics) +** namelen: number of characters in timer name +** handle: pointer to timer +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstopf_handle (const char *name, /* timer name */ + const int namelen, /* timer name length */ + void **handle) /* handle (output if input value is 0) */ +{ + double tp1 = 0.0; /* time stamp */ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + int c; /* character index */ + unsigned int indx; /* index into hash table */ + long usr = 0; /* user time (returned from get_cpustamp) */ + long sys = 0; /* system time (returned from get_cpustamp) */ + int numchars; /* number of characters to copy */ + char strname[MAX_CHARS+1]; /* null terminated version of name */ + double tpa = 0.0; /* time stamp */ + double tpb = 0.0; /* time stamp */ + static const char *thisfunc = "GPTLstopf_handle"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If prefix string is defined, then call GPTLstopf and + ** return a handle of 0. Otherwise a change in the prefix + ** might be ignored if the handle has already been set. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + *handle = 0; + return GPTLstopf (name, namelen); + } + + /* Get the timestamp */ + + if (wallstats.enabled) { + tp1 = (*ptr2wtimefunc) (); + } + + if (cpustats.enabled && get_cpustamp (&usr, &sys) < 0) + return GPTLerror (0); + + /* + ** If current depth exceeds a user-specified limit for print, just + ** decrement and return + */ + + if (stackidx[t].val > depthlimit) { + --stackidx[t].val; + return 0; + } + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* dummy clock call, to capture earlier tp1 call */ + tpa = (*ptr2wtimefunc) (); + } + } + + /* + ** If on input, handle references a non-zero value, assume it's a previously returned Timer* + ** passed in by the user. If zero, generate the hash entry and return it to the user. + */ + + if (*handle) { + ptr = (Timer *) *handle; + } else { + if ( ! (ptr = getentryf (hashtable[t], name, namelen, &indx))){ + numchars = MIN (namelen, MAX_CHARS); + //pw strncpy (strname, name, numchars); + for (c = 0; c < numchars; c++) { + strname[c] = name[c]; + } + strname[numchars] = '\0'; + return GPTLerror ("%s thread %d: timer for %s had not been started.\n", thisfunc, t, strname); + } + } + + if ( ! ptr->onflg ) + return GPTLerror ("%s: timer %s was already off.\n", thisfunc, ptr->name); + + ++ptr->count; + + /* + ** Recursion => decrement depth in recursion and return. We need to return + ** because we don't want to stop the timer. We want the reported time for + ** the timer to reflect the outermost layer of recursion. + */ + + if (ptr->recurselvl > 0) { + ++ptr->nrecurse; + --ptr->recurselvl; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; + } + + if (update_stats (ptr, tp1, usr, sys, t) != 0) + return GPTLerror ("%s: error from update_stats\n", thisfunc); + + /* + ** If on input, *handle was 0, return the pointer to the timer for future input + */ + + if ( ! *handle) + *handle = (void *) ptr; + + if (wallstats.enabled && profileovhd.enabled){ + if (t == 0){ + /* second caliper timestamp */ + tpb = (*ptr2wtimefunc) (); + /* subtract out additional overhead from caliper timing calls */ + overhead_est += ((tpb - tp1) - overhead_utr); + /* add in additional overhead due to caliper timing calls (probaby 2X what necessary) */ + overhead_bound += ((tpb - tp1) + 2*overhead_utr); + } + } + + return 0; +} + +/* +** update_stats: update stats inside ptr. Called by GPTLstop(f), GPTLstop_instr, +** GPTLstop(f)_handle +** +** Input arguments: +** ptr: pointer to timer +** tp1: input time stapm +** usr: user time +** sys: system time +** t: thread index +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static inline int update_stats (Timer *ptr, + const double tp1, + const long usr, + const long sys, + const int t) +{ + double delta; /* difference */ + static const char *thisfunc = "update_stats"; + + ptr->onflg = false; + --stackidx[t].val; + if (stackidx[t].val < -1) { + stackidx[t].val = -1; + return GPTLerror ("%s: tree depth has become negative.\n", thisfunc); + } + +#ifdef HAVE_PAPI + if (dousepapi && GPTL_PAPIstop (t, &ptr->aux) < 0) + return GPTLerror ("%s: error from GPTL_PAPIstop\n", thisfunc); +#endif + + if (wallstats.enabled) { + delta = tp1 - ptr->wall.last; + + if (delta < 0.) { + fprintf (stderr, "%s: negative delta=%g\n", thisfunc, delta); + delta = 0.0; + } + + ptr->wall.accum += delta; + ptr->wall.latest = delta; + + if (ptr->count == 1) { + ptr->wall.max = delta; + + ptr->wall.prev_min = FLT_MAX; + ptr->wall.min = delta; + ptr->wall.latest_is_min = 1; + } else { + if (delta > ptr->wall.max) + ptr->wall.max = delta; + if (delta < ptr->wall.min){ + ptr->wall.prev_min = ptr->wall.min; + ptr->wall.min = delta; + ptr->wall.latest_is_min = 1; + } else { + ptr->wall.latest_is_min = 0; + } + } + } + + if (cpustats.enabled) { + ptr->cpu.accum_utime += usr - ptr->cpu.last_utime; + ptr->cpu.accum_stime += sys - ptr->cpu.last_stime; + ptr->cpu.last_utime = usr; + ptr->cpu.last_stime = sys; + } + return 0; +} + +/* +** GPTLenable: enable timers +** +** Return value: 0 (success) +*/ + +int GPTLenable (void) +{ + disabled = false; + return (0); +} + +/* +** GPTLdisable: disable timers +** +** Return value: 0 (success) +*/ + +int GPTLdisable (void) +{ + disabled = true; + return (0); +} + +/* +** GPTLstamp: Compute timestamp of usr, sys, and wallclock time (seconds) +** +** Output arguments: +** wall: wallclock +** usr: user time +** sys: system time +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLstamp (double *wall, double *usr, double *sys) +{ + struct tms buf; /* argument to times */ + + if ( ! initialized) + return GPTLerror ("GPTLstamp: GPTLinitialize has not been called\n"); + +#ifdef HAVE_TIMES + *usr = 0; + *sys = 0; + + if (times (&buf) == -1) + return GPTLerror ("GPTLstamp: times() failed. Results bogus\n"); + + *usr = buf.tms_utime / (double) ticks_per_sec; + *sys = buf.tms_stime / (double) ticks_per_sec; +#endif + *wall = (*ptr2wtimefunc) (); + return 0; +} + +/* +** GPTLreset: reset all timers to 0 +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLreset (void) +{ + int t; /* index over threads */ + Timer *ptr; /* linked list index */ + static const char *thisfunc = "GPTLreset"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + for (t = 0; t < nthreads; t++) { + for (ptr = timers[t]; ptr; ptr = ptr->next) { + ptr->onflg = false; + ptr->count = 0; + memset (&ptr->wall, 0, sizeof (ptr->wall)); + memset (&ptr->cpu, 0, sizeof (ptr->cpu)); +#ifdef HAVE_PAPI + memset (&ptr->aux, 0, sizeof (ptr->aux)); +#endif + } + } + + if (verbose) + printf ("%s: accumulators for all timers set to zero\n", thisfunc); + + return 0; +} + +/* +** GPTLprint_mode_set: set output mode to use for +** GPTLpr_file and GPTLpr_summary_file +*/ + +int GPTLprint_mode_set (int pr_mode) +{ + print_mode = (PRMode) pr_mode; + return 0; +} + +/* +** GPTLprint_mode_query: query output mode used +** for GPTLpr_file and GPTLpr_summary_file +*/ + +int GPTLprint_mode_query (void) +{ + return (int) print_mode; +} + +/* +** GPTLpr: Print values of all timers +** +** Input arguments: +** id: integer to append to string "timing." +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLpr (const int id) /* output file will be named "timing." */ +{ + char outfile[14]; /* name of output file: timing.xxxxxx */ + static const char *thisfunc = "GPTLpr"; + + if (id < 0 || id > 999999) + return GPTLerror ("%s: bad id=%d for output file. Must be >= 0 and < 1000000\n", thisfunc, id); + + sprintf (outfile, "timing.%d", id); + + if (GPTLpr_file (outfile) != 0) + return GPTLerror ("%s: Error in GPTLpr_file\n", thisfunc); + + return 0; +} + +/* +** GPTLpr_file: Print values of all timers +** +** Input arguments: +** outfile: Name of output file to write +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLpr_file (const char *outfile) /* output file to write */ +{ + FILE *fp; /* file handle to write to */ + Timer *ptr; /* walk through master thread linked list */ + Timer *tptr; /* walk through slave threads linked lists */ + Timer sumstats; /* sum of same timer stats over threads */ + int i, ii, n, t; /* indices */ + int totent; /* per-thread collision count (diagnostic) */ + int nument; /* per-index collision count (diagnostic) */ + int totlen; /* length for malloc */ + unsigned long totcount; /* total timer invocations */ + char *outpath; /* path to output file: outdir/timing.xxxxxx */ + float *sum; /* sum of overhead values (per thread) */ + float osum; /* sum of overhead over threads */ + double utr_overhead; /* overhead of calling underlying timing routine */ + double tot_overhead; /* utr_overhead + papi overhead */ + double papi_overhead = 0; /* overhead of reading papi counters */ + bool found; /* jump out of loop when name found */ + bool foundany; /* whether summation print necessary */ + bool first; /* flag 1st time entry found */ + /* + ** Diagnostics for collisions and GPTL memory usage + */ + int num_zero; /* number of buckets with 0 collisions */ + int num_one; /* number of buckets with 1 collision */ + int num_two; /* number of buckets with 2 collisions */ + int num_more; /* number of buckets with more than 2 collisions */ + int most; /* biggest collision count */ + int numtimers = 0; /* number of timers */ + float hashmem; /* hash table memory usage */ + float regionmem; /* timer memory usage */ + float papimem; /* PAPI stats memory usage */ + float pchmem; /* parent/child array memory usage */ + float gptlmem; /* total per-thread GPTL memory usage estimate */ + float totmem; /* sum of gptlmem across threads */ + + static const char *thisfunc = "GPTLpr_file"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize() has not been called\n", thisfunc); + + /* 2 is for "/" plus null */ + if (outdir) + totlen = strlen (outdir) + strlen (outfile) + 2; + else + totlen = strlen (outfile) + 2; + + outpath = (char *) GPTLallocate (totlen); + + if (outdir) { + strcpy (outpath, outdir); + strcat (outpath, "/"); + strcat (outpath, outfile); + } else { + strcpy (outpath, outfile); + } + + if (print_mode == GPTLprint_append){ + if ( ! (fp = fopen (outpath, "a"))) + fp = stderr; + } + else{ + if ( ! (fp = fopen (outpath, "w"))) + fp = stderr; + } + + free (outpath); + + fprintf (fp, "$Id: gptl.c,v 1.157 2011-03-28 20:55:18 rosinski Exp $\n"); + + /* + ** A set of nasty ifdefs to tell important aspects of how GPTL was built + */ + +#ifdef HAVE_NANOTIME + if (funclist[funcidx].option == GPTLnanotime) { + fprintf (fp, "Clock rate = %f MHz\n", cpumhz); +#ifdef BIT64 + fprintf (fp, " BIT64 was true\n"); +#else + fprintf (fp, " BIT64 was false\n"); +#endif + } +#endif + +#if ( defined THREADED_OMP ) + fprintf (fp, "GPTL was built with THREADED_OMP\n"); +#elif ( defined THREADED_PTHREADS ) + fprintf (fp, "GPTL was built with THREADED_PTHREADS\n"); +#else + fprintf (fp, "GPTL was built without threading\n"); +#endif + +#ifdef HAVE_MPI + fprintf (fp, "HAVE_MPI was true\n"); + +#ifdef HAVE_COMM_F2C + fprintf (fp, " HAVE_COMM_F2C was true\n"); +#else + fprintf (fp, " HAVE_COMM_F2C was false\n"); +#endif + +#ifdef ENABLE_PMPI + fprintf (fp, " ENABLE_PMPI was true\n"); +#else + fprintf (fp, " ENABLE_PMPI was false\n"); +#endif + +#else + fprintf (fp, "HAVE_MPI was false\n"); +#endif + +#ifdef HAVE_PAPI + fprintf (fp, "HAVE_PAPI was true\n"); + if (dousepapi) { + if (GPTL_PAPIis_multiplexed ()) + fprintf (fp, " PAPI event multiplexing was ON\n"); + else + fprintf (fp, " PAPI event multiplexing was OFF\n"); + GPTL_PAPIprintenabled (fp); + } +#else + fprintf (fp, "HAVE_PAPI was false\n"); +#endif + + /* + ** Estimate underlying timing routine overhead + */ + + utr_overhead = utr_getoverhead (); + fprintf (fp, "Underlying timing routine was %s.\n", funclist[funcidx].name); + if (wallstats.enabled && profileovhd.enabled){ + fprintf (fp, "Per-call utr overhead est (at init): %g sec.\n", overhead_utr); + fprintf (fp, "Per-call utr overhead est (at end): %g sec.\n", utr_overhead); + } else { + fprintf (fp, "Per-call utr overhead est: %g sec.\n", utr_overhead); + } +#ifdef HAVE_PAPI + if (dousepapi) { + double t1, t2; + t1 = (*ptr2wtimefunc) (); + read_counters100 (); + t2 = (*ptr2wtimefunc) (); + papi_overhead = 0.01 * (t2 - t1); + fprintf (fp, "Per-call PAPI overhead est: %g sec.\n", papi_overhead); + } +#endif + tot_overhead = utr_overhead + papi_overhead; + if (dopr_preamble) { + fprintf (fp, "If overhead stats are printed, roughly half the estimated number is\n" + "embedded in the wallclock stats for each timer.\n" + "Print method was %s.\n", methodstr (method)); +#ifdef ENABLE_PMPI + fprintf (fp, "If a AVG_MPI_BYTES field is present, it is an estimate of the per-call " + "average number of bytes handled by that process.\n" + "If timers beginning with sync_ are present, it means MPI synchronization " + "was turned on.\n"); +#endif + fprintf (fp, "If a \'%%_of\' field is present, it is w.r.t. the first timer for thread 0.\n" + "If a \'e6_per_sec\' field is present, it is in millions of PAPI counts per sec.\n\n" + "A '*' in column 1 below means the timer had multiple parents, though the\n" + "values printed are for all calls.\n" + "Further down the listing may be more detailed information about multiple\n" + "parents. Look for 'Multiple parent info'\n\n"); + } + + sum = (float *) GPTLallocate (nthreads * sizeof (float)); + + for (t = 0; t < nthreads; ++t) { + + /* + ** Construct tree for printing timers in parent/child form. get_max_depth() must be called + ** AFTER construct_tree() because it relies on the per-parent children arrays being complete. + */ + + if (construct_tree (timers[t], method) != 0) + printf ("GPTLpr_file: failure from construct_tree: output will be incomplete\n"); + max_depth[t] = get_max_depth (timers[t], 0); + + if (t > 0) + fprintf (fp, "\n"); + fprintf (fp, "Stats for thread %d:\n", t); + + for (n = 0; n < max_depth[t]+1; ++n) /* +1 to always indent timer name */ + fprintf (fp, " "); + if (dopr_quotes){ + for (n = 0; n < max_name_len[t]+2; ++n) /* longest timer name + quotes */ + fprintf (fp, " "); + } else { + for (n = 0; n < max_name_len[t]; ++n) /* longest timer name */ + fprintf (fp, " "); + } + + fprintf (fp, " On Called Recurse"); + + /* Print strings for enabled timer types */ + + if (cpustats.enabled) + fprintf (fp, "%s", cpustats.str); + if (wallstats.enabled) { + fprintf (fp, "%s", wallstats.str); + if (percent && timers[0]->next) + fprintf (fp, "%%_of_%5.5s ", timers[0]->next->name); + if (overheadstats.enabled) + fprintf (fp, "%s", overheadstats.str); + } + +#ifdef ENABLE_PMPI + fprintf (fp, "AVG_MPI_BYTES "); +#endif + +#ifdef HAVE_PAPI + GPTL_PAPIprstr (fp); +#endif + + fprintf (fp, "\n"); /* Done with titles, now print stats */ + + /* + ** Print call tree and stats via recursive routine. "-1" is flag to + ** avoid printing dummy outermost timer, and initialize the depth. + */ + + printself_andchildren (timers[t], fp, t, -1, tot_overhead); + + /* + ** Sum of overhead across timers is meaningful. + ** Factor of 2 is because there are 2 utr calls per start/stop pair. + */ + + sum[t] = 0; + totcount = 0; + for (ptr = timers[t]->next; ptr; ptr = ptr->next) { + sum[t] += ptr->count * 2 * tot_overhead; + totcount += ptr->count; + } + fprintf (fp, "\n"); + if (wallstats.enabled && overheadstats.enabled){ + fprintf (fp, "Overhead sum = %9.3g wallclock seconds\n", sum[t]); + } + if (t == 0){ + if (wallstats.enabled && profileovhd.enabled){ + fprintf (fp, "Overhead estimate = %9.3g wallclock seconds\n", overhead_est); + fprintf (fp, "Overhead bound = %9.3g wallclock seconds\n", overhead_bound); + } + } + if (totcount < PRTHRESH) + fprintf (fp, "Total calls = %lu\n", totcount); + else + fprintf (fp, "Total calls = %9.3e\n", (float) totcount); + } + + /* Print per-name stats for all threads */ + + if (dopr_threadsort && nthreads > 1) { + fprintf (fp, "\nSame stats sorted by timer for threaded regions (for timers active on thread 0):\n"); + fprintf (fp, "Thd "); + + for (n = 0; n < max_name_len[0]; ++n) /* longest timer name */ + fprintf (fp, " "); + + fprintf (fp, " On Called Recurse"); + + if (cpustats.enabled) + fprintf (fp, "%s", cpustats.str); + if (wallstats.enabled) { + fprintf (fp, "%s", wallstats.str); + if (percent && timers[0]->next) + fprintf (fp, "%%_of_%5.5s ", timers[0]->next->name); + if (overheadstats.enabled) + fprintf (fp, "%s", overheadstats.str); + } + +#ifdef HAVE_PAPI + GPTL_PAPIprstr (fp); +#endif + + fprintf (fp, "\n"); + + /* Start at next to skip dummy */ + + for (ptr = timers[0]->next; ptr; ptr = ptr->next) { + + /* + ** To print sum stats, first create a new timer then copy thread 0 + ** stats into it. then sum using "add", and finally print. + */ + + foundany = false; + first = true; + sumstats = *ptr; + for (t = 1; t < nthreads; ++t) { + found = false; + for (tptr = timers[t]->next; tptr && ! found; tptr = tptr->next) { + if (STRMATCH (ptr->name, tptr->name)) { + + /* Only print thread 0 when this timer found for other threads */ + + if (first) { + first = false; + fprintf (fp, "%3.3d ", 0); + printstats (ptr, fp, 0, 0, false, tot_overhead); + } + + found = true; + foundany = true; + fprintf (fp, "%3.3d ", t); + printstats (tptr, fp, 0, 0, false, tot_overhead); + add (&sumstats, tptr); + } + } + } + + if (foundany) { + fprintf (fp, "SUM "); + printstats (&sumstats, fp, 0, 0, false, tot_overhead); + fprintf (fp, "\n"); + } + } + + /* Repeat overhead print in loop over threads */ + + if (wallstats.enabled && overheadstats.enabled) { + osum = 0.; + for (t = 0; t < nthreads; ++t) { + fprintf (fp, "OVERHEAD.%3.3d (wallclock seconds) = %9.3g\n", t, sum[t]); + osum += sum[t]; + } + fprintf (fp, "OVERHEAD.SUM (wallclock seconds) = %9.3g\n", osum); + } + } + + /* Print info about timers with multiple parents */ + + if (dopr_multparent) { + for (t = 0; t < nthreads; ++t) { + bool some_multparents = false; /* thread has entries with multiple parents? */ + for (ptr = timers[t]->next; ptr; ptr = ptr->next) { + if (ptr->nparent > 1) { + some_multparents = true; + break; + } + } + + if (some_multparents) { + fprintf (fp, "\nMultiple parent info for thread %d:\n", t); + if (dopr_preamble && t == 0) { + fprintf (fp, "Columns are count and name for the listed child\n" + "Rows are each parent, with their common child being the last entry, " + "which is indented.\n" + "Count next to each parent is the number of times it called the child.\n" + "Count next to child is total number of times it was called by the " + "listed parents.\n\n"); + } + + for (ptr = timers[t]->next; ptr; ptr = ptr->next) + if (ptr->nparent > 1) + print_multparentinfo (fp, ptr); + } + } + } + + /* Print hash table stats */ + + if (dopr_collision) { + for (t = 0; t < nthreads; t++) { + first = true; + totent = 0; + num_zero = 0; + num_one = 0; + num_two = 0; + num_more = 0; + most = 0; + numtimers= 0; + + for (i = 0; i < tablesize; i++) { + nument = hashtable[t][i].nument; + if (nument > 1) { + totent += nument-1; + if (first) { + first = false; + fprintf (fp, "\nthread %d had some hash collisions:\n", t); + } + fprintf (fp, "hashtable[%d][%d] had %d entries:", t, i, nument); + for (ii = 0; ii < nument; ii++) + fprintf (fp, " %s", hashtable[t][i].entries[ii]->name); + fprintf (fp, "\n"); + } + switch (nument) { + case 0: + ++num_zero; + break; + case 1: + ++num_one; + break; + case 2: + ++num_two; + break; + default: + ++num_more; + break; + } + most = MAX (most, nument); + numtimers += nument; + } + + if (totent > 0) { + fprintf (fp, "Total collisions thread %d = %d\n", t, totent); + fprintf (fp, "Entry information:\n"); + fprintf (fp, "num_zero = %d num_one = %d num_two = %d num_more = %d\n", + num_zero, num_one, num_two, num_more); + fprintf (fp, "Most = %d\n", most); + } + } + } + + /* Stats on GPTL memory usage */ + + totmem = 0.; + for (t = 0; t < nthreads; t++) { + hashmem = (float) sizeof (Hashentry) * tablesize; + regionmem = (float) numtimers * sizeof (Timer); +#ifdef HAVE_PAPI + papimem = (float) numtimers * sizeof (Papistats); +#else + papimem = 0.; +#endif + pchmem = 0.; + for (ptr = timers[t]->next; ptr; ptr = ptr->next) + pchmem += (float) (sizeof (Timer *)) * (ptr->nchildren + ptr->nparent); + + gptlmem = hashmem + regionmem + pchmem; + totmem += gptlmem; + fprintf (fp, "\n"); + fprintf (fp, "Thread %d total memory usage = %g KB\n", t, gptlmem*.001); + fprintf (fp, " Hashmem = %g KB\n" + " Regionmem = %g KB (papimem portion = %g KB)\n" + " Parent/child arrays = %g KB\n", + hashmem*.001, regionmem*.001, papimem*.001, pchmem*.001); + } + fprintf (fp, "\n"); + fprintf (fp, "Total memory usage all threads = %g KB\n", totmem*0.001); + + print_threadmapping (fp); + free (sum); + + if (fclose (fp) != 0) + fprintf (stderr, "Attempt to close %s failed\n", outfile); + + pr_has_been_called = true; + return 0; +} + +/* +** construct_tree: Build the parent->children tree starting with knowledge of +** parent list for each child. +** +** Input arguments: +** timerst: Linked list of timers +** method: method to be used to define the links +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int construct_tree (Timer *timerst, Method method) +{ + Timer *ptr; /* loop through linked list */ + Timer *pptr = 0; /* parent (init to NULL to avoid compiler warning) */ + int nparent; /* number of parents */ + int maxcount; /* max calls by a single parent */ + int n; /* loop over nparent */ + + /* + ** Walk the linked list to build the parent-child tree, using whichever + ** mechanism is in place. newchild() will prevent loops. + */ + + for (ptr = timerst; ptr; ptr = ptr->next) { + switch (method) { + case GPTLfirst_parent: + if (ptr->nparent > 0) { + pptr = ptr->parent[0]; + if (newchild (pptr, ptr) != 0); + } + break; + case GPTLlast_parent: + if (ptr->nparent > 0) { + nparent = ptr->nparent; + pptr = ptr->parent[nparent-1]; + if (newchild (pptr, ptr) != 0); + } + break; + case GPTLmost_frequent: + maxcount = 0; + for (n = 0; n < ptr->nparent; ++n) { + if (ptr->parent_count[n] > maxcount) { + pptr = ptr->parent[n]; + maxcount = ptr->parent_count[n]; + } + } + if (maxcount > 0) { /* not an orphan */ + if (newchild (pptr, ptr) != 0); + } + break; + case GPTLfull_tree: + /* + ** Careful: this one can create *lots* of output! + */ + for (n = 0; n < ptr->nparent; ++n) { + pptr = ptr->parent[n]; + if (newchild (pptr, ptr) != 0); + } + break; + default: + return GPTLerror ("construct_tree: method %d is not known\n", method); + } + } + return 0; +} + +/* +** modestr: Return a pointer to a string that represents the mode +** +** Input arguments: +** mode: print mode type (write or append) +*/ +static char *modestr (PRMode prmode) +{ + if (prmode == GPTLprint_write) + return "write"; + else if (prmode == GPTLprint_append) + return "append"; + else + return "Unknown"; +} + +/* +** methodstr: Return a pointer to a string which represents the method +** +** Input arguments: +** method: method type +*/ + +static char *methodstr (Method method) +{ + if (method == GPTLfirst_parent) + return "first_parent"; + else if (method == GPTLlast_parent) + return "last_parent"; + else if (method == GPTLmost_frequent) + return "most_frequent"; + else if (method == GPTLfull_tree) + return "full_tree"; + else + return "Unknown"; +} + +/* +** newchild: Add an entry to the children list of parent. Use function +** is_descendant() to prevent infinite loops. +** +** Input arguments: +** parent: parent node +** child: child to be added +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static int newchild (Timer *parent, Timer *child) +{ + int nchildren; /* number of children (temporary) */ + Timer **chptr; /* array of pointers to children */ + int n; /* loop over nchildren */ + + static const char *thisfunc = "newchild"; + + if (parent == child) + return GPTLerror ("%s: child %s can't be a parent of itself\n", thisfunc, child->name); + + /* + ** To allow construct_tree to be called multiple times, check that proposed child + ** is not a known child + */ + + for (n = 0; n < parent->nchildren; ++n) { + if (parent->children[n] == child){ + n = parent->nchildren + 1; + } + } + if (n > parent->nchildren){ + return 0; + } + + /* + ** To guarantee no loops, ensure that proposed parent isn't already a descendant of + ** proposed child + */ + + if (is_descendant (child, parent)) { + show_descendant (0, child, parent); + return GPTLerror ("%s: loop detected: NOT adding %s to descendant list of %s. " + "Proposed parent is in child's descendant path.\n", + thisfunc, child->name, parent->name); + } + + /* Safe to add the child to the parent's list of children */ + + ++parent->nchildren; + nchildren = parent->nchildren; + chptr = (Timer **) realloc (parent->children, nchildren * sizeof (Timer *)); + if ( ! chptr) + return GPTLerror ("%s: realloc error\n", thisfunc); + parent->children = chptr; + parent->children[nchildren-1] = child; + + return 0; +} + +/* +** get_max_depth: Determine the maximum call tree depth by traversing the +** tree recursively +** +** Input arguments: +** ptr: Starting timer +** startdepth: current depth when function invoked +** +** Return value: maximum depth +*/ + +static int get_max_depth (const Timer *ptr, const int startdepth) +{ + int maxdepth = startdepth; + int depth; + int n; + + for (n = 0; n < ptr->nchildren; ++n) + if ((depth = get_max_depth (ptr->children[n], startdepth+1)) > maxdepth) + maxdepth = depth; + + return maxdepth; +} + +/* +** num_descendants: Determine the number of descendants of a timer by traversing +** the tree recursively. This function is not currently used. It could be +** useful in a pruning algorithm +** +** Input arguments: +** ptr: Starting timer +** +** Return value: number of descendants +*/ + +static int num_descendants (Timer *ptr) +{ + int n; + + ptr->num_desc = ptr->nchildren; + for (n = 0; n < ptr->nchildren; ++n) { + ptr->num_desc += num_descendants (ptr->children[n]); + } + return ptr->num_desc; +} + +/* +** is_descendant: Determine whether node2 is in the descendant list for +** node1 +** +** Input arguments: +** node1: starting node for recursive search +** node2: node to be searched for +** +** Return value: true or false +*/ + +static int is_descendant (const Timer *node1, const Timer *node2) +{ + int n; + + /* Breadth before depth for efficiency */ + + for (n = 0; n < node1->nchildren; ++n) + if (node1->children[n] == node2) + return 1; + + for (n = 0; n < node1->nchildren; ++n) + if (is_descendant (node1->children[n], node2)) + return 1; + + return 0; +} + +/* +** show_descendant: list descendants, breadth first, stopping early +** if a particular node is discovered (e.g. the parent) +** +** Input arguments: +** level: current level in recursion, should be 0 when first called +** node1: starting node for recursive listing +** node2: node defining the early stopping criterion +** +** Return value: true (listed all descendants) or false (stopped early) +*/ + +static int show_descendant (const int level, const Timer *node1, const Timer *node2) +{ + int n; + + /* Breadth before depth for efficiency */ + + for (n = 0; n < node1->nchildren; ++n){ + printf ("node1: %-32s level: %d child: %d label: %-32s\n", node1->name, level, n, node1->children[n]->name); + if (node1->children[n] == node2) + return 1; + } + + for (n = 0; n < node1->nchildren; ++n) + if (show_descendant (level+1, node1->children[n], node2)) + return 1; + + return 0; +} + +/* +** printstats: print a single timer +** +** Input arguments: +** timer: timer for which to print stats +** fp: file descriptor to write to +** t: thread number +** depth: depth to indent timer +** doindent: whether indenting will be done +** tot_overhead: underlying timing routine overhead +*/ + +static void printstats (const Timer *timer, + FILE *fp, + const int t, + const int depth, + const bool doindent, + const double tot_overhead) +{ + int i; /* index */ + int indent; /* index for indenting */ + int extraspace; /* for padding to length of longest name */ + float fusr; /* user time as float */ + float fsys; /* system time as float */ + float usrsys; /* usr + sys */ + float elapse; /* elapsed time */ + float wallmax; /* max wall time */ + float wallmin; /* min wall time */ + float ratio; /* percentage calc */ + + /* Flag regions having multiple parents with a "*" in column 1 */ + + if (doindent) { + if (timer->nparent > 1) + fprintf (fp, "* "); + else + fprintf (fp, " "); + + /* Indent to depth of this timer */ + + for (indent = 0; indent < depth; ++indent) + fprintf (fp, " "); + } + + if (dopr_quotes){ + fprintf (fp, "\"%s\"", timer->name); + } else { + fprintf (fp, "%s", timer->name); + } + + /* Pad to length of longest name */ + + extraspace = max_name_len[t] - strlen (timer->name); + for (i = 0; i < extraspace; ++i) + fprintf (fp, " "); + + /* Pad to max indent level */ + + if (doindent) + for (indent = depth; indent < max_depth[t]; ++indent) + fprintf (fp, " "); + + if (timer->onflg) + fprintf (fp, " y"); + else + fprintf (fp, " -"); + + if (timer->count < PRTHRESH) { + if (timer->nrecurse > 0) + fprintf (fp, "%8lu %6lu ", timer->count, timer->nrecurse); + else + fprintf (fp, "%8lu - ", timer->count); + } else { + if (timer->nrecurse > 0) + fprintf (fp, "%8.1e %6.0e ", (float) timer->count, (float) timer->nrecurse); + else + fprintf (fp, "%8.1e - ", (float) timer->count); + } + + if (cpustats.enabled) { + fusr = timer->cpu.accum_utime / (float) ticks_per_sec; + fsys = timer->cpu.accum_stime / (float) ticks_per_sec; + usrsys = fusr + fsys; + fprintf (fp, "%9.3f %9.3f %9.3f ", fusr, fsys, usrsys); + } + + if (wallstats.enabled) { + elapse = timer->wall.accum; + wallmax = timer->wall.max; + wallmin = timer->wall.min; + fprintf (fp, "%12.6f %12.6f %12.6f ", elapse, wallmax, wallmin); + + if (percent && timers[0]->next) { + ratio = 0.; + if (timers[0]->next->wall.accum > 0.) + ratio = (timer->wall.accum * 100.) / timers[0]->next->wall.accum; + fprintf (fp, " %9.2f ", ratio); + } + + /* + ** Factor of 2 is because there are 2 utr calls per start/stop pair. + */ + + if (overheadstats.enabled) { + fprintf (fp, "%16.6f ", timer->count * 2 * tot_overhead); + } + } + +#ifdef ENABLE_PMPI + if (timer->nbytes == 0.) + fprintf (fp, " - "); + else + fprintf (fp, "%13.3e ", timer->nbytes / timer->count); +#endif + +#ifdef HAVE_PAPI + GPTL_PAPIpr (fp, &timer->aux, t, timer->count, timer->wall.accum); +#endif + + fprintf (fp, "\n"); +} + +/* +** print_multparentinfo: +** +** Input arguments: +** Input/output arguments: +*/ +void print_multparentinfo (FILE *fp, + Timer *ptr) +{ + int n; + + if (ptr->norphan > 0) { + if (ptr->norphan < PRTHRESH) + fprintf (fp, "%8u %-32s\n", ptr->norphan, "ORPHAN"); + else + fprintf (fp, "%8.1e %-32s\n", (float) ptr->norphan, "ORPHAN"); + } + + for (n = 0; n < ptr->nparent; ++n) { + if (ptr->parent_count[n] < PRTHRESH) + fprintf (fp, "%8d %-32s\n", ptr->parent_count[n], ptr->parent[n]->name); + else + fprintf (fp, "%8.1e %-32s\n", (float) ptr->parent_count[n], ptr->parent[n]->name); + } + + if (ptr->count < PRTHRESH) + fprintf (fp, "%8lu %-32s\n\n", ptr->count, ptr->name); + else + fprintf (fp, "%8.1e %-32s\n\n", (float) ptr->count, ptr->name); +} + +/* +** add: add the contents of tin to tout +** +** Input arguments: +** tin: input timer +** Input/output arguments: +** tout: output timer summed into +*/ + +static void add (Timer *tout, + const Timer *tin) +{ + tout->count += tin->count; + + if (wallstats.enabled) { + tout->wall.accum += tin->wall.accum; + + tout->wall.max = MAX (tout->wall.max, tin->wall.max); + tout->wall.min = MIN (tout->wall.min, tin->wall.min); + } + + if (cpustats.enabled) { + tout->cpu.accum_utime += tin->cpu.accum_utime; + tout->cpu.accum_stime += tin->cpu.accum_stime; + } +#ifdef HAVE_PAPI + GPTL_PAPIadd (&tout->aux, &tin->aux); +#endif +} + +/* +** GPTLpr_summary: Gather and print summary stats across +** threads and MPI tasks +** +** Input arguments: +** comm: commuicator (e.g. MPI_COMM_WORLD). If zero, use MPI_COMM_WORLD +*/ + +#ifdef HAVE_MPI +int GPTLpr_summary (MPI_Comm comm) +#else +int GPTLpr_summary (int comm) +#endif +{ + const char *outfile = "timing.summary"; + int ret; + + ret = GPTLpr_summary_file(comm, outfile); + return 0; +} + +#ifdef HAVE_MPI +int GPTLpr_summary_file (MPI_Comm comm, + const char *outfile) +#else +int GPTLpr_summary_file (int comm, + const char *outfile) +#endif +{ + int iam = 0; /* MPI rank: default master */ + int n; /* index */ + int extraspace; /* for padding to length of longest name */ + int totlen; /* length for malloc */ + char *outpath; /* path to output file: outdir/outfile */ + FILE *fp = 0; /* output file */ + + int count; /* number of timers */ + Summarystats *storage; /* storage for data from all timers */ + + int x; /* pointer increment */ + int k; /* counter */ + char *tempname; /* event name workspace */ + int max_name_length; + int len; + float temp; + int ret; /* return code */ + + static const char *thisfunc = "GPTLpr_summary_file"; + +#ifdef HAVE_MPI + int nproc; /* number of procs in MPI communicator */ + + char name[MAX_CHARS+1]; /* timer name requested by master */ + + if (((int) comm) == 0) + comm = MPI_COMM_WORLD; + + if ((ret = MPI_Comm_rank (comm, &iam)) != MPI_SUCCESS) + return GPTLerror ("%s: Bad return from MPI_Comm_rank=%d\n", thisfunc, ret); + + if ((ret = MPI_Comm_size (comm, &nproc)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Comm_size=%d\n", thisfunc, iam, ret); + +#endif + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize() has not been called\n", thisfunc); + + /* + ** Each process gathers stats for its threads. + ** Binary tree used combine results. + ** Master prints results. + */ + + if (iam == 0) { + + /* 2 is for "/" plus null */ + if (outdir) + totlen = strlen (outdir) + strlen (outfile) + 2; + else + totlen = strlen (outfile) + 2; + + outpath = (char *) GPTLallocate (totlen); + + if (outdir) { + strcpy (outpath, outdir); + strcat (outpath, "/"); + strcat (outpath, outfile); + } else { + strcpy (outpath, outfile); + } + + if (print_mode == GPTLprint_append){ + if ( ! (fp = fopen (outpath, "a"))) + fp = stderr; + } + else{ + if ( ! (fp = fopen (outpath, "w"))) + fp = stderr; + } + + free (outpath); + + fprintf (fp, "$Id: gptl.c,v 1.157 2011-03-28 20:55:18 rosinski Exp $\n"); + fprintf (fp, "'count' is cumulative. All other stats are max/min\n"); + fprintf (fp, "'on' indicates whether the timer was active during output, and so stats are lower or upper bounds.\n"); +#ifndef HAVE_MPI + fprintf (fp, "NOTE: GPTL was built WITHOUT MPI: Only task 0 stats will be printed.\n"); + fprintf (fp, "This is even for MPI codes.\n"); +#endif + fprintf (fp, "\n"); + + count = merge_thread_data(); /*merges events from all threads*/ + + if( !( tempname = (char*)malloc((MAX_CHARS + 1) * sizeof(char) ) ) ) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + /* allocate storage for data for all timers */ + if( !( storage = malloc( sizeof(Summarystats) * count ) ) && count ) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + if ( (ret = collect_data( iam, comm, &count, &storage) ) != 0 ) + return GPTLerror ("%s: master collect_data failed\n", thisfunc); + + x = 0; /*finds max timer name length*/ + max_name_length = 0; + for( k = 0; k < count; k++ ) { + len = strlen( timerlist[0] + x ); + if( len > max_name_length ) + max_name_length = len; + x += MAX_CHARS + 1; + } + + /* Print heading */ + + fprintf (fp, "name"); + if (dopr_quotes){ + extraspace = (max_name_length+2) - strlen ("name"); + } else { + extraspace = max_name_length - strlen ("name"); + } + for (n = 0; n < extraspace; ++n) + fprintf (fp, " "); + fprintf (fp, " on processes threads count"); + fprintf (fp, " walltotal wallmax (proc thrd ) wallmin (proc thrd )"); + + for (n = 0; n < nevents; ++n) { + fprintf (fp, " %8.8stotal", eventlist[n].str8); + fprintf (fp, " %8.8smax (proc thrd )", eventlist[n].str8); + fprintf (fp, " %8.8smin (proc thrd )", eventlist[n].str8); + } + + fprintf (fp, "\n"); + + x = 0; + for( k = 0; k < count; k++ ) { + + /* Print the results for this timer */ + memset( tempname, 0, (MAX_CHARS + 1) * sizeof(char) ); + memcpy( tempname, timerlist[0] + x, (MAX_CHARS + 1) * sizeof(char) ); + + x += (MAX_CHARS + 1); + if (dopr_quotes){ + fprintf (fp, "\"%s\"", tempname); + } else { + fprintf (fp, "%s", tempname); + } + extraspace = max_name_length - strlen (tempname); + for (n = 0; n < extraspace; ++n) + fprintf (fp, " "); + if (storage[k].onflgs > 0) + fprintf (fp, " y "); + else + fprintf (fp, " - "); + temp = storage[k].count; + fprintf(fp, " %8d %8d %12.6e ", + storage[k].processes, storage[k].threads, temp); + fprintf (fp, " %12.6e %9.3f (%6d %6d) %9.3f (%6d %6d)", + storage[k].walltotal, + storage[k].wallmax, storage[k].wallmax_p, storage[k].wallmax_t, + storage[k].wallmin, storage[k].wallmin_p, storage[k].wallmin_t); +#ifdef HAVE_PAPI + for (n = 0; n < nevents; ++n) { + fprintf (fp, " %12.6e", storage[k].papitotal[n]); + + fprintf (fp, " %9.3e (%6d %6d)", + storage[k].papimax[n], storage[k].papimax_p[n], + storage[k].papimax_t[n]); + + fprintf (fp, " %9.3e (%6d %6d)", + storage[k].papimin[n], storage[k].papimin_p[n], + storage[k].papimin_t[n]); + } +#endif + fprintf (fp, "\n"); + } + + fprintf (fp, "\n"); + free(tempname); + + } + else { /* iam != 0 (slave) */ +#ifdef HAVE_MPI + /* count number of timers from linked list */ + count = merge_thread_data(); + + /*allocate storage for data for all timers */ + if( !( storage = malloc( sizeof(Summarystats) * count ) ) && count ) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + if ( (ret = collect_data( iam, comm, &count, &storage ) ) != 0 ) + return GPTLerror ("%s: slave collect_data failed\n", thisfunc); +#endif + } + + free(timerlist[0]); + free(timerlist); + free(storage); + if (iam == 0 && fclose (fp) != 0) + fprintf (stderr, "%s: Attempt to close %s failed\n", thisfunc, outfile); + return 0; +} + +/* +** merge_thread_data: returns number of events in merged list +*/ + +static int merge_thread_data() +{ + int n, k, x; /*counters*/ + int t; /*current thread*/ + int num_newtimers; + int compare; + int *count; + int max_count; /* largest number of timers among non-thread-0 threads */ + char **newtimers; + int length = MAX_CHARS + 1; + char ***sort; + int count_r; /* count to be returned, allows *count to be free()ed */ + Timer *ptr; + + static const char *thisfunc = "merge_thread_data"; + + if( nthreads == 1 ) { /* merging is not needed since only 1 thread */ + + /* count timers for thread 0 */ + count_r = 0; + for (ptr = timers[0]->next; ptr; ptr = ptr->next) count_r++; + + timerlist = (char **) GPTLallocate( sizeof (char *)); + if( !( timerlist[0] = (char *)malloc( count_r * length * sizeof (char)) ) && count_r) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + x = 0; + for (ptr = timers[0]->next; ptr; ptr = ptr->next) { + strcpy((timerlist[0] + x), ptr->name); + x += length; + } + + return count_r; + + } + + timerlist = (char **) GPTLallocate( nthreads * sizeof (char *)); + count = (int *) GPTLallocate( nthreads * sizeof (int)); + sort = (char ***) GPTLallocate( nthreads * sizeof (void *)); + + max_count = 0; + for (t = 0; t < nthreads; t++) { + + /* count timers for thread */ + count[t] = 0; + for (ptr = timers[t]->next; ptr; ptr = ptr->next) count[t]++; + + if( count[t] > max_count || max_count == 0 ) max_count = count[t]; + + if( !( sort[t] = (char **)malloc( count[t] * sizeof (char *)) ) && count[t]) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + /* allocate memory to hold list of timer names */ + if( !( timerlist[t] = (char *)malloc( length * count[t] * sizeof (char)) ) && count[t]) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + memset( timerlist[t], 0, length * count[t] * sizeof (char) ); + + x = 0; + for (ptr = timers[t]->next; ptr; ptr = ptr->next) { + strcpy((timerlist[t] + x), ptr->name); + x += length; + } + + x = 0; + for (k = 0; k < count[t]; k++) { + sort[t][k] = timerlist[t] + x; + x += length; + } + + qsort( sort[t], count[t], sizeof (char *), cmp ); + + } + + if( !( newtimers = (char **)malloc( max_count * sizeof (char *)) ) && max_count) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + for (t = 1; t < nthreads; t++) { + memset( newtimers, 0, max_count * sizeof (char *) ); + k = 0; + n = 0; + num_newtimers = 0; + while( k < count[0] && n < count[t] ) { + /* linear comparison of timers */ + compare = strcmp( sort[0][k], sort[t][n] ); + + if( compare == 0 ) { + /* both have, nothing needs to be done */ + k++; + n++; + continue; + } + + if( compare < 0 ) { + /* event that only master has, nothing needs to be done */ + k++; + continue; + } + + if( compare > 0 ) { + /* event that only slave thread has, need to add */ + newtimers[num_newtimers] = sort[t][n]; + n++; + num_newtimers++; + } + } + + while( n < count[t] ) { + /* adds any remaining timers, since we know that all the rest + are new since have checked all master thread timers */ + newtimers[num_newtimers] = sort[t][n]; + num_newtimers++; + n++; + } + + if( num_newtimers ) { + /* sorts by memory address to restore original order */ + qsort( newtimers, num_newtimers, sizeof(char*), ncmp ); + + /* reallocate memory to hold additional timers */ + if( !( sort[0] = realloc( sort[0], (count[0] + num_newtimers) * sizeof (char *)) ) ) + return GPTLerror ("%s: memory reallocation failed\n", thisfunc); + if( !(timerlist[0] = realloc(timerlist[0], length * (count[0] + num_newtimers) * sizeof (char)) ) ) + return GPTLerror ("%s: memory reallocation failed\n", thisfunc); + + k = count[0]; + for (n = 0; n < num_newtimers; n++) { + /* add new found timers */ + memcpy( timerlist[0] + (count[0] + n) * length, newtimers[n], length * sizeof (char) ); + } + + count[0] += num_newtimers; + + /* reassign pointers in sort since realloc will have broken them if it moved the memory. */ + x = 0; + for (k = 0; k < count[0]; k++) { + sort[0][k] = timerlist[0] + x; + x += length; + } + + qsort( sort[0], count[0], sizeof (char *), cmp ); + } + } + + free(newtimers); + free(sort[0]); + /* don't free timerlist[0], since needed for subsequent steps in gathering global statistics */ + for (t = 1; t < nthreads; t++) { + free(sort[t]); + free(timerlist[t]); + } + + free(sort); + count_r = count[0]; + free(count); + + return count_r; +} + +/* +** collect data: compute global stats using tree reduction algorithm +** returns pointer to new summarystats list +** +** Input arguments: +** iam: process id +** comm: MPI communicator +** Input/Output arguments: +** summarystats: max/min/etc stats over all processes and threads +** count: number of events +** timerlist: list of all timer names (global variable) +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +#ifdef HAVE_MPI +static int collect_data(const int iam, + MPI_Comm comm, + int *count, + Summarystats **summarystats_cumul ) +#else +static int collect_data(const int iam, + int comm, + int *count, + Summarystats **summarystats_cumul ) +#endif +{ + int step; /* spacing beween active processes */ + int mstep; /* spacing between active masters */ + int procid; /* process to communicate with */ + int ret; + int nproc; + int signal = 1; + int x, k, n; /* counters */ + char *tempname; + int s = (MAX_CHARS + 1 ); /* spacing between timer names */ + int length = MAX_CHARS + 1; + int compare; + int num_newtimers; + int count_slave; + char *timers_slave; /* slave timerlist */ + char **newtimers; + char **sort_slave; /* slave sorted list */ + char **sort_master; /* master sorted list */ + int m_index, s_index; + Summarystats *summarystats; /* stats collected on master */ + + static const char *thisfunc = "collect_data"; + +#ifdef HAVE_MPI + Summarystats *summarystats_slave; /* stats sent to master */ + const int taga = 99; + const int tagb = 100; + const int tagc = 101; + MPI_Status status; + MPI_Request rcvreq1; + MPI_Request rcvreq2; + MPI_Request rcvreq3; + + if ((ret = MPI_Comm_size (comm, &nproc)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Comm_size=%d\n", thisfunc, iam, ret); + +#endif + + summarystats = *summarystats_cumul; + + if (!( tempname = (char*)malloc((MAX_CHARS +1) * sizeof(char) ) )) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + x = 0; + for (k = 0; k < *count; k++) { + memcpy( tempname, timerlist[0] + x, (MAX_CHARS + 1) * sizeof (char) ); + /* calculate individual stats */ + get_threadstats( iam, tempname, &summarystats[k]); + x += (MAX_CHARS + 1); + } + +#ifdef HAVE_MPI + step = 1; + mstep = 2; + while( step < nproc ) { + + if ((iam % mstep) == 0) { + /* find new masters at the current level, which are at every n*step starting with 0 */ + + procid = iam + step; + if (procid < nproc) { + /* prevent lone master wanting data from nonexistent process problem */ + + /* prepare for receive */ + if ((ret = MPI_Irecv (&count_slave, 1, MPI_INTEGER, procid, taga, comm, &rcvreq2)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Irecv=%d\n", thisfunc, iam, ret); + + /* handshake with slave */ + if ((ret = MPI_Send (&signal, 1, MPI_INTEGER, procid, taga, comm)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Send=%d\n", thisfunc, iam, ret); + + /* wait for message from slave */ + if ((ret = MPI_Wait (&rcvreq2, MPI_STATUS_IGNORE)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Wait=%d\n", thisfunc, iam, ret); + + if (count_slave != 0) { /* if slave had no events, then nothing needs to be done*/ + + if (!(sort_master = (char **) malloc( (*count) * sizeof (char *) ) ) && (*count)) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + if (!(newtimers = (char **) malloc( count_slave * sizeof (char *) ) )) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + if (!(sort_slave = (char **) malloc( count_slave * sizeof (char *) ) )) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + if (!(summarystats_slave = (Summarystats *) malloc( count_slave * sizeof (Summarystats) ) )) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + if (!(timers_slave = (char *) malloc( count_slave * (MAX_CHARS + 1) * sizeof (char) ) )) + return GPTLerror ("%s: memory allocation failed\n", thisfunc); + + if ((ret = MPI_Irecv (timers_slave, count_slave * (MAX_CHARS + 1), MPI_CHAR, procid, tagb, comm, &rcvreq3)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Irecv=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Irecv (summarystats_slave, count_slave * sizeof(Summarystats), MPI_BYTE, procid, tagc, comm, &rcvreq1)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Irecv=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Send (&signal, 1, MPI_INT, procid, tagb, comm)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Send=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Wait (&rcvreq1, MPI_STATUS_IGNORE)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Wait=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Wait (&rcvreq3, MPI_STATUS_IGNORE)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Wait=%d\n", thisfunc, iam, ret); + + x = 0; + for (k = 0; k < count_slave; k++) { + sort_slave[k] = timers_slave + x; + x += MAX_CHARS + 1; + } + x = 0; + for (k = 0; k < *count; k++) { + sort_master[k] = timerlist[0] + x; + x += MAX_CHARS + 1; + } + + qsort(sort_master, *count, sizeof(char*), cmp); + qsort(sort_slave, count_slave, sizeof(char*), cmp); + + num_newtimers = 0; + n = 0; + k = 0; + while (k < *count && n < count_slave) + { + compare = strcmp(sort_master[k], sort_slave[n]); + + if (compare == 0) { + /* matching timers found */ + + /* find element number of the name in original timerlist so that it can be matched with its summarystats */ + m_index = get_index( timerlist[0], sort_master[k] ); + + s_index = get_index( timers_slave, sort_slave[n] ); + get_summarystats (&summarystats[m_index], &summarystats_slave[s_index]); + k++; + n++; + continue; + } + + if (compare > 0) { + /* s1 >s2 . slave has event; master does not */ + newtimers[num_newtimers] = sort_slave[n]; + num_newtimers++; + n++; + continue; + } + + if (compare < 0) /* only master has event; nothing needs to be done */ + k++; + } + + while (n < count_slave) { + /* add all remaining timers which only the slave has */ + newtimers[num_newtimers] = sort_slave[n]; + num_newtimers++; + n++; + } + + /* sort by memory address to get original order */ + qsort (newtimers, num_newtimers, sizeof(char*), ncmp); + + /* reallocate to hold new timer names and summary stats from slave */ + if (!(timerlist[0] = realloc( timerlist[0], length * (*count + num_newtimers) * sizeof (char) ) )) + return GPTLerror ("%s: memory reallocation failed\n", thisfunc); + if (!(summarystats = realloc( summarystats, (*count + count_slave ) * sizeof (Summarystats) ) )) + return GPTLerror ("%s: memory reallocation failed\n", thisfunc); + + k = *count; + x = *count * (MAX_CHARS + 1); + for (n = 0; n < num_newtimers; n++) { + /* copy new timers names and new timer data */ + memcpy(timerlist[0] + x, newtimers[n], length * sizeof (char)); + s_index = get_index( timers_slave, newtimers[n] ); + memcpy(&summarystats[k], &summarystats_slave[s_index], sizeof (Summarystats)); + k++; + x += MAX_CHARS + 1; + } + *count += num_newtimers; + + free(timers_slave); + free(summarystats_slave); + free(newtimers); + free(sort_slave); + free(sort_master); + } + + } + + } + else if ( (iam % step) == 0 ) { + /* non masters send data */ + + procid = iam - step; + + /* wait for ready signal from master */ + if ((ret = MPI_Recv (&signal, 1, MPI_INTEGER, procid, taga, comm, MPI_STATUS_IGNORE)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Recv=%d\n", thisfunc, iam, ret); + + if ((ret = MPI_Send (count, 1, MPI_INTEGER, procid, taga, comm)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Send=%d\n", thisfunc, iam, ret); + + if ( count != 0) { + if ((ret = MPI_Recv (&signal, 1, MPI_INTEGER, procid, tagb, comm, MPI_STATUS_IGNORE)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Recv=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Send (timerlist[0], (*count) * (MAX_CHARS + 1), MPI_CHAR, procid, tagb, comm)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Send=%d\n", thisfunc, iam, ret); + if ((ret = MPI_Send (summarystats, (*count) * sizeof(Summarystats), MPI_BYTE, procid, tagc, comm)) != MPI_SUCCESS) + return GPTLerror ("%s rank %d: Bad return from MPI_Send=%d\n", thisfunc, iam, ret); + } + free(tempname); + *summarystats_cumul = summarystats; + return 0; + + } + + step = mstep; + mstep = 2 * mstep; + + } + +#endif + + free(tempname); + *summarystats_cumul = summarystats; + return 0; +} + +/* +** get_index: calculates the index number of an element in a list +** based on the start memory address and memory address of the element +** where each element is MAX_CHARS+1 long +** +** Input arguments: +** list: start address of list +** element: start address of element +** +** Return value: index of element in list +*/ + +int get_index( const char * list, + const char * element ) +{ + return (( element - list ) / ( MAX_CHARS + 1 )); +} + + +/* +** cmp: returns value from strcmp. for use with qsort +*/ + +static int cmp(const void *pa, const void *pb) +{ + const char** x = (const char**)pa; + const char** y = (const char**)pb; + return strcmp(*x, *y); +} + + +/* +** ncmp: compares values of memory adresses pointed to by a pointer. for use with qsort +*/ + +static int ncmp( const void *pa, const void *pb ) +{ + static const char *thisfunc = "GPTLsetoption"; + const char** x = (const char**)pa; + const char** y = (const char**)pb; + + if( *x > *y ) + return 1; + if( *x < *y ) + return -1; + if( *x == *y ) + GPTLerror("%s: shared memory address between timers\n", thisfunc); +} + +/* +** get_threadstats: gather stats for timer "name" over all threads +** +** Input arguments: +** iam: MPI process id +** name: timer name +** Output arguments: +** summarystats: max/min stats over all threads +*/ + +void get_threadstats (const int iam, + const char *name, + Summarystats *summarystats) +{ +#ifdef HAVE_PAPI + int n; /* event index */ +#endif + int t; /* thread index */ + unsigned int indx; /* returned from getentry() */ + Timer *ptr; /* timer */ + + /* + ** This memset fortuitiously initializes the process values (_p) to master (0) + */ + + memset (summarystats, 0, sizeof (Summarystats)); + + summarystats->wallmax_p = iam; + summarystats->wallmin_p = iam; + + for (t = 0; t < nthreads; ++t) { + if ((ptr = getentry (hashtable[t], name, &indx))) { + + if (ptr->onflg) + summarystats->onflgs++; + + if (ptr->count > 0) { + summarystats->threads++; + summarystats->walltotal += ptr->wall.accum; + } + summarystats->count += ptr->count; + + if (ptr->wall.accum > summarystats->wallmax) { + summarystats->wallmax = ptr->wall.accum; + summarystats->wallmax_t = t; + } + + if (ptr->wall.accum < summarystats->wallmin || summarystats->wallmin == 0.) { + summarystats->wallmin = ptr->wall.accum; + summarystats->wallmin_t = t; + } +#ifdef HAVE_PAPI + for (n = 0; n < nevents; ++n) { + double value; + if (GPTL_PAPIget_eventvalue (eventlist[n].namestr, &ptr->aux, &value) != 0) { + fprintf (stderr, "Bad return from GPTL_PAPIget_eventvalue\n"); + return; + } + summarystats->papimax_p[n] = iam; + summarystats->papimin_p[n] = iam; + + if (value > summarystats->papimax[n]) { + summarystats->papimax[n] = value; + summarystats->papimax_t[n] = t; + } + + if (value < summarystats->papimin[n] || summarystats->papimin[n] == 0.) { + summarystats->papimin[n] = value; + summarystats->papimin_t[n] = t; + } + summarystats->papitotal[n] += value; + } +#endif + } + } + if ( summarystats->count ) summarystats->processes = 1; +} + +/* +** get_summarystats: write max/min stats into mpistats based on comparison +** with summarystats_slave +** +** Input arguments: +** summarystats_slave: stats from a slave process +** Input/Output arguments: +** summarystats: stats (starts out as master stats) +*/ + +void get_summarystats (Summarystats *summarystats, + const Summarystats *summarystats_slave) +{ + if (summarystats_slave->count == 0) return; + + if (summarystats_slave->wallmax > summarystats->wallmax) { + summarystats->wallmax = summarystats_slave->wallmax; + summarystats->wallmax_p = summarystats_slave->wallmax_p; + summarystats->wallmax_t = summarystats_slave->wallmax_t; + } + + if ((summarystats_slave->wallmin < summarystats->wallmin) || + (summarystats->count == 0)){ + summarystats->wallmin = summarystats_slave->wallmin; + summarystats->wallmin_p = summarystats_slave->wallmin_p; + summarystats->wallmin_t = summarystats_slave->wallmin_t; + } + +#ifdef HAVE_PAPI + { + int n; + for (n = 0; n < nevents; ++n) { + if (summarystats_slave->papimax[n] > summarystats->papimax[n]) { + summarystats->papimax[n] = summarystats_slave->papimax[n]; + summarystats->papimax_p[n] = summarystats_slave->papimax_p[n]; + summarystats->papimax_t[n] = summarystats_slave->papimax_t[n]; + } + + if ((summarystats_slave->papimin[n] < summarystats->papimin[n]) || + (summarystats->count == 0)){ + summarystats->papimin[n] = summarystats_slave->papimin[n]; + summarystats->papimin_p[n] = summarystats_slave->papimin_p[n]; + summarystats->papimin_t[n] = summarystats_slave->papimin_t[n]; + } + summarystats->papitotal[n] += summarystats_slave->papitotal[n]; + } + } +#endif + + summarystats->onflgs += summarystats_slave->onflgs; + summarystats->count += summarystats_slave->count; + summarystats->walltotal += summarystats_slave->walltotal; + summarystats->processes += summarystats_slave->processes; + summarystats->threads += summarystats_slave->threads; +} + +/* +** GPTLbarrier: When MPI enabled, set and time an MPI barrier +** +** Input arguments: +** comm: commuicator (e.g. MPI_COMM_WORLD). If zero, use MPI_COMM_WORLD +** name: region name +** +** Return value: 0 (success) +*/ + +#ifdef HAVE_MPI +int GPTLbarrier (MPI_Comm comm, const char *name) +#else +int GPTLbarrier (int comm, const char *name) +#endif +{ + int ret; + static const char *thisfunc = "GPTLbarrier"; + + ret = GPTLstart (name); +#ifdef HAVE_MPI + if ((ret = MPI_Barrier (comm)) != MPI_SUCCESS) + return GPTLerror ("%s: Bad return from MPI_Barrier=%d", thisfunc, ret); +#endif + ret = GPTLstop (name); + return 0; +} + +/* +** get_cpustamp: Invoke the proper system timer and return stats. +** +** Output arguments: +** usr: user time +** sys: system time +** +** Return value: 0 (success) +*/ + +static inline int get_cpustamp (long *usr, long *sys) +{ +#ifdef HAVE_TIMES + struct tms buf; + + (void) times (&buf); + *usr = buf.tms_utime; + *sys = buf.tms_stime; + return 0; +#else + return GPTLerror ("get_cpustamp: times() not available\n"); +#endif +} + +/* +** GPTLquery: return current status info about a timer. If certain stats are not +** enabled, they should just have zeros in them. If PAPI is not enabled, input +** counter info is ignored. +** +** Input args: +** timername: timer name +** maxcounters: max number of PAPI counters to get info for +** t: thread number (if < 0, the request is for the current thread) +** +** Output args: +** count: number of times this timer was called +** onflg: whether timer is currently on +** wallclock: accumulated wallclock time +** usr: accumulated user CPU time +** sys: accumulated system CPU time +** papicounters_out: accumulated PAPI counters +*/ + +int GPTLquery (const char *timername, + int t, + int *count, + int *onflg, + double *wallclock, + double *dusr, + double *dsys, + long long *papicounters_out, + const int maxcounters) +{ + Timer *ptr; /* linked list pointer */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + unsigned int indx; /* linked list index returned from getentry (unused) */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLquery"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: get_thread_num failure\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + ptr = getentry (hashtable[t], name, &indx); + if ( !ptr) + return GPTLerror ("%s: requested timer %s does not have a name hash\n", thisfunc, name); + + *onflg = ptr->onflg; + *count = ptr->count; + *wallclock = ptr->wall.accum; + *dusr = ptr->cpu.accum_utime / (double) ticks_per_sec; + *dsys = ptr->cpu.accum_stime / (double) ticks_per_sec; +#ifdef HAVE_PAPI + GPTL_PAPIquery (&ptr->aux, papicounters_out, maxcounters); +#endif + return 0; +} + +/* +** GPTLquerycounters: return current PAPI counters for a timer. +** THIS ROUTINE ID DEPRECATED. USE GPTLget_eventvalue() instead +** +** Input args: +** timername: timer name +** t: thread number (if < 0, the request is for the current thread) +** +** Output args: +** papicounters_out: accumulated PAPI counters +*/ + +int GPTLquerycounters (const char *timername, + int t, + long long *papicounters_out) +{ + Timer *ptr; /* linked list pointer */ + unsigned int indx; /* hash index returned from getentry */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLquery_counters"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: get_thread_num failure\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + ptr = getentry (hashtable[t], name, &indx); + if ( !ptr) + return GPTLerror ("%s: requested timer %s does not have a name hash\n", thisfunc, name); + +#ifdef HAVE_PAPI + /* The 999 is a hack to say "give me all the counters" */ + GPTL_PAPIquery (&ptr->aux, papicounters_out, 999); +#endif + return 0; +} + +/* +** GPTLget_wallclock: return wallclock accumulation for a timer. +** +** Input args: +** timername: timer name +** t: thread number (if < 0, the request is for the current thread) +** +** Output args: +** value: current wallclock accumulation for the timer +*/ + +int GPTLget_wallclock (const char *timername, + int t, + double *value) +{ + void *self; /* timer address when hash entry generated with *_instr */ + Timer *ptr; /* linked list pointer */ + unsigned int indx; /* hash index returned from getentry (unused) */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLget_wallclock"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + if ( ! wallstats.enabled) + return GPTLerror ("%s: wallstats not enabled\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + /* + ** Don't know whether hashtable entry for timername was generated with + ** *_instr() or not, so try both possibilities + */ + + ptr = getentry (hashtable[t], name, &indx); + if ( !ptr) { + if (sscanf (timername, "%lx", (unsigned long *) &self) < 1) + return GPTLerror ("%s: requested timer %s does not exist\n", thisfunc, timername); + ptr = getentry_instr (hashtable[t], self, &indx); + if ( !ptr) + return GPTLerror ("%s: requested timer %s does not exist\n", thisfunc, timername); + } + + *value = ptr->wall.accum; + return 0; +} + +/* +** GPTLstartstop_vals: create/add walltime and call count to an event timer +** +** Input arguments: +** timername: timer name +** add_time: value to add to the walltime accumulator +** add_count: value to add to the call counter +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstartstop_vals (const char *timername, /* timer name */ + double add_time, /* walltime increment */ + int add_count) /* call count increment */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + unsigned int indx; /* index into hash table */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstartstop_vals"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ( ! wallstats.enabled) + return GPTLerror ("%s: wallstats must be enabled to call this function\n", thisfunc); + + if (add_time < 0.) + return GPTLerror ("%s: Input add_time must not be negative\n", thisfunc); + + /* getentry requires the thread number */ + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + /* Find out if the timer already exists */ + ptr = getentry (hashtable[t], name, &indx); + + if (ptr) { + /* + ** The timer already exists. If add_count is > 0, then increment the + ** count and update the time stamp. Then let control jump to the point where + ** wallclock settings are adjusted. + */ + if (add_count > 0){ + ptr->count += add_count; + ptr->wall.last = (*ptr2wtimefunc) (); + } + } else { + /* Need to call start/stop to set up linked list and hash table. */ + if (GPTLstart (timername) != 0) + return GPTLerror ("%s: Error from GPTLstart\n", thisfunc); + + if (GPTLstop (timername) != 0) + return GPTLerror ("%s: Error from GPTLstop\n", thisfunc); + + /* start/stop pair just called should guarantee ptr will be found */ + if ( ! (ptr = getentry (hashtable[t], name, &indx))) + return GPTLerror ("%s: Unexpected error from getentry\n", thisfunc); + + /* + ** If add_count >= 0, then set count to desired value. + ** Otherwise, assume add_count == 0 and set count to 0. + */ + if (add_count >= 0){ + ptr->count = add_count; + } else { + ptr->count = 0; + } + + /* Since this is the first call, set max and min to user input. */ + ptr->wall.max = add_time; + + ptr->wall.prev_min = FLT_MAX; + ptr->wall.min = add_time; + ptr->wall.latest_is_min = 1; + + /* + ** Minor mod: Subtract the overhead of the above start/stop call, before + ** adding user input + */ + ptr->wall.accum -= ptr->wall.latest; + + /* Then set latest to zero, so that update below is correct */ + ptr->wall.latest = 0.0; + + } + + /* Update accum with user input */ + ptr->wall.accum += add_time; + + /* + ** Update latest with user input: + ** If add_count > 0 and old count > 0 (new count > add_count), + ** assume new event time is the average (add_time/add_count). + ** If add_count > 0 and old count = 0 (new count == add_count), + ** assume new event time is the augmented average + ** ((latest value + add_time)/add_count). + ** If add_count == 0, new event time is latest value + add_time. + */ + if (add_count > 0){ + if (ptr->count > add_count) + ptr->wall.latest = add_time/add_count; + else + ptr->wall.latest = (ptr->wall.latest+add_time)/add_count; + } else { + ptr->wall.latest += add_time; + } + + /* Update max with user input */ + if (ptr->wall.latest > ptr->wall.max) + ptr->wall.max = ptr->wall.latest; + + /* Update min with user input */ + if ((ptr->count <= 1) || (add_count == ptr->count)) { + /* + ** still recording walltime for first occurrence, + ** so assign latest estimate to min and prev_min + */ + ptr->wall.min = ptr->wall.latest; + ptr->wall.latest_is_min = 1; + } else { + if (add_count > 0){ + /* check whether latest is the new min */ + if (ptr->wall.latest < ptr->wall.min){ + ptr->wall.prev_min = ptr->wall.min; + ptr->wall.min = ptr->wall.latest; + ptr->wall.latest_is_min = 1; + } else { + ptr->wall.latest_is_min = 0; + } + } else { + /* + ** still recording walltime for latest occurrence, + ** so check whether updated latest is the new min. + */ + if (ptr->wall.latest_is_min == 1){ + if (ptr->wall.prev_min > ptr->wall.latest){ + ptr->wall.min = ptr->wall.latest; + } else { + ptr->wall.min = ptr->wall.prev_min; + ptr->wall.latest_is_min = 0; + } + } + } + } + + return 0; +} + +/* +** GPTLstartstop_valsf: create/add walltime and call count to an event timer. +** Version for when timer name may not be null terminated. +** +** Input arguments: +** timername: timer name +** namelen: number of characters in timer name +** add_time: value to add to the walltime accumulator +** add_count: value to add to the call counter +** +** Return value: 0 (success) or -1 (failure) +*/ + +int GPTLstartstop_valsf (const char *timername, /* timer name */ + const int namelen, /* timer name length */ + double add_time, /* walltime increment */ + int add_count) /* call count increment */ +{ + Timer *ptr; /* linked list pointer */ + int t; /* thread number for this process */ + int numchars; /* number of characters to copy */ + unsigned int indx; /* index into hash table */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLstartstop_valsf"; + + if (disabled) + return 0; + + if ( ! initialized) + return 0; + + if ( ! wallstats.enabled) + return GPTLerror ("%s: wallstats must be enabled to call this function\n", thisfunc); + + if (add_time < 0.) + return GPTLerror ("%s: Input add_time must not be negative\n", thisfunc); + + /* getentry requires the thread number */ + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + numchars = MIN (namelen, MAX_CHARS); + name = timername; + } + + /* Find out if the timer already exists */ + ptr = getentryf (hashtable[t], name, numchars, &indx); + + if (ptr) { + /* + ** The timer already exists. If add_count is > 0, then increment the + ** count and update the time stamp. Then let control jump to the point where + ** wallclock settings are adjusted. + */ + if (add_count > 0){ + ptr->count += add_count; + ptr->wall.last = (*ptr2wtimefunc) (); + } + } else { + /* Need to call start/stop to set up linked list and hash table. */ + if (GPTLstartf (timername, namelen) != 0) + return GPTLerror ("%s: Error from GPTLstart\n", thisfunc); + + if (GPTLstopf (timername, namelen) != 0) + return GPTLerror ("%s: Error from GPTLstop\n", thisfunc); + + /* start/stop pair just called should guarantee ptr will be found */ + if ( ! (ptr = getentryf (hashtable[t], name, numchars, &indx))) + return GPTLerror ("%s: Unexpected error from getentry\n", thisfunc); + + /* + ** If add_count >= 0, then set count to desired value. + ** Otherwise, assume add_count == 0 and set count to 0. + */ + if (add_count >= 0){ + ptr->count = add_count; + } else { + ptr->count = 0; + } + + /* Since this is the first call, set max and min to user input. */ + ptr->wall.max = add_time; + + ptr->wall.prev_min = FLT_MAX; + ptr->wall.min = add_time; + ptr->wall.latest_is_min = 1; + + /* + ** Minor mod: Subtract the overhead of the above start/stop call, before + ** adding user input + */ + ptr->wall.accum -= ptr->wall.latest; + + /* Then set latest to zero, so that update below is correct */ + ptr->wall.latest = 0.0; + + } + + /* Update accum with user input */ + ptr->wall.accum += add_time; + + /* + ** Update latest with user input: + ** If add_count > 0 and old count > 0 (new count > add_count), + ** assume new event time is the average (add_time/add_count). + ** If add_count > 0 and old count = 0 (new count == add_count), + ** assume new event time is the augmented average + ** ((latest value + add_time)/add_count). + ** If add_count == 0, new event time is latest value + add_time. + */ + if (add_count > 0){ + if (ptr->count > add_count) + ptr->wall.latest = add_time/add_count; + else + ptr->wall.latest = (ptr->wall.latest+add_time)/add_count; + } else { + ptr->wall.latest += add_time; + } + + /* Update max with user input */ + if (ptr->wall.latest > ptr->wall.max) + ptr->wall.max = ptr->wall.latest; + + /* Update min with user input */ + if ((ptr->count <= 1) || (add_count == ptr->count)) { + /* + ** still recording walltime for first occurrence, + ** so assign latest estimate to min and prev_min + */ + ptr->wall.min = ptr->wall.latest; + ptr->wall.latest_is_min = 1; + } else { + if (add_count > 0){ + /* check whether latest is the new min */ + if (ptr->wall.latest < ptr->wall.min){ + ptr->wall.prev_min = ptr->wall.min; + ptr->wall.min = ptr->wall.latest; + ptr->wall.latest_is_min = 1; + } else { + ptr->wall.latest_is_min = 0; + } + } else { + /* + ** still recording walltime for latest occurrence, + ** so check whether updated latest is the new min. + */ + if (ptr->wall.latest_is_min == 1){ + if (ptr->wall.prev_min > ptr->wall.latest){ + ptr->wall.min = ptr->wall.latest; + } else { + ptr->wall.min = ptr->wall.prev_min; + ptr->wall.latest_is_min = 0; + } + } + } + } + + return 0; +} + +/* +** GPTLget_eventvalue: return PAPI-based event value for a timer. All values will be +** returned as doubles, even if the event is not derived. +** +** Input args: +** timername: timer name +** eventname: event name (must be currently enabled) +** t: thread number (if < 0, the request is for the current thread) +** +** Output args: +** value: current value of the event for this timer +*/ + +int GPTLget_eventvalue (const char *timername, + const char *eventname, + int t, + double *value) +{ + void *self; /* timer address when hash entry generated with *_instr */ + Timer *ptr; /* linked list pointer */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + unsigned int indx; /* hash index returned from getentry (unused) */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + const char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLget_eventvalue"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: get_thread_num failure\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + /* + ** Don't know whether hashtable entry for timername was generated with + ** *_instr() or not, so try both possibilities + */ + + ptr = getentry (hashtable[t], name, &indx); + if ( !ptr) { + if (sscanf (timername, "%lx", (unsigned long *) &self) < 1) + return GPTLerror ("%s: requested timer %s does not exist\n", thisfunc, timername); + ptr = getentry_instr (hashtable[t], self, &indx); + if ( !ptr) + return GPTLerror ("%s: requested timer %s does not exist\n", thisfunc, timername); + } + +#ifdef HAVE_PAPI + return GPTL_PAPIget_eventvalue (eventname, &ptr->aux, value); +#else + return GPTLerror ("%s: PAPI not enabled\n", thisfunc); +#endif +} + +/* +** GPTLget_nregions: return number of regions (i.e. timer names) for this thread +** +** Input args: +** t: thread number (if < 0, the request is for the current thread) +** +** Output args: +** nregions: number of regions +*/ + +int GPTLget_nregions (int t, + int *nregions) +{ + Timer *ptr; /* walk through linked list */ + static const char *thisfunc = "GPTLget_nregions"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: get_thread_num failure\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + *nregions = 0; + for (ptr = timers[t]->next; ptr; ptr = ptr->next) + ++*nregions; + + return 0; +} + +/* +** GPTLget_regionname: return region name for this thread +** +** Input args: +** t: thread number (if < 0, the request is for the current thread) +** region: region number +** nc: max number of chars to put in name +** +** Output args: +** name region name +*/ + +int GPTLget_regionname (int t, /* thread number */ + int region, /* region number (0-based) */ + char *name, /* output region name */ + int nc) /* number of chars in name (free form Fortran) */ +{ + int ncpy; /* number of characters to copy */ + int i; /* index */ + Timer *ptr; /* walk through linked list */ + static const char *thisfunc = "GPTLget_regionname"; + + if ( ! initialized) + return GPTLerror ("%s: GPTLinitialize has not been called\n", thisfunc); + + /* + ** If t is < 0, assume the request is for the current thread + */ + + if (t < 0) { + if ((t = get_thread_num ()) < 0) + return GPTLerror ("%s: get_thread_num failure\n", thisfunc); + } else { + if (t >= maxthreads) + return GPTLerror ("%s: requested thread %d is too big\n", thisfunc, t); + } + + ptr = timers[t]->next; + for (i = 0; i < region; i++) { + if ( ! ptr) + return GPTLerror ("%s: timer number %d does not exist in thread %d\n", thisfunc, region, t); + ptr = ptr->next; + } + + if (ptr) { + ncpy = MIN (nc, strlen (ptr->name)); + strncpy (name, ptr->name, ncpy); + + /* + ** Adding the \0 is only important when called from C + */ + + if (ncpy < nc) + name[ncpy] = '\0'; + } else { + return GPTLerror ("%s: timer number %d does not exist in thread %d\n", thisfunc, region, t); + } + return 0; +} + +/* +** GPTLis_initialized: Return whether GPTL has been initialized +*/ + +int GPTLis_initialized (void) +{ + return (int) initialized; +} + +/* +** getentry_instr: find hash table entry and return a pointer to it +** +** Input args: +** hashtable: the hashtable (array) +** self: input address (from -finstrument-functions) +** Output args: +** indx: hashtable index +** +** Return value: pointer to the entry, or NULL if not found +*/ + +static inline Timer *getentry_instr (const Hashentry *hashtable, /* hash table */ + void *self, /* address */ + unsigned int *indx) /* hash index */ +{ + int i; + Timer *ptr = 0; /* return value when entry not found */ + + /* + ** Hash index is timer address modulo the table size + ** On most machines, right-shifting the address helps because linkers often + ** align functions on even boundaries + */ + + *indx = (((unsigned long) self) >> 4) % tablesize; + for (i = 0; i < hashtable[*indx].nument; ++i) { + if (hashtable[*indx].entries[i]->address == self) { + ptr = hashtable[*indx].entries[i]; + break; + } + } + return ptr; +} + +/* +** getentry: find the entry in the hash table and return a pointer to it. +** +** Input args: +** hashtable: the hashtable (array) +** name: string to be hashed on (specifically, summed) +** Output args: +** indx: hashtable index +** +** Return value: pointer to the entry, or NULL if not found +*/ + +static inline Timer *getentry (const Hashentry *hashtable, /* hash table */ + const char *name, /* name to hash */ + unsigned int *indx) /* hash index */ +{ + int i; /* multiplier for hashing; loop index */ + const unsigned char *c; /* pointer to elements of "name" */ + Timer *ptr = 0; /* return value when entry not found */ + + /* + ** Hash value is sum of: chars times their 1-based position index, modulo tablesize + */ + + *indx = 0; + c = (unsigned char *) name; + for (i = 1; *c && i < MAX_CHARS+1; ++c, ++i) { + *indx += (*c) * i; + } + + *indx %= tablesize; + + /* + ** If nument exceeds 1 there was a hash collision and we must search + ** linearly through an array for a match + */ + + for (i = 0; i < hashtable[*indx].nument; i++) { + if (STRMATCH (name, hashtable[*indx].entries[i]->name)) { + ptr = hashtable[*indx].entries[i]; + break; + } + } + return ptr; +} + +/* +** getentryf: find the entry in the hash table and return a pointer to it. +** (variant of getentry where string length is included because string +** may not be null terminated) +** +** Input args: +** hashtable: the hashtable (array) +** name: string to be hashed on (specifically, summed) +** namelen: number of characters in string +** Output args: +** indx: hashtable index +** +** Return value: pointer to the entry, or NULL if not found +*/ + +static inline Timer *getentryf (const Hashentry *hashtable, /* hash table */ + const char *name, /* name to hash */ + const int namelen, /* length of name */ + unsigned int *indx) /* hash index */ +{ + int i; /* multiplier for hashing; loop index */ + int numchars; /* maximum number of characters to examine */ + const unsigned char *c; /* pointer to elements of "name" */ + Timer *ptr = 0; /* return value when entry not found */ + + numchars = MIN (namelen, MAX_CHARS); + + /* + ** Hash value is sum of: chars times their 1-based position index, modulo tablesize + */ + + *indx = 0; + c = (unsigned char *) name; + for (i = 1; i < numchars+1; ++c, ++i) { + *indx += (*c) * i; + } + + *indx %= tablesize; + + /* + ** If nument exceeds 1 there was a hash collision and we must search + ** linearly through an array for a match + */ + + for (i = 0; i < hashtable[*indx].nument; i++) { + if (STRNMATCH (name, hashtable[*indx].entries[i]->name,numchars)) { + ptr = hashtable[*indx].entries[i]; + break; + } + } + return ptr; +} + +/* +** Add entry points for auto-instrumented codes +** Auto instrumentation flags for various compilers: +** +** gcc, pathcc, icc: -finstrument-functions +** pgcc: -Minstrument:functions +** xlc: -qdebug=function_trace +*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef AUTO_INST +#ifdef _AIX +void __func_trace_enter (const char *function_name, + const char *file_name, + int line_number, + void **const user_data) +{ + (void) GPTLstart (function_name); +} + +void __func_trace_exit (const char *function_name, + const char *file_name, + int line_number, + void **const user_data) +{ + (void) GPTLstop (function_name); +} + +#else + +void __cyg_profile_func_enter (void *this_fn, + void *call_site) +{ + (void) GPTLstart_instr (this_fn); +} + +void __cyg_profile_func_exit (void *this_fn, + void *call_site) +{ + (void) GPTLstop_instr (this_fn); +} +#endif +#endif + +#ifdef __cplusplus +}; +#endif + +#ifdef HAVE_NANOTIME +#ifdef BIT64 +/* 64-bit code copied from PAPI library */ +static inline unsigned long long nanotime (void) +{ + unsigned long long val; + do { + unsigned int a,d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + (val) = ((unsigned long)a) | (((unsigned long)d)<<32); + } while(0); + + return (val); +} +#else +static inline unsigned long long nanotime (void) +{ + unsigned long long val; + __asm__ __volatile__("rdtsc" : "=A" (val) : ); + return (val); +} +#endif + +#define LEN 4096 + +static float get_clockfreq () +{ + FILE *fd = 0; + char buf[LEN]; + int is; + + if ( ! (fd = fopen ("/proc/cpuinfo", "r"))) { + fprintf (stderr, "get_clockfreq: can't open /proc/cpuinfo\n"); + return -1.; + } + + while (fgets (buf, LEN, fd)) { + if (strncmp (buf, "cpu MHz", 7) == 0) { + for (is = 7; buf[is] != '\0' && !isdigit (buf[is]); is++); + if (isdigit (buf[is])) + return (float) atof (&buf[is]); + } + } + + return -1.; +} +#endif + +/* +** The following are the set of underlying timing routines which may or may +** not be available. And their accompanying init routines. +** NANOTIME is currently only available on x86. +*/ + +static int init_nanotime () +{ + static const char *thisfunc = "init_nanotime"; +#ifdef HAVE_NANOTIME + if ((cpumhz = get_clockfreq ()) < 0) + return GPTLerror ("%s: Can't get clock freq\n", thisfunc); + + if (verbose) + printf ("%s: Clock rate = %f MHz\n", thisfunc, cpumhz); + + cyc2sec = 1./(cpumhz * 1.e6); + return 0; +#else + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_nanotime () +{ +#ifdef HAVE_NANOTIME + double timestamp; + timestamp = nanotime () * cyc2sec; + return timestamp; +#else + static const char *thisfunc = "utr_nanotime"; + (void) GPTLerror ("%s: not enabled\n", thisfunc); + return -1.; +#endif +} + +/* +** MPI_Wtime requires the MPI lib. +*/ + +static int init_mpiwtime () +{ +#ifdef HAVE_MPI + return 0; +#else + static const char *thisfunc = "init_mpiwtime"; + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_mpiwtime () +{ +#ifdef HAVE_MPI + return MPI_Wtime (); +#else + static const char *thisfunc = "utr_mpiwtime"; + (void) GPTLerror ("%s: not enabled\n", thisfunc); + return -1.; +#endif +} + +/* +** PAPI_get_real_usec requires the PAPI lib. +*/ + +static int init_papitime () +{ + static const char *thisfunc = "init_papitime"; +#ifdef HAVE_PAPI + ref_papitime = PAPI_get_real_usec (); + if (verbose) + printf ("%s: ref_papitime=%ld\n", thisfunc, (long) ref_papitime); + return 0; +#else + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_papitime () +{ +#ifdef HAVE_PAPI + return (PAPI_get_real_usec () - ref_papitime) * 1.e-6; +#else + static const char *thisfunc = "utr_papitime"; + (void) GPTLerror ("%s: not enabled\n", thisfunc); + return -1.; +#endif +} + +/* +** Probably need to link with -lrt for this one to work +*/ + +static int init_clock_gettime () +{ + static const char *thisfunc = "init_clock_gettime"; +#ifdef HAVE_LIBRT + struct timespec tp; + (void) clock_gettime (CLOCK_REALTIME, &tp); + ref_clock_gettime = tp.tv_sec; + if (verbose) + printf ("%s: ref_clock_gettime=%ld\n", thisfunc, (long) ref_clock_gettime); + return 0; +#else + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_clock_gettime () +{ +#ifdef HAVE_LIBRT + struct timespec tp; + (void) clock_gettime (CLOCK_REALTIME, &tp); + return (tp.tv_sec - ref_clock_gettime) + 1.e-9*tp.tv_nsec; +#else + static const char *thisfunc = "utr_clock_gettime"; + (void) GPTLerror ("%s: not enabled\n", thisfunc); + return -1.; +#endif +} + +/* +** High-res timer on AIX: read_real_time +*/ + +static int init_read_real_time () +{ + static const char *thisfunc = "init_read_real_time"; +#ifdef _AIX + timebasestruct_t ibmtime; + (void) read_real_time (&ibmtime, TIMEBASE_SZ); + (void) time_base_to_time (&ibmtime, TIMEBASE_SZ); + ref_read_real_time = ibmtime.tb_high; + if (verbose) + printf ("%s: ref_read_real_time=%ld\n", thisfunc, (long) ref_read_real_time); + return 0; +#else + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_read_real_time () +{ +#ifdef _AIX + timebasestruct_t ibmtime; + (void) read_real_time (&ibmtime, TIMEBASE_SZ); + (void) time_base_to_time (&ibmtime, TIMEBASE_SZ); + return (ibmtime.tb_high - ref_read_real_time) + 1.e-9*ibmtime.tb_low; +#else + static const char *thisfunc = "utr_read_real_time"; + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +/* +** Default available most places: gettimeofday +*/ + +static int init_gettimeofday () +{ + static const char *thisfunc = "init_gettimeofday"; +#ifdef HAVE_GETTIMEOFDAY + struct timeval tp; + (void) gettimeofday (&tp, 0); + ref_gettimeofday = tp.tv_sec; + if (verbose) + printf ("%s: ref_gettimeofday=%ld\n", thisfunc, (long) ref_gettimeofday); + return 0; +#else + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +static inline double utr_gettimeofday () +{ +#ifdef HAVE_GETTIMEOFDAY + struct timeval tp; + (void) gettimeofday (&tp, 0); + return (tp.tv_sec - ref_gettimeofday) + 1.e-6*tp.tv_usec; +#else + static const char *thisfunc = "utr_gettimeofday"; + return GPTLerror ("%s: not enabled\n", thisfunc); +#endif +} + +/* +** Determine underlying timing routine overhead: call it 1000 times. +*/ + +static double utr_getoverhead () +{ + double val2[1001]; + int i; + + val2[0] = (*ptr2wtimefunc)(); + for (i = 1; i < 1001; ++i) { + val2[i] = (*ptr2wtimefunc)(); + } + return 0.001 * (val2[1000] - val2[0]); +} + +/* +** printself_andchildren: Recurse through call tree, printing stats for self, then children +*/ + +static void printself_andchildren (const Timer *ptr, + FILE *fp, + const int t, + const int depth, + const double tot_overhead) +{ + int n; + + if (depth > -1) /* -1 flag is to avoid printing stats for dummy outer timer */ + printstats (ptr, fp, t, depth, true, tot_overhead); + + for (n = 0; n < ptr->nchildren; n++) + printself_andchildren (ptr->children[n], fp, t, depth+1, tot_overhead); +} + +#ifdef ENABLE_PMPI +/* +** GPTLgetentry: called ONLY from pmpi.c (i.e. not a public entry point). Returns a pointer to the +** requested timer name by calling internal function getentry() +** +** Return value: 0 (NULL) or the return value of getentry() +*/ + +Timer *GPTLgetentry (const char *timername) +{ + int t; /* thread number */ + int numchars; /* number of characters to copy */ + int namelen; /* number of characters in timer name */ + unsigned int indx; /* returned from getentry (unused) */ + char new_name[MAX_CHARS+1]; /* timer name with prefix, if there is one */ + char *name; /* pointer to timer name */ + static const char *thisfunc = "GPTLgetentry"; + + if ( ! initialized) { + (void) GPTLerror ("%s: initialization was not completed\n", thisfunc); + return 0; + } + + if ((t = get_thread_num ()) < 0) { + (void) GPTLerror ("%s: bad return from get_thread_num\n", thisfunc); + return 0; + } + + /* + ** If prefix string is defined, prepend it to timername + ** and assign the name pointer to the new string. + ** Otherwise assign the name pointer to the original string. + */ + + if ((prefix_len[t] > 0) || (prefix_len_nt > 0)){ + namelen = strlen(timername); + numchars = add_prefix(new_name, timername, namelen, t); + name = new_name; + } else { + name = timername; + } + + return (getentry (hashtable[t], name, &indx)); +} + +/* +** GPTLpr_file_has_been_called: Called ONLY from pmpi.c (i.e. not a public entry point). Return +** whether GPTLpr_file has been called. MPI_Finalize wrapper needs +** to know whether it needs to call GPTLpr. +*/ + +int GPTLpr_has_been_called (void) +{ + return (int) pr_has_been_called; +} + +#endif + +/*************************************************************************************/ + +/* +** Contents of inserted threadutil.c starts here. +** Moved to gptl.c to enable inlining +*/ + +/* +** $Id: gptl.c,v 1.157 2011-03-28 20:55:18 rosinski Exp $ +** +** Author: Jim Rosinski +** +** Utility functions handle thread-based GPTL needs. +*/ + +/* Max allowable number of threads (used only when THREADED_PTHREADS is true) */ +#define MAX_THREADS 128 + +/**********************************************************************************/ +/* +** 3 sets of routines: OMP threading, PTHREADS, unthreaded +*/ + +#if ( defined THREADED_OMP ) + +/* +** threadinit: Allocate and initialize threadid_omp; set max number of threads +** +** Output results: +** maxthreads: max number of threads +** +** threadid_omp[] is allocated and initialized to -1 +** +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static int threadinit (void) +{ + int t; /* loop index */ + static const char *thisfunc = "threadinit"; + + if (omp_get_thread_num () != 0) + return GPTLerror ("OMP %s: MUST only be called by the master thread\n", thisfunc); + + /* + ** Allocate the threadid array which maps physical thread IDs to logical IDs + ** For OpenMP this will be just threadid_omp[iam] = iam; + */ + + if (threadid_omp) + return GPTLerror ("OMP %s: has already been called.\nMaybe mistakenly called by multiple threads?", + thisfunc); + + /* + ** maxthreads may have been set by the user, in which case use that. But if as + ** yet uninitialized, set to the current value of OMP_NUM_THREADS. + */ + if (maxthreads == -1) + maxthreads = MAX ((1), (omp_get_max_threads ())); + + if ( ! (threadid_omp = (int *) GPTLallocate (maxthreads * sizeof (int)))) + return GPTLerror ("OMP %s: malloc failure for %d elements of threadid_omp\n", thisfunc, maxthreads); + + /* + ** Initialize threadid array to flag values for use by get_thread_num(). + ** get_thread_num() will fill in the values on first use. + */ + + for (t = 0; t < maxthreads; ++t) + threadid_omp[t] = -1; + +#ifdef VERBOSE + printf ("OMP %s: Set maxthreads=%d\n", thisfunc, maxthreads); +#endif + + return 0; +} + +/* +** Threadfinalize: clean up +** +** Output results: +** threadid_omp array is freed and array pointer nullified +*/ + +static void threadfinalize () +{ + free ((void *) threadid_omp); + threadid_omp = 0; +} + +/* +** get_thread_num: Determine thread number of the calling thread +** Start PAPI counters if enabled and first call for this thread. +** +** Output results: +** nthreads: Number of threads (=maxthreads) +** threadid_omp: Our thread id added to list on 1st call +** +** Return value: thread number (success) or GPTLerror (failure) +*/ + +static inline int get_thread_num (void) +{ + int t; /* thread number */ + static const char *thisfunc = "get_thread_num"; + + if ((t = omp_get_thread_num ()) >= maxthreads) + return GPTLerror ("OMP %s: returned id=%d exceeds maxthreads=%d\n", thisfunc, t, maxthreads); + + /* + ** If our thread number has already been set in the list, we are done + */ + + if (t == threadid_omp[t]) + return t; + + /* + ** Thread id not found. Modify threadid_omp with our ID, then start PAPI events if required. + ** Due to the setting of threadid_omp, everything below here will only execute once per thread. + */ + + threadid_omp[t] = t; + +#ifdef VERBOSE + printf ("OMP %s: 1st call t=%d\n", thisfunc, t); +#endif + +#ifdef HAVE_PAPI + + /* + ** When HAVE_PAPI is true, if 1 or more PAPI events are enabled, + ** create and start an event set for the new thread. + */ + + if (GPTLget_npapievents () > 0) { +#ifdef VERBOSE + printf ("OMP %s: Starting EventSet t=%d\n", thisfunc, t); +#endif + + if (GPTLcreate_and_start_events (t) < 0) + return GPTLerror ("OMP %s: error from GPTLcreate_and_start_events for thread %d\n", thisfunc, t); + } +#endif + + /* + ** nthreads = maxthreads based on setting in threadinit + */ + + nthreads = maxthreads; +#ifdef VERBOSE + printf ("OMP %s: nthreads=%d\n", thisfunc, nthreads); +#endif + + return t; +} + +static void print_threadmapping (FILE *fp) +{ + int n; + + fprintf (fp, "\n"); + fprintf (fp, "Thread mapping:\n"); + for (n = 0; n < nthreads; ++n) + fprintf (fp, "threadid_omp[%d] = %d\n", n, threadid_omp[n]); +} + +/* +** serial_region: determine whether in a serial or parallel region +** +** Return value: true (1) or false (0) +*/ + +static int serial_region () +{ + + /* + ** This test is more robust than 'omp_in_parallel', which is true + ** in a parallel region when only one thread is active, which may + ** not be thread 0. Other active thread teams also will not be + ** recognized. + */ + if ( (omp_get_num_threads()==1 ) && ( omp_get_level()==0 ) ){ + return 1; + } else { + return 0; + } + +} + +/**********************************************************************************/ +/* +** PTHREADS +*/ + +#elif ( defined THREADED_PTHREADS ) + +/* +** threadinit: Allocate threadid and initialize to -1; set max number of threads; +** Initialize the mutex for later use; Initialize nthreads to 0 +** +** Output results: +** nthreads: number of threads (init to zero here, increment later in get_thread_num) +** maxthreads: max number of threads (MAX_THREADS) +** +** threadid[] is allocated and initialized to -1 +** mutex is initialized for future use +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +static int threadinit (void) +{ + int t; /* thread number */ + int ret; /* return code */ + static const char *thisfunc = "threadinit"; + + /* + ** The following test is not rock-solid, but it's pretty close in terms of guaranteeing that + ** threadinit gets called by only 1 thread. Problem is, mutex hasn't yet been initialized + ** so we can't use it. + */ + + if (nthreads == -1) + nthreads = 0; + else + return GPTLerror ("PTHREADS %s: has already been called.\n" + "Maybe mistakenly called by multiple threads?\n", thisfunc); + + /* + ** Initialize the mutex required for critical regions. + ** Previously, t_mutex = PTHREAD_MUTEX_INITIALIZER on the static declaration line was + ** adequate to initialize the mutex. But this failed in programs that invoked + ** GPTLfinalize() followed by GPTLinitialize(). + ** "man pthread_mutex_init" indicates that passing NULL as the second argument to + ** pthread_mutex_init() should appropriately initialize the mutex, assuming it was + ** properly destroyed by a previous call to pthread_mutex_destroy(); + */ + +#ifdef MUTEX_API + if ((ret = pthread_mutex_init ((pthread_mutex_t *) &t_mutex, NULL)) != 0) + return GPTLerror ("PTHREADS %s: mutex init failure: ret=%d\n", thisfunc, ret); +#endif + + /* + ** Allocate the threadid array which maps physical thread IDs to logical IDs + */ + + if (threadid) + return GPTLerror ("PTHREADS %s: threadid not null\n", thisfunc); + else if ( ! (threadid = (pthread_t *) GPTLallocate (MAX_THREADS * sizeof (pthread_t)))) + return GPTLerror ("PTHREADS %s: malloc failure for %d elements of threadid\n", thisfunc, MAX_THREADS); + + maxthreads = MAX_THREADS; + + /* + ** Initialize threadid array to flag values for use by get_thread_num(). + ** get_thread_num() will fill in the values on first use. + */ + + for (t = 0; t < maxthreads; ++t) + threadid[t] = (pthread_t) -1; + +#ifdef VERBOSE + printf ("PTHREADS %s: Set maxthreads=%d nthreads=%d\n", thisfunc, maxthreads, nthreads); +#endif + + return 0; +} + +/* +** threadfinalize: Clean up +** +** Output results: +** threadid array is freed and array pointer nullified +** mutex is destroyed +*/ + +static void threadfinalize () +{ + int ret; + +#ifdef MUTEX_API + if ((ret = pthread_mutex_destroy ((pthread_mutex_t *) &t_mutex)) != 0) + printf ("threadfinalize: failed attempt to destroy t_mutex: ret=%d\n", ret); +#endif + free ((void *) threadid); + threadid = 0; +} + +/* +** get_thread_num: Determine zero-based thread number of the calling thread. +** Update nthreads and maxthreads if necessary. +** Start PAPI counters if enabled and first call for this thread. +** +** Output results: +** nthreads: Updated number of threads +** threadid: Our thread id added to list on 1st call +** +** Return value: thread number (success) or GPTLerror (failure) +*/ + +static inline int get_thread_num (void) +{ + int t; /* logical thread number, defined by array index of found threadid */ + pthread_t mythreadid; /* thread id from pthreads library */ + int retval; /* value to return to caller */ + bool foundit = false; /* thread id found in list */ + static const char *thisfunc = "get_thread_num"; + + mythreadid = pthread_self (); + + /* + ** If our thread number has already been set in the list, we are done + ** VECTOR code should run a bit faster on vector machines. + */ +#define VECTOR +#ifdef VECTOR + for (t = 0; t < nthreads; ++t) + if (pthread_equal (mythreadid, threadid[t])) { + foundit = true; + retval = t; + } + + if (foundit) + return retval; +#else + for (t = 0; t < nthreads; ++t) + if (pthread_equal (mythreadid, threadid[t])) + return t; +#endif + + /* + ** Thread id not found. Define a critical region, then start PAPI counters if + ** necessary and modify threadid[] with our id. + */ + + if (lock_mutex () < 0) + return GPTLerror ("PTHREADS %s: mutex lock failure\n", thisfunc); + + /* + ** If our thread id is not in the known list, add to it after checking that + ** we do not have too many threads. + */ + + if (nthreads >= MAX_THREADS) { + if (unlock_mutex () < 0) + fprintf (stderr, "PTHREADS %s: mutex unlock failure\n", thisfunc); + + return GPTLerror ("PTHREADS %s: nthreads=%d is too big. Recompile " + "with larger value of MAX_THREADS\n", thisfunc, nthreads); + } + + threadid[nthreads] = mythreadid; + +#ifdef VERBOSE + printf ("PTHREADS %s: 1st call threadid=%lu maps to location %d\n", + thisfunc, (unsigned long) mythreadid, nthreads); +#endif + +#ifdef HAVE_PAPI + + /* + ** When HAVE_PAPI is true, if 1 or more PAPI events are enabled, + ** create and start an event set for the new thread. + */ + + if (GPTLget_npapievents () > 0) { +#ifdef VERBOSE + printf ("PTHREADS get_thread_num: Starting EventSet threadid=%lu location=%d\n", + (unsigned long) mythreadid, nthreads); +#endif + if (GPTLcreate_and_start_events (nthreads) < 0) { + if (unlock_mutex () < 0) + fprintf (stderr, "PTHREADS %s: mutex unlock failure\n", thisfunc); + + return GPTLerror ("PTHREADS %s: error from GPTLcreate_and_start_events for thread %d\n", + thisfunc, nthreads); + } + } +#endif + + /* + ** IMPORTANT to set return value before unlocking the mutex!!!! + ** "return nthreads-1" fails occasionally when another thread modifies + ** nthreads after it gets the mutex! + */ + + retval = nthreads++; + +#ifdef VERBOSE + printf ("PTHREADS get_thread_num: nthreads bumped to %d\n", nthreads); +#endif + + if (unlock_mutex () < 0) + return GPTLerror ("PTHREADS %s: mutex unlock failure\n", thisfunc); + + return retval; +} + +/* +** lock_mutex: lock a mutex for private access +*/ + +static int lock_mutex () +{ + static const char *thisfunc = "lock_mutex"; + + if (pthread_mutex_lock ((pthread_mutex_t *) &t_mutex) != 0) + return GPTLerror ("%s: failure from pthread_lock_mutex\n", thisfunc); + + return 0; +} + +/* +** unlock_mutex: unlock a mutex from private access +*/ + +static int unlock_mutex () +{ + static const char *thisfunc = "unlock_mutex"; + + if (pthread_mutex_unlock ((pthread_mutex_t *) &t_mutex) != 0) + return GPTLerror ("%s: failure from pthread_unlock_mutex\n", thisfunc); + return 0; +} + +static void print_threadmapping (FILE *fp) +{ + int t; + + fprintf (fp, "\n"); + fprintf (fp, "Thread mapping:\n"); + for (t = 0; t < nthreads; ++t) + fprintf (fp, "threadid[%d] = %lu\n", t, (unsigned long) threadid[t]); +} + +/* +** serial_region: determine whether in a serial or parallel region +** +** Not currently implemented (or even defined) when using PTHREADS/ +** It is an error if this is ever called. +** +** Return value: true (1) or false (0) +*/ + +static int serial_region () +{ + static const char *thisfunc = "serial_region"; + + return GPTLerror ("%s: not supported for THREADED_PTHREADS\n", thisfunc); + +} + +/**********************************************************************************/ +/* +** Unthreaded case +*/ + +#else + +static int threadinit (void) +{ + static const char *thisfunc = "threadinit"; + + if (nthreads != -1) + return GPTLerror ("Unthreaded %s: MUST only be called once", thisfunc); + + nthreads = 0; + maxthreads = 1; + return 0; +} + +void threadfinalize () +{ + threadid = -1; +} + +static inline int get_thread_num () +{ + static const char *thisfunc = "get_thread_num"; +#ifdef HAVE_PAPI + /* + ** When HAVE_PAPI is true, if 1 or more PAPI events are enabled, + ** create and start an event set for the new thread. + */ + + if (threadid == -1 && GPTLget_npapievents () > 0) { + if (GPTLcreate_and_start_events (0) < 0) + return GPTLerror ("Unthreaded %s: error from GPTLcreate_and_start_events for thread %0\n", thisfunc); + + threadid = 0; + } +#endif + + nthreads = 1; + return 0; +} + +static void print_threadmapping (FILE *fp) +{ + fprintf (fp, "\n"); + fprintf (fp, "threadid[0] = 0\n"); +} + +/* +** serial_region: determine whether in a serial or parallel region +** +** Return value: true (1) or false (0) +*/ + +static int serial_region () +{ + return 1; +} + +#endif diff --git a/share/timing/gptl.h b/share/timing/gptl.h new file mode 100644 index 0000000..e701545 --- /dev/null +++ b/share/timing/gptl.h @@ -0,0 +1,184 @@ +/* +** $Id: gptl.h,v 1.59 2011-03-28 20:55:19 rosinski Exp $ +** +** Author: Jim Rosinski +** +** GPTL header file to be included in user code +*/ + +#ifndef GPTL_H +#define GPTL_H + +#ifdef INCLUDE_CMAKE_FCI +#include "cmake_fortran_c_interface.h" +#endif + +/* following block for camtimers only */ +#ifndef NO_GETTIMEOFDAY +#ifndef HAVE_GETTIMEOFDAY +#define HAVE_GETTIMEOFDAY +#endif +#endif + +#ifdef SPMD +#undef HAVE_MPI +#define HAVE_MPI +#endif + +#ifdef _OPENMP +#ifndef THREADED_PTHREADS +#define THREADED_OMP +#endif +#endif +/* above block for camtimers only */ + +#ifdef HAVE_MPI +#include +#endif + +/* +** Options settable by a call to GPTLsetoption() (default in parens) +** These numbers need to be small integers because GPTLsetoption can +** be passed PAPI counters, and we need to avoid collisions in that +** integer space. PAPI presets are big negative integers, and PAPI +** native events are big positive integers. +*/ + +typedef enum { + GPTLsync_mpi = 0, /* Synchronize before certain MPI calls (PMPI-mode only) */ + GPTLwall = 1, /* Collect wallclock stats (true) */ + GPTLcpu = 2, /* Collect CPU stats (false)*/ + GPTLabort_on_error = 3, /* Abort on failure (false) */ + GPTLoverhead = 4, /* Estimate overhead of underlying timing routine (true) */ + GPTLdepthlimit = 5, /* Only print timers this depth or less in the tree (inf) */ + GPTLverbose = 6, /* Verbose output (false) */ + GPTLnarrowprint = 7, /* Print PAPI and derived stats in 8 columns not 16 (true) */ + GPTLpercent = 9, /* Add a column for percent of first timer (false) */ + GPTLpersec = 10, /* Add a PAPI column that prints "per second" stats (true) */ + GPTLmultiplex = 11, /* Allow PAPI multiplexing (false) */ + GPTLdopr_preamble = 12, /* Print preamble info (true) */ + GPTLdopr_threadsort = 13, /* Print sorted thread stats (true) */ + GPTLdopr_multparent = 14, /* Print multiple parent info (true) */ + GPTLdopr_collision = 15, /* Print hastable collision info (true) */ + GPTLprint_method = 16, /* Tree print method: first parent, last parent + most frequent, or full tree (most frequent) */ + GPTLprint_mode = 50, /* Write mode for output file (overwrite, append) */ + GPTLtablesize = 51, /* per-thread size of hash table (1024) */ + GPTLmaxthreads = 52, /* maximum number of threads */ + /* + ** These are derived counters based on PAPI counters. All default to false + */ + GPTL_IPC = 17, /* Instructions per cycle */ + GPTL_CI = 18, /* Computational intensity */ + GPTL_FPC = 19, /* FP ops per cycle */ + GPTL_FPI = 20, /* FP ops per instruction */ + GPTL_LSTPI = 21, /* Load-store instruction fraction */ + GPTL_DCMRT = 22, /* L1 miss rate (fraction) */ + GPTL_LSTPDCM = 23, /* Load-stores per L1 miss */ + GPTL_L2MRT = 24, /* L2 miss rate (fraction) */ + GPTL_LSTPL2M = 25, /* Load-stores per L2 miss */ + GPTL_L3MRT = 26, /* L3 read miss rate (fraction) */ + /* + ** New ESMF options for GPTL + */ + GPTLprofile_ovhd = 27, /* Direct measurement of profiling overhead (false) */ + GPTLdopr_quotes = 28 /* Add double quotes to timer names on output (false) */ +} Option; + +/* +** Underlying wallclock timer: optimize for best granularity with least overhead. +** These numbers need not be distinct from the above because these are passed +** to GPTLsetutr() and the above are passed to GPTLsetoption() +*/ + +typedef enum { + GPTLgettimeofday = 1, /* the default */ + GPTLnanotime = 2, /* only available on x86 */ + GPTLmpiwtime = 4, /* MPI_Wtime */ + GPTLclockgettime = 5, /* clock_gettime */ + GPTLpapitime = 6, /* only if PAPI is available */ + GPTLread_real_time = 3 /* AIX only */ +} Funcoption; + +/* +** How to report parent/child relationships at print time (for children with multiple parents) +*/ + +typedef enum { + GPTLfirst_parent = 1, /* first parent found */ + GPTLlast_parent = 2, /* last parent found */ + GPTLmost_frequent = 3, /* most frequent parent (default) */ + GPTLfull_tree = 4 /* complete call tree */ +} Method; + +/* +** Whether to overwrite or append to output file +*/ + +typedef enum { + GPTLprint_write = 1, /* overwrite */ + GPTLprint_append = 2, /* append */ +} PRMode; + +/* +** Function prototypes +*/ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int GPTLsetoption (const int, const int); +extern int GPTLinitialize (void); +extern int GPTLprefix_set (const char *); +extern int GPTLprefix_setf (const char *, const int); +extern int GPTLprefix_unset (void); +extern int GPTLstart (const char *); +extern int GPTLstart_handle (const char *, void **); +extern int GPTLstartf (const char *, const int); +extern int GPTLstartf_handle (const char *, const int, void **); +extern int GPTLstop (const char *); +extern int GPTLstopf (const char *, const int); +extern int GPTLstop_handle (const char *, void **); +extern int GPTLstopf_handle (const char *, const int, void **); +extern int GPTLstartstop_vals (const char *, double, int); +extern int GPTLstartstop_valsf (const char *, const int, double, int); +extern int GPTLstamp (double *, double *, double *); +extern int GPTLprint_mode_query (void); +extern int GPTLprint_mode_set (const int); +extern int GPTLpr (const int); +extern int GPTLpr_file (const char *); + +#ifdef HAVE_MPI +extern int GPTLpr_summary (MPI_Comm comm); +extern int GPTLpr_summary_file (MPI_Comm, const char *); +extern int GPTLbarrier (MPI_Comm comm, const char *); +#else +extern int GPTLpr_summary (int); +extern int GPTLpr_summary_file (int, const char *); +extern int GPTLbarrier (int, const char *); +#endif + +extern int GPTLreset (void); +extern int GPTLfinalize (void); +extern int GPTLget_memusage (int *, int *, int *, int *, int *); +extern int GPTLprint_memusage (const char *); +extern int GPTLenable (void); +extern int GPTLdisable (void); +extern int GPTLsetutr (const int); +extern int GPTLquery (const char *, int, int *, int *, double *, double *, double *, + long long *, const int); +extern int GPTLquerycounters (const char *, int, long long *); +extern int GPTLget_wallclock (const char *, int, double *); +extern int GPTLget_eventvalue (const char *, const char *, int, double *); +extern int GPTLget_nregions (int, int *); +extern int GPTLget_regionname (int, int, char *, int); +extern int GPTL_PAPIlibraryinit (void); +extern int GPTLevent_name_to_code (const char *, int *); +extern int GPTLevent_code_to_name (const int, char *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/share/timing/gptl.inc b/share/timing/gptl.inc new file mode 100644 index 0000000..4f46cf9 --- /dev/null +++ b/share/timing/gptl.inc @@ -0,0 +1,188 @@ +! +! $Id: gptl.inc,v 1.44 2011-03-28 20:55:19 rosinski Exp $ +! +! Author: Jim Rosinski +! +! GPTL header file to be included in user code. Values match +! their counterparts in gptl.h. See that file or man pages +! or web-based documenation for descriptions of each value +! + integer GPTLsync_mpi + integer GPTLwall + integer GPTLcpu + integer GPTLabort_on_error + integer GPTLoverhead + integer GPTLdepthlimit + integer GPTLverbose + integer GPTLnarrowprint + integer GPTLpercent + integer GPTLpersec + integer GPTLmultiplex + integer GPTLdopr_preamble + integer GPTLdopr_threadsort + integer GPTLdopr_multparent + integer GPTLdopr_collision + integer GPTLprint_method + integer GPTLprint_mode + integer GPTLtablesize + integer GPTLmaxthreads + + integer GPTL_IPC + integer GPTL_CI + integer GPTL_FPC + integer GPTL_FPI + integer GPTL_LSTPI + integer GPTL_DCMRT + integer GPTL_LSTPDCM + integer GPTL_L2MRT + integer GPTL_LSTPL2M + integer GPTL_L3MRT + + integer GPTLprofile_ovhd + integer GPTLdopr_quotes + + integer GPTLnanotime + integer GPTLmpiwtime + integer GPTLclockgettime + integer GPTLgettimeofday + integer GPTLpapitime + integer GPTLread_real_time + + integer GPTLfirst_parent + integer GPTLlast_parent + integer GPTLmost_frequent + integer GPTLfull_tree + + integer GPTLprint_write + integer GPTLprint_append + + parameter (GPTLsync_mpi = 0) + parameter (GPTLwall = 1) + parameter (GPTLcpu = 2) + parameter (GPTLabort_on_error = 3) + parameter (GPTLoverhead = 4) + parameter (GPTLdepthlimit = 5) + parameter (GPTLverbose = 6) + parameter (GPTLnarrowprint = 7) + parameter (GPTLpercent = 9) + parameter (GPTLpersec = 10) + parameter (GPTLmultiplex = 11) + parameter (GPTLdopr_preamble = 12) + parameter (GPTLdopr_threadsort= 13) + parameter (GPTLdopr_multparent= 14) + parameter (GPTLdopr_collision = 15) + parameter (GPTLprint_method = 16) + parameter (GPTLprint_mode = 50) + parameter (GPTLtablesize = 51) + parameter (GPTLmaxthreads = 52) + + parameter (GPTL_IPC = 17) + parameter (GPTL_CI = 18) + parameter (GPTL_FPC = 19) + parameter (GPTL_FPI = 20) + parameter (GPTL_LSTPI = 21) + parameter (GPTL_DCMRT = 22) + parameter (GPTL_LSTPDCM = 23) + parameter (GPTL_L2MRT = 24) + parameter (GPTL_LSTPL2M = 25) + parameter (GPTL_L3MRT = 26) + + parameter (GPTLprofile_ovhd = 27) + parameter (GPTLdopr_quotes = 28) + + parameter (GPTLgettimeofday = 1) + parameter (GPTLnanotime = 2) + parameter (GPTLmpiwtime = 4) + parameter (GPTLclockgettime = 5) + parameter (GPTLpapitime = 6) + parameter (GPTLread_real_time = 3) + + parameter (GPTLfirst_parent = 1) + parameter (GPTLlast_parent = 2) + parameter (GPTLmost_frequent = 3) + parameter (GPTLfull_tree = 4) + + parameter (GPTLprint_write = 1) + parameter (GPTLprint_append = 2) + +! Externals + + integer gptlsetoption + integer gptlinitialize + integer gptlprefix_set + integer gptlprefix_setf + integer gptlprefix_unset + integer gptlstart + integer gptlstart_handle + integer gptlstartf + integer gptlstartf_handle + integer gptlstop + integer gptlstop_handle + integer gptlstopf + integer gptlstopf_handle + integer gptlstartstop_vals + integer gptlstartstop_valsf + integer gptlstamp + integer gptlprint_mode_query + integer gptlprint_mode_set + integer gptlpr + integer gptlpr_file + integer gptlpr_summary + integer gptlpr_summary_file + integer gptlbarrier + integer gptlreset + integer gptlfinalize + integer gptlget_memusage + integer gptlprint_memusage + integer gptlenable + integer gptldisable + integer gptlsetutr + integer gptlquery + integer gptlquerycounters + integer gptlget_wallclock + integer gptlget_eventvalue + integer gptlget_nregions + integer gptlget_regionname + integer gptl_papilibraryinit + integer gptlevent_name_to_code + integer gptlevent_code_to_name + + external gptlsetoption + external gptlinitialize + external gptlprefix_set + external gptlprefix_setf + external gptlprefix_unset + external gptlstart + external gptlstart_handle + external gptlstartf + external gptlstartf_handle + external gptlstop + external gptlstop_handle + external gptlstopf + external gptlstopf_handle + external gptlstartstop_vals + external gptlstartstop_valsf + external gptlstamp + external gptlprint_mode_query + external gptlprint_mode_set + external gptlpr + external gptlpr_file + external gptlpr_summary + external gptlpr_summary_file + external gptlbarrier + external gptlreset + external gptlfinalize + external gptlget_memusage + external gptlprint_memusage + external gptlenable + external gptldisable + external gptlsetutr + external gptlquery + external gptlquerycounters + external gptlget_wallclock + external gptlget_eventvalue + external gptlget_nregions + external gptlget_regionname + external gptl_papilibraryinit + external gptlevent_name_to_code + external gptlevent_code_to_name diff --git a/share/timing/gptl_papi.c b/share/timing/gptl_papi.c new file mode 100644 index 0000000..1f701cb --- /dev/null +++ b/share/timing/gptl_papi.c @@ -0,0 +1,1325 @@ +/* +** $Id: gptl_papi.c,v 1.79 2011-03-28 20:55:19 rosinski Exp $ +** +** Author: Jim Rosinski +** +** Contains routines which interface to PAPI library +*/ + +#include "private.h" +#include "gptl.h" + +#ifdef HAVE_PAPI + +#include +#include +#include +#include + +#if ( defined THREADED_OMP ) +#include +#elif ( defined THREADED_PTHREADS ) +#include +#endif + +/* Mapping of PAPI counters to short and long printed strings */ + +static const Entry papitable [] = { + {PAPI_L1_DCM, "PAPI_L1_DCM", "L1_DCM ", "L1_Dcache_miss ", "Level 1 data cache misses"}, + {PAPI_L1_ICM, "PAPI_L1_ICM", "L1_ICM ", "L1_Icache_miss ", "Level 1 instruction cache misses"}, + {PAPI_L2_DCM, "PAPI_L2_DCM", "L2_DCM ", "L2_Dcache_miss ", "Level 2 data cache misses"}, + {PAPI_L2_ICM, "PAPI_L2_ICM", "L2_ICM ", "L2_Icache_miss ", "Level 2 instruction cache misses"}, + {PAPI_L3_DCM, "PAPI_L3_DCM", "L3_DCM ", "L3_Dcache_miss ", "Level 3 data cache misses"}, + {PAPI_L3_ICM, "PAPI_L3_ICM", "L3_ICM ", "L3_Icache_miss ", "Level 3 instruction cache misses"}, + {PAPI_L1_TCM, "PAPI_L1_TCM", "L1_TCM ", "L1_cache_miss ", "Level 1 total cache misses"}, + {PAPI_L2_TCM, "PAPI_L2_TCM", "L2_TCM ", "L2_cache_miss ", "Level 2 total cache misses"}, + {PAPI_L3_TCM, "PAPI_L3_TCM", "L3_TCM ", "L3_cache_miss ", "Level 3 total cache misses"}, + {PAPI_CA_SNP, "PAPI_CA_SNP", "CA_SNP ", "Snoops ", "Snoops "}, + {PAPI_CA_SHR, "PAPI_CA_SHR", "CA_SHR ", "PAPI_CA_SHR ", "Request for shared cache line (SMP)"}, + {PAPI_CA_CLN, "PAPI_CA_CLN", "CA_CLN ", "PAPI_CA_CLN ", "Request for clean cache line (SMP)"}, + {PAPI_CA_INV, "PAPI_CA_INV", "CA_INV ", "PAPI_CA_INV ", "Request for cache line Invalidation (SMP)"}, + {PAPI_CA_ITV, "PAPI_CA_ITV", "CA_ITV ", "PAPI_CA_ITV ", "Request for cache line Intervention (SMP)"}, + {PAPI_L3_LDM, "PAPI_L3_LDM", "L3_LDM ", "L3_load_misses ", "Level 3 load misses"}, + {PAPI_L3_STM, "PAPI_L3_STM", "L3_STM ", "L3_store_misses ", "Level 3 store misses"}, + {PAPI_BRU_IDL,"PAPI_BRU_IDL","BRU_IDL ", "PAPI_BRU_IDL ", "Cycles branch units are idle"}, + {PAPI_FXU_IDL,"PAPI_FXU_IDL","FXU_IDL ", "PAPI_FXU_IDL ", "Cycles integer units are idle"}, + {PAPI_FPU_IDL,"PAPI_FPU_IDL","FPU_IDL ", "PAPI_FPU_IDL ", "Cycles floating point units are idle"}, + {PAPI_LSU_IDL,"PAPI_LSU_IDL","LSU_IDL ", "PAPI_LSU_IDL ", "Cycles load/store units are idle"}, + {PAPI_TLB_DM, "PAPI_TLB_DM" "TLB_DM ", "Data_TLB_misses ", "Data translation lookaside buffer misses"}, + {PAPI_TLB_IM, "PAPI_TLB_IM", "TLB_IM ", "Inst_TLB_misses ", "Instr translation lookaside buffer misses"}, + {PAPI_TLB_TL, "PAPI_TLB_TL", "TLB_TL ", "Tot_TLB_misses ", "Total translation lookaside buffer misses"}, + {PAPI_L1_LDM, "PAPI_L1_LDM", "L1_LDM ", "L1_load_misses ", "Level 1 load misses"}, + {PAPI_L1_STM, "PAPI_L1_STM", "L1_STM ", "L1_store_misses ", "Level 1 store misses"}, + {PAPI_L2_LDM, "PAPI_L2_LDM", "L2_LDM ", "L2_load_misses ", "Level 2 load misses"}, + {PAPI_L2_STM, "PAPI_L2_STM", "L2_STM ", "L2_store_misses ", "Level 2 store misses"}, + {PAPI_BTAC_M, "PAPI_BTAC_M", "BTAC_M ", "BTAC_miss ", "BTAC miss"}, + {PAPI_PRF_DM, "PAPI_PRF_DM", "PRF_DM ", "PAPI_PRF_DM ", "Prefetch data instruction caused a miss"}, + {PAPI_L3_DCH, "PAPI_L3_DCH", "L3_DCH ", "L3_DCache_Hit ", "Level 3 Data Cache Hit"}, + {PAPI_TLB_SD, "PAPI_TLB_SD", "TLB_SD ", "PAPI_TLB_SD ", "Xlation lookaside buffer shootdowns (SMP)"}, + {PAPI_CSR_FAL,"PAPI_CSR_FAL","CSR_FAL ", "PAPI_CSR_FAL ", "Failed store conditional instructions"}, + {PAPI_CSR_SUC,"PAPI_CSR_SUC","CSR_SUC ", "PAPI_CSR_SUC ", "Successful store conditional instructions"}, + {PAPI_CSR_TOT,"PAPI_CSR_TOT","CSR_TOT ", "PAPI_CSR_TOT ", "Total store conditional instructions"}, + {PAPI_MEM_SCY,"PAPI_MEM_SCY","MEM_SCY ", "Cyc_Stalled_Mem ", "Cycles Stalled Waiting for Memory Access"}, + {PAPI_MEM_RCY,"PAPI_MEM_RCY","MEM_RCY ", "Cyc_Stalled_MemR", "Cycles Stalled Waiting for Memory Read"}, + {PAPI_MEM_WCY,"PAPI_MEM_WCY","MEM_WCY ", "Cyc_Stalled_MemW", "Cycles Stalled Waiting for Memory Write"}, + {PAPI_STL_ICY,"PAPI_STL_ICY","STL_ICY ", "Cyc_no_InstrIss ", "Cycles with No Instruction Issue"}, + {PAPI_FUL_ICY,"PAPI_FUL_ICY","FUL_ICY ", "Cyc_Max_InstrIss", "Cycles with Maximum Instruction Issue"}, + {PAPI_STL_CCY,"PAPI_STL_CCY","STL_CCY ", "Cyc_No_InstrComp", "Cycles with No Instruction Completion"}, + {PAPI_FUL_CCY,"PAPI_FUL_CCY","FUL_CCY ", "Cyc_Max_InstComp", "Cycles with Maximum Instruction Completion"}, + {PAPI_HW_INT, "PAPI_HW_INT", "HW_INT ", "HW_interrupts ", "Hardware interrupts"}, + {PAPI_BR_UCN, "PAPI_BR_UCN", "BR_UCN ", "Uncond_br_instr ", "Unconditional branch instructions executed"}, + {PAPI_BR_CN, "PAPI_BR_CN", "BR_CN ", "Cond_br_instr_ex", "Conditional branch instructions executed"}, + {PAPI_BR_TKN, "PAPI_BR_TKN", "BR_TKN ", "Cond_br_instr_tk", "Conditional branch instructions taken"}, + {PAPI_BR_NTK, "PAPI_BR_NTK", "BR_NTK ", "Cond_br_instrNtk", "Conditional branch instructions not taken"}, + {PAPI_BR_MSP, "PAPI_BR_MSP", "BR_MSP ", "Cond_br_instrMPR", "Conditional branch instructions mispred"}, + {PAPI_BR_PRC, "PAPI_BR_PRC", "BR_PRC ", "Cond_br_instrCPR", "Conditional branch instructions corr. pred"}, + {PAPI_FMA_INS,"PAPI_FMA_INS","FMA_INS ", "FMA_instr_comp ", "FMA instructions completed"}, + {PAPI_TOT_IIS,"PAPI_TOT_IIS","TOT_IIS ", "Total_instr_iss ", "Total instructions issued"}, + {PAPI_TOT_INS,"PAPI_TOT_INS","TOT_INS ", "Total_instr_ex ", "Total instructions executed"}, + {PAPI_INT_INS,"PAPI_INT_INS","INT_INS ", "Int_instr_ex ", "Integer instructions executed"}, + {PAPI_FP_INS, "PAPI_FP_INS", "FP_INS ", "FP_instr_ex ", "Floating point instructions executed"}, + {PAPI_LD_INS, "PAPI_LD_INS", "LD_INS ", "Load_instr_ex ", "Load instructions executed"}, + {PAPI_SR_INS, "PAPI_SR_INS", "SR_INS ", "Store_instr_ex ", "Store instructions executed"}, + {PAPI_BR_INS, "PAPI_BR_INS", "BR_INS ", "br_instr_ex ", "Total branch instructions executed"}, + {PAPI_VEC_INS,"PAPI_VEC_INS","VEC_INS ", "Vec/SIMD_instrEx", "Vector/SIMD instructions executed"}, + {PAPI_RES_STL,"PAPI_RES_STL","RES_STL ", "Cyc_proc_stalled", "Cycles processor is stalled on resource"}, + {PAPI_FP_STAL,"PAPI_FP_STAL","FP_STAL ", "Cyc_any_FP_stall", "Cycles any FP units are stalled"}, + {PAPI_TOT_CYC,"PAPI_TOT_CYC","TOT_CYC ", "Total_cycles ", "Total cycles"}, + {PAPI_LST_INS,"PAPI_LST_INS","LST_INS ", "Tot_L/S_inst_ex ", "Total load/store inst. executed"}, + {PAPI_SYC_INS,"PAPI_SYC_INS","SYC_INS ", "Sync._inst._ex ", "Sync. inst. executed"}, + {PAPI_L1_DCH, "PAPI_L1_DCH", "L1_DCH ", "L1_D_Cache_Hit ", "L1 D Cache Hit"}, + {PAPI_L2_DCH, "PAPI_L2_DCH", "L2_DCH ", "L2_D_Cache_Hit ", "L2 D Cache Hit"}, + {PAPI_L1_DCA, "PAPI_L1_DCA", "L1_DCA ", "L1_D_Cache_Acc ", "L1 D Cache Access"}, + {PAPI_L2_DCA, "PAPI_L2_DCA", "L2_DCA ", "L2_D_Cache_Acc ", "L2 D Cache Access"}, + {PAPI_L3_DCA, "PAPI_L3_DCA", "L3_DCA ", "L3_D_Cache_Acc ", "L3 D Cache Access"}, + {PAPI_L1_DCR, "PAPI_L1_DCR", "L1_DCR ", "L1_D_Cache_Read ", "L1 D Cache Read"}, + {PAPI_L2_DCR, "PAPI_L2_DCR", "L2_DCR ", "L2_D_Cache_Read ", "L2 D Cache Read"}, + {PAPI_L3_DCR, "PAPI_L3_DCR", "L3_DCR ", "L3_D_Cache_Read ", "L3 D Cache Read"}, + {PAPI_L1_DCW, "PAPI_L1_DCW", "L1_DCW ", "L1_D_Cache_Write", "L1 D Cache Write"}, + {PAPI_L2_DCW, "PAPI_L2_DCW", "L2_DCW ", "L2_D_Cache_Write", "L2 D Cache Write"}, + {PAPI_L3_DCW, "PAPI_L3_DCW", "L3_DCW ", "L3_D_Cache_Write", "L3 D Cache Write"}, + {PAPI_L1_ICH, "PAPI_L1_ICH", "L1_ICH ", "L1_I_cache_hits ", "L1 instruction cache hits"}, + {PAPI_L2_ICH, "PAPI_L2_ICH", "L2_ICH ", "L2_I_cache_hits ", "L2 instruction cache hits"}, + {PAPI_L3_ICH, "PAPI_L3_ICH", "L3_ICH ", "L3_I_cache_hits ", "L3 instruction cache hits"}, + {PAPI_L1_ICA, "PAPI_L1_ICA", "L1_ICA ", "L1_I_cache_acc ", "L1 instruction cache accesses"}, + {PAPI_L2_ICA, "PAPI_L2_ICA", "L2_ICA ", "L2_I_cache_acc ", "L2 instruction cache accesses"}, + {PAPI_L3_ICA, "PAPI_L3_ICA", "L3_ICA ", "L3_I_cache_acc ", "L3 instruction cache accesses"}, + {PAPI_L1_ICR, "PAPI_L1_ICR", "L1_ICR ", "L1_I_cache_reads", "L1 instruction cache reads"}, + {PAPI_L2_ICR, "PAPI_L2_ICR", "L2_ICR ", "L2_I_cache_reads", "L2 instruction cache reads"}, + {PAPI_L3_ICR, "PAPI_L3_ICR", "L3_ICR ", "L3_I_cache_reads", "L3 instruction cache reads"}, + {PAPI_L1_ICW, "PAPI_L1_ICW", "L1_ICW ", "L1_I_cache_write", "L1 instruction cache writes"}, + {PAPI_L2_ICW, "PAPI_L2_ICW", "L2_ICW ", "L2_I_cache_write", "L2 instruction cache writes"}, + {PAPI_L3_ICW, "PAPI_L3_ICW", "L3_ICW ", "L3_I_cache_write", "L3 instruction cache writes"}, + {PAPI_L1_TCH, "PAPI_L1_TCH", "L1_TCH ", "L1_cache_hits ", "L1 total cache hits"}, + {PAPI_L2_TCH, "PAPI_L2_TCH", "L2_TCH ", "L2_cache_hits ", "L2 total cache hits"}, + {PAPI_L3_TCH, "PAPI_L3_TCH", "L3_TCH ", "L3_cache_hits ", "L3 total cache hits"}, + {PAPI_L1_TCA, "PAPI_L1_TCA", "L1_TCA ", "L1_cache_access ", "L1 total cache accesses"}, + {PAPI_L2_TCA, "PAPI_L2_TCA", "L2_TCA ", "L2_cache_access ", "L2 total cache accesses"}, + {PAPI_L3_TCA, "PAPI_L3_TCA", "L3_TCA ", "L3_cache_access ", "L3 total cache accesses"}, + {PAPI_L1_TCR, "PAPI_L1_TCR", "L1_TCR ", "L1_cache_reads ", "L1 total cache reads"}, + {PAPI_L2_TCR, "PAPI_L2_TCR", "L2_TCR ", "L2_cache_reads ", "L2 total cache reads"}, + {PAPI_L3_TCR, "PAPI_L3_TCR", "L3_TCR ", "L3_cache_reads ", "L3 total cache reads"}, + {PAPI_L1_TCW, "PAPI_L1_TCW", "L1_TCW ", "L1_cache_writes ", "L1 total cache writes"}, + {PAPI_L2_TCW, "PAPI_L2_TCW", "L2_TCW ", "L2_cache_writes ", "L2 total cache writes"}, + {PAPI_L3_TCW, "PAPI_L3_TCW", "L3_TCW ", "L3_cache_writes ", "L3 total cache writes"}, + {PAPI_FML_INS,"PAPI_FML_INS","FML_INS ", "FM_ins ", "FM ins"}, + {PAPI_FAD_INS,"PAPI_FAD_INS","FAD_INS ", "FA_ins ", "FA ins"}, + {PAPI_FDV_INS,"PAPI_FDV_INS","FDV_INS ", "FD_ins ", "FD ins"}, + {PAPI_FSQ_INS,"PAPI_FSQ_INS","FSQ_INS ", "FSq_ins ", "FSq ins"}, + {PAPI_FNV_INS,"PAPI_FNV_INS","FNV_INS ", "Finv_ins ", "Finv ins"}, + {PAPI_FP_OPS, "PAPI_FP_OPS", "FP_OPS ", "FP_ops_executed ", "Floating point operations executed"} +}; + +static const int npapientries = sizeof (papitable) / sizeof (Entry); +static int papieventlist[MAX_AUX]; /* list of PAPI events to be counted */ +static Pr_event pr_event[MAX_AUX]; /* list of events (PAPI or derived) */ + +/* Derived events */ +static const Entry derivedtable [] = { + {GPTL_IPC, "GPTL_IPC", "IPC ", "Instr_per_cycle ", "Instructions per cycle"}, + {GPTL_CI, "GPTL_CI", "CI ", "Comp_Intensity ", "Computational intensity"}, + {GPTL_FPC, "GPTL_FPC", "Flop/Cyc", "FP_Ops_per_cycle", "Floating point ops per cycle"}, + {GPTL_FPI, "GPTL_FPI", "Flop/Ins", "FP_Ops_per_instr", "Floating point ops per instruction"}, + {GPTL_LSTPI, "GPTL_LSTPI", "LST_frac", "LST_fraction ", "Load-store instruction fraction"}, + {GPTL_DCMRT, "GPTL_DCMRT", "DCMISRAT", "L1_Miss_Rate ", "L1 miss rate (fraction)"}, + {GPTL_LSTPDCM,"GPTL_LSTPDCM", "LSTPDCM ", "LST_per_L1_miss ", "Load-store instructions per L1 miss"}, + {GPTL_L2MRT, "GPTL_L2MRT", "L2MISRAT", "L2_Miss_Rate ", "L2 miss rate (fraction)"}, + {GPTL_LSTPL2M,"GPTL_LSTPL2M", "LSTPL2M ", "LST_per_L2_miss ", "Load-store instructions per L2 miss"}, + {GPTL_L3MRT, "GPTL_L3MRT", "L3MISRAT", "L3_Miss_Rate ", "L3 read miss rate (fraction)"} +}; +static const int nderivedentries = sizeof (derivedtable) / sizeof (Entry); + +static int npapievents = 0; /* number of PAPI events: initialize to 0 */ +static int nevents = 0; /* number of events: initialize to 0 */ +static int *EventSet; /* list of events to be counted by PAPI */ +static long_long **papicounters; /* counters returned from PAPI */ + +static const int BADCOUNT = -999999; /* Set counters to this when they are bad */ +static bool is_multiplexed = false; /* whether multiplexed (always start false)*/ +static bool narrowprint = true; /* only use 8 digits not 16 for counter prints */ +static bool persec = true; /* print PAPI stats per second */ +static bool enable_multiplexing = false; /* whether to try multiplexing */ +static bool verbose = false; /* output verbosity */ + +/* Function prototypes */ + +static int canenable (int); +static int canenable2 (int, int); +static int papievent_is_enabled (int); +static int already_enabled (int); +static int enable (int); +static int getderivedidx (int); + +/* +** GPTL_PAPIsetoption: enable or disable PAPI event defined by "counter". Called +** from GPTLsetoption. Since all events are off by default, val=false degenerates +** to a no-op. Coded this way to be consistent with the rest of GPTL +** +** Input args: +** counter: PAPI counter +** val: true or false for enable or disable +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTL_PAPIsetoption (const int counter, /* PAPI counter (or option) */ + const int val) /* true or false for enable or disable */ +{ + int n; /* loop index */ + int ret; /* return code */ + int numidx; /* numerator index */ + int idx; /* derived counter index */ + char eventname[PAPI_MAX_STR_LEN]; /* returned from PAPI_event_code_to_name */ + + /* + ** First, check for option which is not an actual counter + */ + + switch (counter) { + case GPTLverbose: + /* don't printf here--that'd duplicate what's in gptl.c */ + verbose = (bool) val; + return 0; + case GPTLmultiplex: + enable_multiplexing = (bool) val; + if (verbose) + printf ("GPTL_PAPIsetoption: boolean enable_multiplexing = %d\n", val); + return 0; + case GPTLnarrowprint: + narrowprint = (bool) val; + if (verbose) + printf ("GPTL_PAPIsetoption: boolean narrowprint = %d\n", val); + return 0; + case GPTLpersec: + persec = (bool) val; + if (verbose) + printf ("GPTL_PAPIsetoption: boolean persec = %d\n", val); + return 0; + default: + break; + } + + /* + ** If val is false, return an error if the event has already been enabled. + ** Otherwise just warn that attempting to disable a PAPI-based event + ** that has already been enabled doesn't work--for now it's just a no-op + */ + + if (! val) { + if (already_enabled (counter)) + return GPTLerror ("GPTL_PAPIsetoption: already enabled counter %d cannot be disabled\n", + counter); + else + if (verbose) + printf ("GPTL_PAPIsetoption: 'disable' %d currently is just a no-op\n", counter); + return 0; + } + + /* If the event has already been enabled for printing, exit */ + + if (already_enabled (counter)) + return GPTLerror ("GPTL_PAPIsetoption: counter %d has already been enabled\n", + counter); + + /* + ** Initialize PAPI if it hasn't already been done. + ** From here on down we can assume the intent is to enable (not disable) an option + */ + + if (GPTL_PAPIlibraryinit () < 0) + return GPTLerror ("GPTL_PAPIsetoption: PAPI library init error\n"); + + /* Ensure max nevents won't be exceeded */ + + if (nevents+1 > MAX_AUX) + return GPTLerror ("GPTL_PAPIsetoption: %d is too many events. Can be increased in private.h\n", + nevents+1); + + /* Check derived events */ + + switch (counter) { + case GPTL_IPC: + if ( ! canenable2 (PAPI_TOT_INS, PAPI_TOT_CYC)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_IPC unavailable\n"); + + idx = getderivedidx (GPTL_IPC); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_TOT_INS); + pr_event[nevents].denomidx = enable (PAPI_TOT_CYC); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_TOT_INS / PAPI_TOT_CYC\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + case GPTL_CI: + idx = getderivedidx (GPTL_CI); + if (canenable2 (PAPI_FP_OPS, PAPI_LST_INS)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_FP_OPS); + pr_event[nevents].denomidx = enable (PAPI_LST_INS); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_FP_OPS / PAPI_LST_INS\n", + pr_event[nevents].event.namestr); + } else if (canenable2 (PAPI_FP_OPS, PAPI_L1_DCA)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_FP_OPS); + pr_event[nevents].denomidx = enable (PAPI_L1_DCA); +#ifdef DEBUG + printf ("GPTL_PAPIsetoption: pr_event %d is derived and will be PAPI event %d / %d\n", + nevents, pr_event[nevents].numidx, pr_event[nevents].denomidx); +#endif + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_FP_OPS / PAPI_L1_DCA\n", + pr_event[nevents].event.namestr); + } else { + return GPTLerror ("GPTL_PAPIsetoption: GPTL_CI unavailable\n"); + } + ++nevents; + return 0; + case GPTL_FPC: + if ( ! canenable2 (PAPI_FP_OPS, PAPI_TOT_CYC)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_FPC unavailable\n"); + + idx = getderivedidx (GPTL_FPC); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_FP_OPS); + pr_event[nevents].denomidx = enable (PAPI_TOT_CYC); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_FP_OPS / PAPI_TOT_CYC\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + case GPTL_FPI: + if ( ! canenable2 (PAPI_FP_OPS, PAPI_TOT_INS)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_FPI unavailable\n"); + + idx = getderivedidx (GPTL_FPI); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_FP_OPS); + pr_event[nevents].denomidx = enable (PAPI_TOT_INS); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_FP_OPS / PAPI_TOT_INS\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + case GPTL_LSTPI: + idx = getderivedidx (GPTL_LSTPI); + if (canenable2 (PAPI_LST_INS, PAPI_TOT_INS)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_LST_INS); + pr_event[nevents].denomidx = enable (PAPI_TOT_INS); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_LST_INS / PAPI_TOT_INS\n", + pr_event[nevents].event.namestr); + } else if (canenable2 (PAPI_L1_DCA, PAPI_TOT_INS)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L1_DCA); + pr_event[nevents].denomidx = enable (PAPI_TOT_INS); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L1_DCA / PAPI_TOT_INS\n", + pr_event[nevents].event.namestr); + } else { + return GPTLerror ("GPTL_PAPIsetoption: GPTL_LSTPI unavailable\n"); + } + ++nevents; + return 0; + case GPTL_DCMRT: + if ( ! canenable2 (PAPI_L1_DCM, PAPI_L1_DCA)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_DCMRT unavailable\n"); + + idx = getderivedidx (GPTL_DCMRT); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L1_DCM); + pr_event[nevents].denomidx = enable (PAPI_L1_DCA); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L1_DCM / PAPI_L1_DCA\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + case GPTL_LSTPDCM: + idx = getderivedidx (GPTL_LSTPDCM); + if (canenable2 (PAPI_LST_INS, PAPI_L1_DCM)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_LST_INS); + pr_event[nevents].denomidx = enable (PAPI_L1_DCM); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_LST_INS / PAPI_L1_DCM\n", + pr_event[nevents].event.namestr); + } else if (canenable2 (PAPI_L1_DCA, PAPI_L1_DCM)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L1_DCA); + pr_event[nevents].denomidx = enable (PAPI_L1_DCM); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L1_DCA / PAPI_L1_DCM\n", + pr_event[nevents].event.namestr); + } else { + return GPTLerror ("GPTL_PAPIsetoption: GPTL_LSTPDCM unavailable\n"); + } + ++nevents; + return 0; + /* + ** For L2 counts, use TC* instead of DC* to avoid PAPI derived events + */ + case GPTL_L2MRT: + if ( ! canenable2 (PAPI_L2_TCM, PAPI_L2_TCA)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_L2MRT unavailable\n"); + + idx = getderivedidx (GPTL_L2MRT); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L2_TCM); + pr_event[nevents].denomidx = enable (PAPI_L2_TCA); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L2_TCM / PAPI_L2_TCA\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + case GPTL_LSTPL2M: + idx = getderivedidx (GPTL_LSTPL2M); + if (canenable2 (PAPI_LST_INS, PAPI_L2_TCM)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_LST_INS); + pr_event[nevents].denomidx = enable (PAPI_L2_TCM); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_LST_INS / PAPI_L2_TCM\n", + pr_event[nevents].event.namestr); + } else if (canenable2 (PAPI_L1_DCA, PAPI_L2_TCM)) { + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L1_DCA); + pr_event[nevents].denomidx = enable (PAPI_L2_TCM); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L1_DCA / PAPI_L2_TCM\n", + pr_event[nevents].event.namestr); + } else { + return GPTLerror ("GPTL_PAPIsetoption: GPTL_LSTPL2M unavailable\n"); + } + ++nevents; + return 0; + case GPTL_L3MRT: + if ( ! canenable2 (PAPI_L3_TCM, PAPI_L3_TCR)) + return GPTLerror ("GPTL_PAPIsetoption: GPTL_L3MRT unavailable\n"); + + idx = getderivedidx (GPTL_L3MRT); + pr_event[nevents].event = derivedtable[idx]; + pr_event[nevents].numidx = enable (PAPI_L3_TCM); + pr_event[nevents].denomidx = enable (PAPI_L3_TCR); + if (verbose) + printf ("GPTL_PAPIsetoption: enabling derived event %s = PAPI_L3_TCM / PAPI_L3_TCR\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + default: + break; + } + + /* Check PAPI presets */ + + for (n = 0; n < npapientries; n++) { + if (counter == papitable[n].counter) { + if ((numidx = papievent_is_enabled (counter)) >= 0) { + pr_event[nevents].event = papitable[n]; + pr_event[nevents].numidx = numidx; + pr_event[nevents].denomidx = -1; /* flag says not derived (no denominator) */ + } else if (canenable (counter)) { + pr_event[nevents].event = papitable[n]; + pr_event[nevents].numidx = enable (counter); + pr_event[nevents].denomidx = -1; /* flag says not derived (no denominator) */ + } else { + return GPTLerror ("GPTL_PAPIsetoption: Can't enable event \n", + papitable[n].longstr); + } + if (verbose) + printf ("GPTL_PAPIsetoption: enabling PAPI preset event %s\n", + pr_event[nevents].event.namestr); + ++nevents; + return 0; + } + } + + /* + ** Check native events last: If PAPI_event_code_to_name fails, give up + */ + + if ((ret = PAPI_event_code_to_name (counter, eventname)) != PAPI_OK) + return GPTLerror ("GPTL_PAPIsetoption: name not found for counter %d: PAPI_strerror: %s\n", + counter, PAPI_strerror (ret)); + + /* + ** A table with predefined names of various lengths does not exist for + ** native events. Just truncate eventname. + */ + + if ((numidx = papievent_is_enabled (counter)) >= 0) { + pr_event[nevents].event.counter = counter; + + pr_event[nevents].event.namestr = (char *) GPTLallocate (12+1); + strncpy (pr_event[nevents].event.namestr, eventname, 12); + pr_event[nevents].event.namestr[12] = '\0'; + + pr_event[nevents].event.str16 = (char *) GPTLallocate (16+1); + strncpy (pr_event[nevents].event.str16, eventname, 16); + pr_event[nevents].event.str16[16] = '\0'; + + pr_event[nevents].event.longstr = (char *) GPTLallocate (PAPI_MAX_STR_LEN); + strncpy (pr_event[nevents].event.longstr, eventname, PAPI_MAX_STR_LEN); + + pr_event[nevents].numidx = numidx; + pr_event[nevents].denomidx = -1; /* flag says not derived (no denominator) */ + } else if (canenable (counter)) { + pr_event[nevents].event.counter = counter; + + pr_event[nevents].event.namestr = (char *) GPTLallocate (12+1); + strncpy (pr_event[nevents].event.namestr, eventname, 12); + pr_event[nevents].event.namestr[12] = '\0'; + + pr_event[nevents].event.str16 = (char *) GPTLallocate (16+1); + strncpy (pr_event[nevents].event.str16, eventname, 16); + pr_event[nevents].event.str16[16] = '\0'; + + pr_event[nevents].event.longstr = (char *) GPTLallocate (PAPI_MAX_STR_LEN); + strncpy (pr_event[nevents].event.longstr, eventname, PAPI_MAX_STR_LEN); + + pr_event[nevents].numidx = enable (counter); + pr_event[nevents].denomidx = -1; /* flag says not derived (no denominator) */ + } else { + return GPTLerror ("GPTL_PAPIsetoption: Can't enable event %s\n", eventname); + } + + if (verbose) + printf ("GPTL_PAPIsetoption: enabling native event %s\n", pr_event[nevents].event.longstr); + + ++nevents; + return 0; +} + +/* +** canenable: determine whether a PAPI counter can be enabled +** +** Input args: +** counter: PAPI counter +** +** Return value: 0 (success) or non-zero (failure) +*/ + +int canenable (int counter) +{ + char eventname[PAPI_MAX_STR_LEN]; /* returned from PAPI_event_code_to_name */ + + if (npapievents+1 > MAX_AUX) + return false; + + if (PAPI_query_event (counter) != PAPI_OK) { + (void) PAPI_event_code_to_name (counter, eventname); + fprintf (stderr, "canenable: event %s not available on this arch\n", eventname); + return false; + } + + return true; +} + +/* +** canenable2: determine whether 2 PAPI counters can be enabled +** +** Input args: +** counter1: PAPI counter +** counter2: PAPI counter +** +** Return value: 0 (success) or non-zero (failure) +*/ + +int canenable2 (int counter1, int counter2) +{ + char eventname[PAPI_MAX_STR_LEN]; /* returned from PAPI_event_code_to_name */ + + if (npapievents+2 > MAX_AUX) + return false; + + if (PAPI_query_event (counter1) != PAPI_OK) { + (void) PAPI_event_code_to_name (counter1, eventname); + return false; + } + + if (PAPI_query_event (counter2) != PAPI_OK) { + (void) PAPI_event_code_to_name (counter2, eventname); + return false; + } + + return true; +} + +/* +** papievent_is_enabled: determine whether a PAPI counter has already been +** enabled. Used internally to keep track of PAPI counters enabled. A given +** PAPI counter may occur in the computation of multiple derived events, as +** well as output directly. E.g. PAPI_FP_OPS is used to compute +** computational intensity, and floating point ops per instruction. +** +** Input args: +** counter: PAPI counter +** +** Return value: index into papieventlist (success) or negative (not found) +*/ + +int papievent_is_enabled (int counter) +{ + int n; + + for (n = 0; n < npapievents; ++n) + if (papieventlist[n] == counter) + return n; + return -1; +} + +/* +** already_enabled: determine whether a PAPI-based event has already been +** enabled for printing. +** +** Input args: +** counter: PAPI or derived counter +** +** Return value: 1 (true) or 0 (false) +*/ + +int already_enabled (int counter) +{ + int n; + + for (n = 0; n < nevents; ++n) + if (pr_event[n].event.counter == counter) + return 1; + return 0; +} + +/* +** enable: enable a PAPI event. ASSUMES that canenable() has already determined +** that the event can be enabled. +** +** Input args: +** counter: PAPI counter +** +** Return value: index into papieventlist +*/ + +int enable (int counter) +{ + int n; + + /* If the event is already enabled, return its index */ + + for (n = 0; n < npapievents; ++n) { + if (papieventlist[n] == counter) { +#ifdef DEBUG + printf ("enable: PAPI event %d is %d\n", n, counter); +#endif + return n; + } + } + + /* New event */ + + papieventlist[npapievents++] = counter; + return npapievents-1; +} + +/* +** getderivedidx: find the table index of a derived counter +** +** Input args: +** counter: derived counter +** +** Return value: index into derivedtable (success) or GPTLerror (failure) +*/ + +int getderivedidx (int dcounter) +{ + int n; + + for (n = 0; n < nderivedentries; ++n) { + if (derivedtable[n].counter == dcounter) + return n; + } + return GPTLerror ("getderivedidx: failed to find derived counter %d\n", dcounter); +} + +/* +** GPTL_PAPIlibraryinit: Call PAPI_library_init if necessary +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTL_PAPIlibraryinit () +{ + int ret; + + if ((ret = PAPI_is_initialized ()) == PAPI_NOT_INITED) { + if ((ret = PAPI_library_init (PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) { + fprintf (stderr, "GPTL_PAPIlibraryinit: ret=%d PAPI_VER_CURRENT=%d\n", + ret, (int) PAPI_VER_CURRENT); + return GPTLerror ("GPTL_PAPIlibraryinit: PAPI_library_init failure:%s\n", + PAPI_strerror (ret)); + } + } + return 0; +} + +/* +** GPTL_PAPIinitialize(): Initialize the PAPI interface. Called from GPTLinitialize. +** PAPI_library_init must be called before any other PAPI routines. +** PAPI_thread_init is called subsequently if threading is enabled. +** Finally, allocate space for PAPI counters and start them. +** +** Input args: +** maxthreads: number of threads +** +** Return value: 0 (success) or GPTLerror or -1 (failure) +*/ + +int GPTL_PAPIinitialize (const int maxthreads, /* number of threads */ + const bool verbose_flag, /* output verbosity */ + int *nevents_out, /* nevents needed by gptl.c */ + Entry *pr_event_out) /* events needed by gptl.c */ +{ + int ret; /* return code */ + int n; /* loop index */ + int t; /* thread index */ + + verbose = verbose_flag; + + if (maxthreads < 1) + return GPTLerror ("GPTL_PAPIinitialize: maxthreads = %d\n", maxthreads); + + /* Ensure that PAPI_library_init has already been called */ + + if ((ret = GPTL_PAPIlibraryinit ()) < 0) + return GPTLerror ("GPTL_PAPIinitialize: GPTL_PAPIlibraryinit failure\n"); + + /* PAPI_thread_init needs to be called if threading enabled */ + +#if ( defined THREADED_OMP ) + if (PAPI_thread_init ((unsigned long (*)(void)) (omp_get_thread_num)) != PAPI_OK) + return GPTLerror ("GPTL_PAPIinitialize: PAPI_thread_init failure\n"); +#elif ( defined THREADED_PTHREADS ) + if (PAPI_thread_init ((unsigned long (*)(void)) (pthread_self)) != PAPI_OK) + return GPTLerror ("GPTL_PAPIinitialize: PAPI_thread_init failure\n"); +#endif + + /* allocate and initialize static local space */ + + EventSet = (int *) GPTLallocate (maxthreads * sizeof (int)); + papicounters = (long_long **) GPTLallocate (maxthreads * sizeof (long_long *)); + + for (t = 0; t < maxthreads; t++) { + EventSet[t] = PAPI_NULL; + papicounters[t] = (long_long *) GPTLallocate (MAX_AUX * sizeof (long_long)); + } + + *nevents_out = nevents; + for (n = 0; n < nevents; ++n) { + pr_event_out[n].counter = pr_event[n].event.counter; + pr_event_out[n].namestr = pr_event[n].event.namestr; + pr_event_out[n].str8 = pr_event[n].event.str8; + pr_event_out[n].str16 = pr_event[n].event.str16; + pr_event_out[n].longstr = pr_event[n].event.longstr; + } + return 0; +} + +/* +** GPTLcreate_and_start_events: Create and start the PAPI eventset. +** Threaded routine to create the "event set" (PAPI terminology) and start +** the counters. This is only done once, and is called from get_thread_num +** for the first time for the thread. +** +** Input args: +** t: thread number +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLcreate_and_start_events (const int t) /* thread number */ +{ + int ret; /* return code */ + int n; /* loop index over events */ + char eventname[PAPI_MAX_STR_LEN]; /* returned from PAPI_event_code_to_name */ + + /* Create the event set */ + + if ((ret = PAPI_create_eventset (&EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: thread %d failure creating eventset: %s\n", + t, PAPI_strerror (ret)); + + if (verbose) + printf ("GPTLcreate_and_start_events: successfully created eventset for thread %d\n", t); + + /* Add requested events to the event set */ + + for (n = 0; n < npapievents; n++) { + if ((ret = PAPI_add_event (EventSet[t], papieventlist[n])) != PAPI_OK) { + if (verbose) { + fprintf (stderr, "%s\n", PAPI_strerror (ret)); + ret = PAPI_event_code_to_name (papieventlist[n], eventname); + fprintf (stderr, "GPTLcreate_and_start_events: failure adding event:%s\n", + eventname); + } + + if (enable_multiplexing) { + if (verbose) + printf ("Trying multiplexing...\n"); + is_multiplexed = true; + break; + } else + return GPTLerror ("enable_multiplexing is false: giving up\n"); + } + } + + if (is_multiplexed) { + + /* Cleanup the eventset for multiplexing */ + + if ((ret = PAPI_cleanup_eventset (EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: %s\n", PAPI_strerror (ret)); + + if ((ret = PAPI_destroy_eventset (&EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: %s\n", PAPI_strerror (ret)); + + if ((ret = PAPI_create_eventset (&EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: failure creating eventset: %s\n", + PAPI_strerror (ret)); + + if ((ret = PAPI_multiplex_init ()) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: failure from PAPI_multiplex_init%s\n", + PAPI_strerror (ret)); + + if ((ret = PAPI_set_multiplex (EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: failure from PAPI_set_multiplex: %s\n", + PAPI_strerror (ret)); + + for (n = 0; n < npapievents; n++) { + if ((ret = PAPI_add_event (EventSet[t], papieventlist[n])) != PAPI_OK) { + ret = PAPI_event_code_to_name (papieventlist[n], eventname); + return GPTLerror ("GPTLcreate_and_start_events: failure adding event:%s\n" + " Error was: %s\n", eventname, PAPI_strerror (ret)); + } + } + } + + /* Start the event set. It will only be read from now on--never stopped */ + + if ((ret = PAPI_start (EventSet[t])) != PAPI_OK) + return GPTLerror ("GPTLcreate_and_start_events: failed to start event set: %s\n", + PAPI_strerror (ret)); + + return 0; +} + +/* +** GPTL_PAPIstart: Start the PAPI counters (actually they are just read). +** Called from GPTLstart. +** +** Input args: +** t: thread number +** +** Output args: +** aux: struct containing the counters +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTL_PAPIstart (const int t, /* thread number */ + Papistats *aux) /* struct containing PAPI stats */ +{ + int ret; /* return code from PAPI lib calls */ + int n; /* loop index */ + + /* If no events are to be counted just return */ + + if (npapievents == 0) + return 0; + + /* Read the counters */ + + if ((ret = PAPI_read (EventSet[t], papicounters[t])) != PAPI_OK) + return GPTLerror ("GPTL_PAPIstart: %s\n", PAPI_strerror (ret)); + + /* + ** Store the counter values. When GPTL_PAPIstop is called, the counters + ** will again be read, and differenced with the values saved here. + */ + + for (n = 0; n < npapievents; n++) + aux->last[n] = papicounters[t][n]; + + return 0; +} + +/* +** GPTL_PAPIstop: Stop the PAPI counters (actually they are just read). +** Called from GPTLstop. +** +** Input args: +** t: thread number +** +** Input/output args: +** aux: struct containing the counters +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTL_PAPIstop (const int t, /* thread number */ + Papistats *aux) /* struct containing PAPI stats */ +{ + int ret; /* return code from PAPI lib calls */ + int n; /* loop index */ + long_long delta; /* change in counters from previous read */ + + /* If no events are to be counted just return */ + + if (npapievents == 0) + return 0; + + /* Read the counters */ + + if ((ret = PAPI_read (EventSet[t], papicounters[t])) != PAPI_OK) + return GPTLerror ("GPTL_PAPIstop: %s\n", PAPI_strerror (ret)); + + /* + ** Accumulate the difference since timer start in aux. + ** Negative accumulation can happen when multiplexing is enabled, so don't + ** set count to BADCOUNT in that case. + */ + + for (n = 0; n < npapievents; n++) { +#ifdef DEBUG + printf ("GPTL_PAPIstop: event %d counter value is %ld\n", n, (long) papicounters[t][n]); +#endif + delta = papicounters[t][n] - aux->last[n]; + if ( ! is_multiplexed && delta < 0) + aux->accum[n] = BADCOUNT; + else + aux->accum[n] += delta; + } + return 0; +} + +/* +** GPTL_PAPIprstr: Print the descriptive string for all enabled PAPI events. +** Called from GPTLpr. +** +** Input args: +** fp: file descriptor +*/ + +void GPTL_PAPIprstr (FILE *fp) +{ + int n; + + if (narrowprint) { + for (n = 0; n < nevents; n++) { + fprintf (fp, "%8.8s ", pr_event[n].event.str8); + + /* Test on < 0 says it's a PAPI preset */ + + if (persec && pr_event[n].event.counter < 0) + fprintf (fp, "e6_/_sec "); + } + } else { + for (n = 0; n < nevents; n++) { + fprintf (fp, "%16.16s ", pr_event[n].event.str16); + + /* Test on < 0 says it's a PAPI preset */ + + if (persec && pr_event[n].event.counter < 0) + fprintf (fp, "e6_/_sec "); + } + } +} + +/* +** GPTL_PAPIpr: Print PAPI counter values for all enabled events, including +** derived events. Called from GPTLpr. +** +** Input args: +** fp: file descriptor +** aux: struct containing the counters +*/ + +void GPTL_PAPIpr (FILE *fp, /* file descriptor to write to */ + const Papistats *aux, /* stats to write */ + const int t, /* thread number */ + const int count, /* number of invocations */ + const double wcsec) /* wallclock time (sec) */ +{ + const char *shortintfmt = "%8ld "; + const char *longintfmt = "%16ld "; + const char *shortfloatfmt = "%8.2e "; + const char *longfloatfmt = "%16.10e "; + const char *intfmt; /* integer format */ + const char *floatfmt; /* floating point format */ + + int n; /* loop index */ + int numidx; /* index pointer to appropriated (derived) numerator */ + int denomidx; /* index pointer to appropriated (derived) denominator */ + double val; /* value to be printed */ + + intfmt = narrowprint ? shortintfmt : longintfmt; + floatfmt = narrowprint ? shortfloatfmt : longfloatfmt; + + for (n = 0; n < nevents; n++) { + numidx = pr_event[n].numidx; + if (pr_event[n].denomidx > -1) { /* derived event */ + denomidx = pr_event[n].denomidx; + +#ifdef DEBUG + printf ("GPTL_PAPIpr: derived event: numidx=%d denomidx=%d values = %ld %ld\n", + numidx, denomidx, (long) aux->accum[numidx], (long) aux->accum[denomidx]); +#endif + /* Protect against divide by zero */ + + if (aux->accum[denomidx] > 0) + val = (double) aux->accum[numidx] / (double) aux->accum[denomidx]; + else + val = 0.; + fprintf (fp, floatfmt, val); + + } else { /* Raw PAPI event */ + +#ifdef DEBUG + printf ("GPTL_PAPIpr: raw event: numidx=%d value = %ld\n", + numidx, (long) aux->accum[numidx]); +#endif + if (aux->accum[numidx] < PRTHRESH) + fprintf (fp, intfmt, (long) aux->accum[numidx]); + else + fprintf (fp, floatfmt, (double) aux->accum[numidx]); + + if (persec) { + if (wcsec > 0.) + fprintf (fp, "%8.2f ", aux->accum[numidx] * 1.e-6 / wcsec); + else + fprintf (fp, "%8.2f ", 0.); + } + } + } +} + +/* +** GPTL_PAPIprintenabled: Print list of enabled timers +** +** Input args: +** fp: file descriptor +*/ + +void GPTL_PAPIprintenabled (FILE *fp) +{ + int n, nn; + PAPI_event_info_t info; /* returned from PAPI_get_event_info */ + char eventname[PAPI_MAX_STR_LEN]; /* returned from PAPI_event_code_to_name */ + + if (nevents > 0) { + fprintf (fp, "Description of printed events (PAPI and derived):\n"); + for (n = 0; n < nevents; n++) { + if (strncmp (pr_event[n].event.namestr, "GPTL", 4) == 0) { + fprintf (fp, " %s: %s\n", pr_event[n].event.namestr, pr_event[n].event.longstr); + } else { + nn = pr_event[n].event.counter; + if (PAPI_get_event_info (nn, &info) == PAPI_OK) { + fprintf (fp, " %s\n", info.short_descr); + fprintf (fp, " %s\n", info.note); + } + } + } + fprintf (fp, "\n"); + + fprintf (fp, "PAPI events enabled (including those required for derived events):\n"); + for (n = 0; n < npapievents; n++) + if (PAPI_event_code_to_name (papieventlist[n], eventname) == PAPI_OK) + fprintf (fp, " %s\n", eventname); + fprintf (fp, "\n"); + } +} + +/* +** GPTL_PAPIadd: Accumulate PAPI counters. Called from add. +** +** Input/Output args: +** auxout: auxout = auxout + auxin +** +** Input args: +** auxin: counters to be summed into auxout +*/ + +void GPTL_PAPIadd (Papistats *auxout, /* output struct */ + const Papistats *auxin) /* input struct */ +{ + int n; + + for (n = 0; n < npapievents; n++) + if (auxin->accum[n] == BADCOUNT || auxout->accum[n] == BADCOUNT) + auxout->accum[n] = BADCOUNT; + else + auxout->accum[n] += auxin->accum[n]; +} + +/* +** GPTL_PAPIfinalize: finalization routine must be called from single-threaded +** region. Free all malloc'd space +*/ + +void GPTL_PAPIfinalize (int maxthreads) +{ + int t; /* thread index */ + int ret; /* return code */ + + for (t = 0; t < maxthreads; t++) { + ret = PAPI_stop (EventSet[t], papicounters[t]); + free (papicounters[t]); + ret = PAPI_cleanup_eventset (EventSet[t]); + ret = PAPI_destroy_eventset (&EventSet[t]); + } + + free (EventSet); + free (papicounters); + + /* Reset initial values */ + + npapievents = 0; + nevents = 0; + is_multiplexed = false; + narrowprint = true; + persec = true; + enable_multiplexing = false; + verbose = false; +} + +/* +** GPTL_PAPIquery: return current PAPI counter info. Return into a long for best +** compatibility possibilities with Fortran. +** +** Input args: +** aux: struct containing the counters +** ncounters: max number of counters to return +** +** Output args: +** papicounters_out: current value of PAPI counters +*/ + +void GPTL_PAPIquery (const Papistats *aux, + long long *papicounters_out, + int ncounters) +{ + int n; + + if (ncounters > 0) { + for (n = 0; n < ncounters && n < npapievents; n++) { + papicounters_out[n] = (long long) aux->accum[n]; + } + } +} + +/* +** GPTL_PAPIget_eventvalue: return current value for an enabled event. +** +** Input args: +** eventname: event name to check (whether derived or raw PAPI counter) +** aux: struct containing the counter(s) for the event +** +** Output args: +** value: current value of the event +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTL_PAPIget_eventvalue (const char *eventname, + const Papistats *aux, + double *value) +{ + int n; /* loop index through enabled events */ + int numidx; /* numerator index into papicounters */ + int denomidx; /* denominator index into papicounters */ + + for (n = 0; n < nevents; ++n) { + if (STRMATCH (eventname, pr_event[n].event.namestr)) { + numidx = pr_event[n].numidx; + if (pr_event[n].denomidx > -1) { /* derived event */ + denomidx = pr_event[n].denomidx; + if (aux->accum[denomidx] > 0) /* protect against divide by zero */ + *value = (double) aux->accum[numidx] / (double) aux->accum[denomidx]; + else + *value = 0.; + } else { /* Raw PAPI event */ + *value = (double) aux->accum[numidx]; + } + break; + } + } + if (n == nevents) + return GPTLerror ("GPTL_PAPIget_eventvalue: event %s not enabled\n", eventname); + return 0; +} + +/* +** GPTL_PAPIis_multiplexed: return status of whether events are being multiplexed +*/ + +bool GPTL_PAPIis_multiplexed () +{ + return is_multiplexed; +} + +/* +** The following functions are publicly available +*/ + +void read_counters100 () +{ + int i; + int ret; + long_long counters[MAX_AUX]; + + for (i = 0; i < 10; ++i) { + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + ret = PAPI_read (EventSet[0], counters); + } + return; +} + +/* +** GPTLevent_name_to_code: convert a string to a PAPI code +** or derived event code. +** +** Input arguments: +** arg: string to convert +** +** Output arguments: +** code: PAPI or GPTL derived code +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLevent_name_to_code (const char *name, int *code) +{ + int ret; /* return code */ + int n; /* loop over derived entries */ + + /* + ** First check derived events + */ + + for (n = 0; n < nderivedentries; ++n) { + if (STRMATCH (name, derivedtable[n].namestr)) { + *code = derivedtable[n].counter; + return 0; + } + } + + /* + ** Next check PAPI events--note that PAPI must be initialized before the + ** name_to_code function can be invoked. + */ + + if ((ret = GPTL_PAPIlibraryinit ()) < 0) + return GPTLerror ("GPTL_event_name_to_code: GPTL_PAPIlibraryinit failure\n"); + + if ((PAPI_event_name_to_code ((char *) name, code)) != PAPI_OK) + return GPTLerror ("GPTL_event_name_to_code: PAPI_event_name_to_code failure\n"); + + return 0; +} + +/* +** GPTLevent_code_to_name: convert a string to a PAPI code +** or derived event code. +** +** Input arguments: +** code: event code (PAPI or derived) +** +** Output arguments: +** name: string corresponding to code +** +** Return value: 0 (success) or GPTLerror (failure) +*/ + +int GPTLevent_code_to_name (const int code, char *name) +{ + int ret; /* return code */ + int n; /* loop over derived entries */ + + /* + ** First check derived events + */ + + for (n = 0; n < nderivedentries; ++n) { + if (code == derivedtable[n].counter) { + strcpy (name, derivedtable[n].namestr); + return 0; + } + } + + /* + ** Next check PAPI events--note that PAPI must be initialized before the + ** code_to_name function can be invoked. + */ + + if ((ret = GPTL_PAPIlibraryinit ()) < 0) + return GPTLerror ("GPTL_event_code_to_name: GPTL_PAPIlibraryinit failure\n"); + + if (PAPI_event_code_to_name (code, name) != PAPI_OK) + return GPTLerror ("GPTL_event_code_to_name: PAPI_event_code_to_name failure\n"); + + return 0; +} + +int GPTLget_npapievents (void) +{ + return npapievents; +} + +#else + +/* +** HAVE_PAPI not defined branch: "Should not be called" entry points for public routines +*/ + +int GPTL_PAPIlibraryinit () +{ + return GPTLerror ("GPTL_PAPIlibraryinit: PAPI not enabled\n"); +} + +int GPTLevent_name_to_code (const char *name, int *code) +{ + return GPTLerror ("GPTLevent_name_to_code: PAPI not enabled\n"); +} + +int GPTLevent_code_to_name (int code, char *name) +{ + return GPTLerror ("GPTLevent_code_to_name: PAPI not enabled\n"); +} + +#endif /* HAVE_PAPI */ diff --git a/share/timing/perf_mod.F90 b/share/timing/perf_mod.F90 new file mode 100644 index 0000000..b8f9b50 --- /dev/null +++ b/share/timing/perf_mod.F90 @@ -0,0 +1,1770 @@ +module perf_mod + +!----------------------------------------------------------------------- +! +! Purpose: This module is responsible for controlling the performance +! timer logic. +! +! Author: P. Worley, January 2007 +! +! $Id$ +! +!----------------------------------------------------------------------- + +!----------------------------------------------------------------------- +!- Uses ---------------------------------------------------------------- +!----------------------------------------------------------------------- +#ifdef NUOPC_INTERFACE +#define TIMERSTART call ESMF_TraceRegionEnter +#define TIMERSTOP call ESMF_TraceRegionExit + use ESMF, only: ESMF_TraceRegionEnter, ESMF_TraceRegionExit +#else +#define TIMERSTART ierr = GPTLstart +#define TIMERSTOP ierr = GPTLstop +#endif + +#ifndef USE_CSM_SHARE + use perf_utils +#else + use shr_sys_mod, only: shr_sys_abort + use shr_kind_mod, only: SHR_KIND_CS, SHR_KIND_CM, SHR_KIND_CX, & + SHR_KIND_R8, SHR_KIND_I8 + use shr_mpi_mod, only: shr_mpi_barrier, shr_mpi_bcast + use shr_log_mod, only: shr_log_getUnit, shr_log_freeUnit + use namelist_utils, only: find_group_name +#endif + use mpi +#if ( defined _OPENMP ) + use omp_lib, only : omp_in_parallel +#endif +!!----------------------------------------------------------------------- +!- module boilerplate -------------------------------------------------- +!----------------------------------------------------------------------- + implicit none + private ! Make the default access private + save + +!----------------------------------------------------------------------- +! Public interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + public t_initf + public t_setLogUnit + public t_getLogUnit + public t_profile_onf + public t_barrier_onf + public t_single_filef + public t_set_prefixf + public t_unset_prefixf + public t_stampf + public t_startf + public t_stopf + public t_startstop_valsf + public t_enablef + public t_disablef + public t_adj_detailf + public t_barrierf + public t_prf + public t_finalizef + +!----------------------------------------------------------------------- +! Private interfaces (local) ------------------------------------------- +!----------------------------------------------------------------------- + private perf_defaultopts + private perf_setopts + private papi_defaultopts + private papi_setopts + +!----------------------------------------------------------------------- +!- include statements -------------------------------------------------- +!----------------------------------------------------------------------- +#include "gptl.inc" + +!----------------------------------------------------------------------- +! Private data --------------------------------------------------------- +!----------------------------------------------------------------------- + + !---------------------------------------------------------------------------- + ! perf_mod options + !---------------------------------------------------------------------------- + integer, parameter :: def_p_logunit = 6 ! default + integer, private :: p_logunit = def_p_logunit + ! unit number for log output + + logical, parameter :: def_timing_initialized = .false. ! default + logical, private :: timing_initialized = def_timing_initialized + ! flag indicating whether timing library has + ! been initialized + + logical, parameter :: def_timing_disable = .false. ! default + logical, private :: timing_disable = def_timing_disable + ! flag indicating whether timers are disabled + + logical, parameter :: def_timing_barrier = .false. ! default + logical, private :: timing_barrier = def_timing_barrier + ! flag indicating whether the mpi_barrier in + ! t_barrierf should be called + + integer, parameter :: def_timer_depth_limit = 99999 ! default + integer, private :: timer_depth_limit = def_timer_depth_limit + ! integer indicating maximum number of levels of + ! timer nesting + + integer, parameter :: def_timing_detail_limit = 1 ! default + integer, private :: timing_detail_limit = def_timing_detail_limit + ! integer indicating maximum detail level to + ! profile + + integer, parameter :: init_timing_disable_depth = 0 ! init + integer, private :: timing_disable_depth = init_timing_disable_depth + ! integer indicating depth of t_disablef calls + + integer, parameter :: init_timing_detail = 0 ! init + integer, private :: cur_timing_detail = init_timing_detail + ! current timing detail level +#ifdef NUOPC_INTERFACE + integer, private :: cur_timing_depth = 0 +#endif + + integer, parameter :: init_num_threads = 1 ! init + integer, private :: num_threads = init_num_threads + ! current maximum number of threads per process + + logical, parameter :: def_perf_single_file = .false. ! default + logical, private :: perf_single_file = def_perf_single_file + ! flag indicating whether the performance timer + ! output should be written to a single file + ! (per component communicator) or to a + ! separate file for each process + + integer, parameter :: def_perf_outpe_num = 0 ! default + integer, private :: perf_outpe_num = def_perf_outpe_num + ! maximum number of processes writing out + ! timing data (for this component communicator) + + integer, parameter :: def_perf_outpe_stride = 1 ! default + integer, private :: perf_outpe_stride = def_perf_outpe_stride + ! separation between process ids for processes + ! that are writing out timing data + ! (for this component communicator) + + logical, parameter :: def_perf_global_stats = .true. ! default + logical, private :: perf_global_stats = def_perf_global_stats + ! collect and print out global performance statistics + ! (for this component communicator) + + logical, parameter :: def_perf_ovhd_measurement = .false. ! default + logical, private :: perf_ovhd_measurement = def_perf_ovhd_measurement + ! measure overhead of profiling directly + + real(shr_kind_r8), private :: perf_timing_ovhd = 0.0 ! start/stop overhead + + logical, parameter :: def_perf_add_detail = .false. ! default + logical, private :: perf_add_detail = def_perf_add_detail + ! flag indicating whether to add the current + ! detail level as a suffix to the timer name. + ! This requires that even t_startf/t_stopf + ! calls do not cross detail level changes +#ifdef HAVE_MPI + integer, parameter :: def_perf_timer = GPTLmpiwtime ! default +#else +#ifdef HAVE_NANOTIME + integer, parameter :: def_perf_timer = GPTLnanotime ! default +#else +#ifdef CPRIBM + integer,parameter :: def_perf_timer = GPTLread_real_time +#else + integer,parameter :: def_perf_timer = GPTLgettimeofday +#endif +#endif +#endif + + + integer, private :: perf_timer = def_perf_timer ! default + ! integer indicating which timer to use + ! (as defined in gptl.inc) + +#ifdef HAVE_PAPI + logical, parameter :: def_perf_papi_enable = .false. ! default +#else + logical, parameter :: def_perf_papi_enable = .false. ! default +#endif + logical, private :: perf_papi_enable = def_perf_papi_enable + ! flag indicating whether the PAPI namelist + ! should be read and HW performance counters + ! used in profiling + + ! PAPI counter ids + integer, parameter :: PAPI_NULL = -1 + + integer, parameter :: def_papi_ctr1 = PAPI_NULL ! default + integer, private :: papi_ctr1 = def_papi_ctr1 + + integer, parameter :: def_papi_ctr2 = PAPI_NULL ! default + integer, private :: papi_ctr2 = def_papi_ctr2 + + integer, parameter :: def_papi_ctr3 = PAPI_NULL ! default + integer, private :: papi_ctr3 = def_papi_ctr3 + + integer, parameter :: def_papi_ctr4 = PAPI_NULL ! default + integer, private :: papi_ctr4 = def_papi_ctr4 + +!======================================================================= +contains +!======================================================================= + +! +!======================================================================== +! + subroutine t_getLogUnit(LogUnit) +!----------------------------------------------------------------------- +! Purpose: Get log unit number. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + integer(SHR_KIND_IN), intent(OUT) :: LogUnit ! Unit number for log output +!----------------------------------------------------------------------- + + LogUnit = p_logunit + + return + end subroutine t_getLogUnit +! +!======================================================================== +! + subroutine t_setLogUnit(LogUnit) +!----------------------------------------------------------------------- +! Purpose: Set log unit number. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + integer(SHR_KIND_IN), intent(IN) :: LogUnit ! Unit number for log output +!----------------------------------------------------------------------- + + p_logunit = LogUnit +#ifndef USE_CSM_SHARE + call perfutils_setunit(p_logunit) +#endif + + return + end subroutine t_setLogUnit +! +!======================================================================== +! + subroutine perf_defaultopts(timing_disable_out, & + perf_timer_out, & + timer_depth_limit_out, & + timing_detail_limit_out, & + timing_barrier_out, & + perf_outpe_num_out, & + perf_outpe_stride_out, & + perf_single_file_out, & + perf_global_stats_out, & + perf_papi_enable_out, & + perf_ovhd_measurement_out, & + perf_add_detail_out ) +!----------------------------------------------------------------------- +! Purpose: Return default runtime options +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- + ! timers disable/enable option + logical, intent(out), optional :: timing_disable_out + ! performance timer option + integer, intent(out), optional :: perf_timer_out + ! timer depth limit option + integer, intent(out), optional :: timer_depth_limit_out + ! timer detail limit option + integer, intent(out), optional :: timing_detail_limit_out + ! timing barrier enable/disable option + logical, intent(out), optional :: timing_barrier_out + ! number of processes writing out timing data + integer, intent(out), optional :: perf_outpe_num_out + ! separation between process ids for processes that are writing out timing data + integer, intent(out), optional :: perf_outpe_stride_out + ! timing single / multple output file option + logical, intent(out), optional :: perf_single_file_out + ! collect and output global performance statistics option + logical, intent(out), optional :: perf_global_stats_out + ! calling PAPI to read HW performance counters option + logical, intent(out), optional :: perf_papi_enable_out + ! measure overhead of profiling directly + logical, intent(out), optional :: perf_ovhd_measurement_out + ! 'suffix' timer name with current detail level + logical, intent(out), optional :: perf_add_detail_out +!----------------------------------------------------------------------- + if ( present(timing_disable_out) ) then + timing_disable_out = def_timing_disable + endif + if ( present(perf_timer_out) ) then + perf_timer_out = def_perf_timer + endif + if ( present(timer_depth_limit_out) ) then + timer_depth_limit_out = def_timer_depth_limit + endif + if ( present(timing_detail_limit_out) ) then + timing_detail_limit_out = def_timing_detail_limit + endif + if ( present(timing_barrier_out) ) then + timing_barrier_out = def_timing_barrier + endif + if ( present(perf_outpe_num_out) ) then + perf_outpe_num_out = def_perf_outpe_num + endif + if ( present(perf_outpe_stride_out) ) then + perf_outpe_stride_out = def_perf_outpe_stride + endif + if ( present(perf_single_file_out) ) then + perf_single_file_out = def_perf_single_file + endif + if ( present(perf_global_stats_out) ) then + perf_global_stats_out = def_perf_global_stats + endif + if ( present(perf_papi_enable_out) ) then + perf_papi_enable_out = def_perf_papi_enable + endif + if ( present(perf_ovhd_measurement_out) ) then + perf_ovhd_measurement_out = def_perf_ovhd_measurement + endif + if ( present(perf_add_detail_out) ) then + perf_add_detail_out = def_perf_add_detail + endif +! + return + end subroutine perf_defaultopts +! +!======================================================================== +! + subroutine perf_setopts(mastertask, & + LogPrint, & + timing_disable_in, & + perf_timer_in, & + timer_depth_limit_in, & + timing_detail_limit_in, & + timing_barrier_in, & + perf_outpe_num_in, & + perf_outpe_stride_in, & + perf_single_file_in, & + perf_global_stats_in, & + perf_papi_enable_in, & + perf_ovhd_measurement_in, & + perf_add_detail_in ) +!----------------------------------------------------------------------- +! Purpose: Set runtime options +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments---------------------------- +! + ! master process? + logical, intent(in) :: mastertask + ! Print out to log file? + logical, intent(IN) :: LogPrint + ! timers disable/enable option + logical, intent(in), optional :: timing_disable_in + ! performance timer option + integer, intent(in), optional :: perf_timer_in + ! timer depth limit option + integer, intent(in), optional :: timer_depth_limit_in + ! timer detail limit option + integer, intent(in), optional :: timing_detail_limit_in + ! timing barrier enable/disable option + logical, intent(in), optional :: timing_barrier_in + ! number of processes writing out timing data + integer, intent(in), optional :: perf_outpe_num_in + ! separation between process ids for processes that are writing out timing data + integer, intent(in), optional :: perf_outpe_stride_in + ! timing single / multple output file option + logical, intent(in), optional :: perf_single_file_in + ! collect and output global performance statistics option + logical, intent(in), optional :: perf_global_stats_in + ! calling PAPI to read HW performance counters option + logical, intent(in), optional :: perf_papi_enable_in + ! measure overhead of profiling directly + logical, intent(in), optional :: perf_ovhd_measurement_in + ! 'suffix' timer name with current detail level + logical, intent(in), optional :: perf_add_detail_in +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! error return +!----------------------------------------------------------------------- + if ( .not. timing_initialized ) then + + if ( present(timing_disable_in) ) then + timing_disable = timing_disable_in + if (timing_disable) then + ierr = GPTLdisable() + else + ierr = GPTLenable() + endif + endif + if ( present(perf_timer_in) ) then + if ((perf_timer_in .eq. GPTLgettimeofday) .or. & + (perf_timer_in .eq. GPTLnanotime) .or. & + (perf_timer_in .eq. GPTLread_real_time) .or. & + (perf_timer_in .eq. GPTLmpiwtime) .or. & + (perf_timer_in .eq. GPTLclockgettime) .or. & + (perf_timer_in .eq. GPTLpapitime)) then + perf_timer = perf_timer_in + else + if (mastertask) then + write(p_logunit,*) 'PERF_SETOPTS: illegal timer requested=',& + perf_timer_in, '. Request ignored.' + endif + endif + endif + if ( present(timer_depth_limit_in) ) then + timer_depth_limit = timer_depth_limit_in + endif + if ( present(timing_detail_limit_in) ) then + timing_detail_limit = timing_detail_limit_in + endif + if ( present(timing_barrier_in) ) then + timing_barrier = timing_barrier_in + endif + if ( present(perf_outpe_num_in) ) then + perf_outpe_num = perf_outpe_num_in + endif + if ( present(perf_outpe_stride_in) ) then + perf_outpe_stride = perf_outpe_stride_in + endif + if ( present(perf_single_file_in) ) then + perf_single_file = perf_single_file_in + endif + if ( present(perf_global_stats_in) ) then + perf_global_stats = perf_global_stats_in + endif + if ( present(perf_papi_enable_in) ) then +#ifdef HAVE_PAPI + perf_papi_enable = perf_papi_enable_in +#else + if (perf_papi_enable_in) then + if (mastertask) then + write(p_logunit,*) 'PERF_SETOPTS: PAPI library not linked in. ',& + 'Request to enable PAPI ignored.' + endif + endif + perf_papi_enable = .false. +#endif + endif + if ( present(perf_ovhd_measurement_in) ) then + perf_ovhd_measurement = perf_ovhd_measurement_in + endif + if ( present(perf_add_detail_in) ) then + perf_add_detail = perf_add_detail_in + endif +! + if (mastertask .and. LogPrint) then + write(p_logunit,*) '(t_initf) Using profile_disable= ', timing_disable + write(p_logunit,*) '(t_initf) profile_timer= ', perf_timer + write(p_logunit,*) '(t_initf) profile_depth_limit= ', timer_depth_limit + write(p_logunit,*) '(t_initf) profile_detail_limit= ', timing_detail_limit + write(p_logunit,*) '(t_initf) profile_barrier= ', timing_barrier + write(p_logunit,*) '(t_initf) profile_outpe_num= ', perf_outpe_num + write(p_logunit,*) '(t_initf) profile_outpe_stride= ', perf_outpe_stride + write(p_logunit,*) '(t_initf) profile_single_file= ', perf_single_file + write(p_logunit,*) '(t_initf) profile_global_stats= ', perf_global_stats + write(p_logunit,*) '(t_initf) profile_ovhd_measurement=', perf_ovhd_measurement + write(p_logunit,*) '(t_initf) profile_add_detail= ', perf_add_detail + write(p_logunit,*) '(t_initf) profile_papi_enable= ', perf_papi_enable + endif +! +#ifdef DEBUG + else + write(p_logunit,*) 'PERF_SETOPTS: timing library already initialized. Request ignored.' +#endif + endif +! + return + end subroutine perf_setopts + +! +!======================================================================== +! + subroutine papi_defaultopts(papi_ctr1_out, & + papi_ctr2_out, & + papi_ctr3_out, & + papi_ctr4_out ) +!----------------------------------------------------------------------- +! Purpose: Return default runtime PAPI counter options +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- + ! PAPI counter option #1 + integer, intent(out), optional :: papi_ctr1_out + ! PAPI counter option #2 + integer, intent(out), optional :: papi_ctr2_out + ! PAPI counter option #3 + integer, intent(out), optional :: papi_ctr3_out + ! PAPI counter option #4 + integer, intent(out), optional :: papi_ctr4_out +!----------------------------------------------------------------------- + if ( present(papi_ctr1_out) ) then + papi_ctr1_out = def_papi_ctr1 + endif + if ( present(papi_ctr2_out) ) then + papi_ctr2_out = def_papi_ctr2 + endif + if ( present(papi_ctr3_out) ) then + papi_ctr3_out = def_papi_ctr3 + endif + if ( present(papi_ctr4_out) ) then + papi_ctr4_out = def_papi_ctr4 + endif +! + return + end subroutine papi_defaultopts +! +!======================================================================== +! + subroutine papi_setopts(papi_ctr1_in, & + papi_ctr2_in, & + papi_ctr3_in, & + papi_ctr4_in ) +!----------------------------------------------------------------------- +! Purpose: Set runtime PAPI counter options +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments---------------------------- +! + ! performance counter option + integer, intent(in), optional :: papi_ctr1_in + ! performance counter option + integer, intent(in), optional :: papi_ctr2_in + ! performance counter option + integer, intent(in), optional :: papi_ctr3_in + ! performance counter option + integer, intent(in), optional :: papi_ctr4_in +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! error return +!----------------------------------------------------------------------- + if ( .not. timing_initialized ) then + + if ( present(papi_ctr1_in) ) then + papi_ctr1 = papi_ctr1_in + endif + if ( present(papi_ctr2_in) ) then + papi_ctr2 = papi_ctr2_in + endif + if ( present(papi_ctr3_in) ) then + papi_ctr3 = papi_ctr3_in + endif + if ( present(papi_ctr4_in) ) then + papi_ctr4 = papi_ctr4_in + endif +! +#ifdef DEBUG + else + write(p_logunit,*) 'PAPI_SETOPTS: timing library already initialized. Request ignored.' +#endif + endif +! + return + end subroutine papi_setopts +! +!======================================================================== +! + logical function t_profile_onf() +!----------------------------------------------------------------------- +! Purpose: Return flag indicating whether profiling is currently active. +! Part of workaround to implement FVbarrierclock before +! communicators exposed in Pilgrim. Does not check level of +! event nesting. +! Author: P. Worley +!----------------------------------------------------------------------- + + if ((.not. timing_initialized) .or. & + (timing_disable_depth > 0)) then + t_profile_onf = .false. + else + t_profile_onf = .true. + endif + + end function t_profile_onf +! +!======================================================================== +! + logical function t_barrier_onf() +!----------------------------------------------------------------------- +! Purpose: Return timing_barrier. Part of workaround to implement +! FVbarrierclock before communicators exposed in Pilgrim. +! Author: P. Worley +!----------------------------------------------------------------------- + + t_barrier_onf = timing_barrier + + end function t_barrier_onf +! +!======================================================================== +! + logical function t_single_filef() +!----------------------------------------------------------------------- +! Purpose: Return perf_single_file. Used to control output of other +! performance data, only spmdstats currently. +! Author: P. Worley +!----------------------------------------------------------------------- + + t_single_filef = perf_single_file + + end function t_single_filef +! +!======================================================================== +! + subroutine t_set_prefixf(prefix_string) +!----------------------------------------------------------------------- +! Purpose: Set prefix for subsequent time event names. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + ! performance timer event name prefix + character(len=*), intent(in) :: prefix_string +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return + integer i ! loop index +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + + ierr = GPTLprefix_set(trim(prefix_string)) + + end subroutine t_set_prefixf +! +!======================================================================== +! + subroutine t_unset_prefixf() +!----------------------------------------------------------------------- +! Purpose: Unset prefix for subsequent time event names. +! Ignored in threaded regions. +! Author: P. Worley +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + + ierr = GPTLprefix_unset() + + end subroutine t_unset_prefixf +! +!======================================================================== +! + subroutine t_stampf(wall, usr, sys) +!----------------------------------------------------------------------- +! Purpose: Record wallclock, user, and system times (seconds). +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Output arguments----------------------------- +! + real(shr_kind_r8), intent(out) :: wall ! wallclock time + real(shr_kind_r8), intent(out) :: usr ! user time + real(shr_kind_r8), intent(out) :: sys ! system time +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!----------------------------------------------------------------------- +! + if ((.not. timing_initialized) .or. & + (timing_disable_depth > 0)) then + wall = 0.0 + usr = 0.0 + sys = 0.0 + else + ierr = GPTLstamp(wall, usr, sys) + endif + + return + end subroutine t_stampf +! +!======================================================================== +! + subroutine t_startf(event, handle) +!----------------------------------------------------------------------- +! Purpose: Start an event timer +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + ! performance timer event name + character(len=*), intent(in) :: event +! +!---------------------------Input/Output arguments---------------------- +! + ! GPTL event handle + integer, optional :: handle +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return + integer str_length, i ! support for adding + ! detail suffix + character(len=2) cdetail ! char variable for detail + real(shr_kind_r8) ovhd_start, ovhd_stop, usr, sys + ! for overhead calculation +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return +#ifdef NUOPC_INTERFACE +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif + cur_timing_depth = cur_timing_depth + 1 + if(cur_timing_depth > timer_depth_limit) return +#ifdef DEBUG +! print *, 'start timer ',trim(event), cur_timing_depth, timer_depth_limit +#endif +#endif + +!$OMP MASTER + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_start = mpi_wtime() +#else + usr = 0.0 + sys = 0.0 + ierr = GPTLstamp(ovhd_start, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd - ovhd_start + endif +#ifndef NUOPC_INTERFACE +!$OMP END MASTER +#endif + if ((perf_add_detail) .AND. (cur_timing_detail < 100)) then + write(cdetail,'(i2.2)') cur_timing_detail + str_length = min(SHR_KIND_CM-3,len_trim(event)) + TIMERSTART(event(1:str_length)//'_'//cdetail) + else + str_length = min(SHR_KIND_CM,len_trim(event)) + TIMERSTART(event(1:str_length)) + endif +#ifndef NUOPC_INTERFACE +!$OMP MASTER +#endif + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_stop = mpi_wtime() +#else + ierr = GPTLstamp(ovhd_stop, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd + ovhd_stop + endif +!$OMP END MASTER + return + end subroutine t_startf +! +!======================================================================== +! + subroutine t_stopf(event, handle) +!----------------------------------------------------------------------- +! Purpose: Stop an event timer +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + ! performance timer event name + character(len=*), intent(in) :: event +! +!---------------------------Input/Output arguments---------------------- +! + ! GPTL event handle + integer, optional :: handle +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return + integer str_length, i ! support for adding + ! detail suffix + character(len=2) cdetail ! char variable for detail + real(shr_kind_r8) ovhd_start, ovhd_stop, usr, sys + ! for overhead calculation +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return +#ifdef NUOPC_INTERFACE +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif +#endif +!$OMP MASTER + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_start = mpi_wtime() +#else + usr = 0.0 + sys = 0.0 + ierr = GPTLstamp(ovhd_start, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd - ovhd_start + endif +#ifdef NUOPC_INTERFACE + cur_timing_depth = cur_timing_depth - 1 + if(cur_timing_depth < timer_depth_limit) then +#else +!$OMP END MASTER +#endif + if ((perf_add_detail) .AND. (cur_timing_detail < 100)) then + write(cdetail,'(i2.2)') cur_timing_detail + str_length = min(SHR_KIND_CM-3,len_trim(event)) + TIMERSTOP(event(1:str_length)//'_'//cdetail) + else + str_length = min(SHR_KIND_CM,len_trim(event)) + TIMERSTOP(event(1:str_length)) + endif +#ifndef NUOPC_INTERFACE +!$OMP MASTER +#endif + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_stop = mpi_wtime() +#else + ierr = GPTLstamp(ovhd_stop, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd + ovhd_stop + endif +#ifdef NUOPC_INTERFACE + endif +#endif +!$OMP END MASTER + return + end subroutine t_stopf +! +!======================================================================== +! + subroutine t_startstop_valsf(event, walltime, callcount, handle) +!----------------------------------------------------------------------- +! Purpose: Create/add walltime and call count to an event timer +! Author: P. Worley (based on J. Rosinski GPTL routine) +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + ! performance timer event name + character(len=*), intent(in) :: event + ! walltime (seconds) associated with this start/stop pair + ! If not set, default is 0.0 . If < 0.0, set to 0.0 . + real(shr_kind_r8), intent(in), optional :: walltime + ! call count associated with this start/stop pair + ! If not set, default is 1. If < 0, set to 0. + integer, intent(in), optional :: callcount +! +!---------------------------Input/Output arguments---------------------- +! + ! GPTL event handle + integer, optional :: handle +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return + + integer str_length, i ! support for adding + ! detail suffix + character(len=2) cdetail ! char variable for detail + integer callcnt ! call count increment + real(shr_kind_r8) wtime ! walltime increment (seconds) + real(shr_kind_r8) ovhd_start, ovhd_stop, usr, sys + ! for overhead calculation +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return + +!$OMP MASTER + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_start = mpi_wtime() +#else + usr = 0.0 + sys = 0.0 + ierr = GPTLstamp(ovhd_start, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd - ovhd_start + endif +!$OMP END MASTER + + wtime = 0.0_shr_kind_r8 + if ( present(walltime) ) then + if (walltime > 0.0) then + wtime = walltime + endif + endif + + callcnt = 1 + if ( present(callcount) ) then + if (callcount > 0) then + callcnt = callcount + else + callcnt = 0 + endif + endif + + if ((perf_add_detail) .AND. (cur_timing_detail < 100)) then + + write(cdetail,'(i2.2)') cur_timing_detail + str_length = min(SHR_KIND_CM-3,len_trim(event)) + ierr = GPTLstartstop_vals( & + event(1:str_length)//'_'//cdetail, wtime, callcnt) + + else + + str_length = min(SHR_KIND_CM,len_trim(event)) + ierr = GPTLstartstop_vals(trim(event), wtime, callcnt) + + endif + +!$OMP MASTER + if (perf_ovhd_measurement) then +#ifdef HAVE_MPI + ovhd_stop = mpi_wtime() +#else + ierr = GPTLstamp(ovhd_stop, usr, sys) +#endif + perf_timing_ovhd = perf_timing_ovhd + ovhd_stop + endif +!$OMP END MASTER + return + end subroutine t_startstop_valsf +! +!======================================================================== +! + subroutine t_enablef() +!----------------------------------------------------------------------- +! Purpose: Enable t_startf, t_stopf, t_stampf, and t_barrierf. Ignored +! in threaded regions. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!---------------------------Externals----------------------------------- +! + +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif + + if (timing_disable_depth > 0) then + if (timing_disable_depth .eq. 1) then + ierr = GPTLenable() + endif + timing_disable_depth = timing_disable_depth - 1 + endif + + return + end subroutine t_enablef +! +!======================================================================== +! + subroutine t_disablef() +!----------------------------------------------------------------------- +! Purpose: Disable t_startf, t_stopf, t_stampf, and t_barrierf. Ignored +! in threaded regions. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!---------------------------Externals----------------------------------- +! +#if ( defined _OPENMP ) + logical omp_in_parallel + external omp_in_parallel +#endif +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif + + if (timing_disable_depth .eq. 0) then + ierr = GPTLdisable() + endif + timing_disable_depth = timing_disable_depth + 1 + + return + end subroutine t_disablef +! +!======================================================================== +! + subroutine t_adj_detailf(detail_adjustment) +!----------------------------------------------------------------------- +! Purpose: Modify current detail level. Ignored in threaded regions. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + integer, intent(in) :: detail_adjustment ! user defined increase or + ! decrease in detail level +! +!---------------------------Externals----------------------------------- +! +#if ( defined _OPENMP ) + logical omp_in_parallel + external omp_in_parallel +#endif +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif + +! using disable/enable to implement timing_detail logic so also control +! direct GPTL calls (such as occur in Trilinos library) + if ((cur_timing_detail <= timing_detail_limit) .and. & + (cur_timing_detail + detail_adjustment > timing_detail_limit)) then + call t_disablef() + elseif ((cur_timing_detail > timing_detail_limit) .and. & + (cur_timing_detail + detail_adjustment <= timing_detail_limit)) then + call t_enablef() + endif + + cur_timing_detail = cur_timing_detail + detail_adjustment + + return + end subroutine t_adj_detailf +! +!======================================================================== +! + subroutine t_barrierf(event, mpicom) +!----------------------------------------------------------------------- +! Purpose: Call (and time) mpi_barrier. Ignored inside OpenMP +! threaded regions. Note that barrier executed even if +! event not recorded because of level of timer event nesting. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- + ! mpi communicator id + integer, intent(in), optional :: mpicom + ! performance timer event name + character(len=*), intent(in), optional :: event +! +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!---------------------------Externals----------------------------------- +! +#if ( defined _OPENMP ) + logical omp_in_parallel + external omp_in_parallel +#endif +! +!----------------------------------------------------------------------- +! + if (timing_barrier) then + +#if ( defined _OPENMP ) + if (omp_in_parallel()) return +#endif + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return + + if ( present (event) ) then + call t_startf(event) + endif + + if ( present (mpicom) ) then + call shr_mpi_barrier(mpicom, 'T_BARRIERF: bad mpi communicator') + else + call shr_mpi_barrier(MPI_COMM_WORLD, 'T_BARRIERF: bad mpi communicator') + endif + + if ( present (event) ) then + call t_stopf(event) + endif + + endif + + return + end subroutine t_barrierf +! +!======================================================================== +! + subroutine t_prf(filename, mpicom, num_outpe, stride_outpe, & + single_file, global_stats, output_thispe) +!----------------------------------------------------------------------- +! Purpose: Write out performance timer data +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + ! performance timer output file name + character(len=*), intent(in), optional :: filename + ! mpi communicator id + integer, intent(in), optional :: mpicom + ! maximum number of processes writing out timing data + integer, intent(in), optional :: num_outpe + ! separation between process ids for processes writing out data + integer, intent(in), optional :: stride_outpe + ! enable/disable the writing of data to a single file + logical, intent(in), optional :: single_file + ! enable/disable the collection of global statistics + logical, intent(in), optional :: global_stats + ! output timing data for this process + logical, intent(in), optional :: output_thispe +! +!---------------------------Local workspace----------------------------- +! + logical one_file ! flag indicting whether to write + ! all data to a single file + logical glb_stats ! flag indicting whether to compute + ! global statistics + logical pr_write ! flag indicating whether the current + ! GPTL output mode is write + logical write_data ! flag indicating whether this process + ! should output its timing data + integer i ! loop index + integer mpicom2 ! local copy of MPI communicator + integer me ! communicator local process id + integer npes ! local communicator group size + integer gme ! global process id + integer ierr ! MPI error return + integer outpe_num ! max number of processes writing out + ! timing data (excluding output_thispe) + integer outpe_stride ! separation between process ids for + ! processes writing out timing data + integer max_outpe ! max process id for processes + ! writing out timing data + integer signal ! send/recv variable for single + ! output file logic + integer str_length ! string length + integer unitn ! file unit number + integer cme_adj ! length of filename suffix + integer status (MPI_STATUS_SIZE) ! Status of message + character(len=7) cme ! string representation of process id + character(len=SHR_KIND_CX+14) fname ! timing output filename +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return +#ifdef NUOPC_INTERFACE + return +#endif + + call t_startf("t_prf") +!$OMP MASTER + call mpi_comm_rank(MPI_COMM_WORLD, gme, ierr) + if ( present(mpicom) ) then + mpicom2 = mpicom + call mpi_comm_size(mpicom2, npes, ierr) + if (ierr .eq. MPI_ERR_COMM) then + call shr_sys_abort('T_PRF: bad mpi communicator') + endif + call mpi_comm_rank(mpicom2, me, ierr) + else + call mpi_comm_size(MPI_COMM_WORLD, npes, ierr) + mpicom2 = MPI_COMM_WORLD + me = gme + endif + + do i=1,SHR_KIND_CX+14 + fname(i:i) = " " + enddo + + unitn = shr_log_getUnit() + + ! determine what the current output mode is (append or write) + if (GPTLprint_mode_query() == GPTLprint_write) then + pr_write = .true. + ierr = GPTLprint_mode_set(GPTLprint_append) + else + pr_write = .false. + endif + + ! Determine whether to write all data to a single fie + if (present(single_file)) then + one_file = single_file + else + one_file = perf_single_file + endif + + ! Determine whether to compute global statistics + if (present(global_stats)) then + glb_stats = global_stats + else + glb_stats = perf_global_stats + endif + + ! Determine which processes are writing out timing data + write_data = .false. + + if (present(num_outpe)) then + if (num_outpe < 0) then + outpe_num = npes + else + outpe_num = num_outpe + endif + else + if (perf_outpe_num < 0) then + outpe_num = npes + else + outpe_num = perf_outpe_num + endif + endif + + if (present(stride_outpe)) then + if (stride_outpe < 1) then + outpe_stride = 1 + else + outpe_stride = stride_outpe + endif + else + if (perf_outpe_stride < 1) then + outpe_stride = 1 + else + outpe_stride = perf_outpe_stride + endif + endif + + max_outpe = min(outpe_num*outpe_stride, npes) - 1 + + if ((mod(me, outpe_stride) .eq. 0) .and. (me .le. max_outpe)) & + write_data = .true. + + if (present(output_thispe)) then + write_data = output_thispe + endif + + ! If a single timing output file, take turns writing to it. + if (one_file) then + + if ( present(filename) ) then + str_length = min(SHR_KIND_CX,len_trim(filename)) + fname(1:str_length) = filename(1:str_length) + else + fname(1:10) = "timing_all" + endif + + signal = 0 + if (me .eq. 0) then + + if (glb_stats) then + open( unitn, file=trim(fname), status='UNKNOWN', access='SEQUENTIAL' ) + write( unitn, 100) npes + 100 format(/,"***** GLOBAL STATISTICS (",I6," MPI TASKS) *****",/) + close( unitn ) + + ierr = GPTLpr_summary_file(mpicom2, trim(fname)) + endif + + if (write_data) then + if (glb_stats) then + open( unitn, file=trim(fname), status='OLD', access='SEQUENTIAL', position='APPEND' ) + else + open( unitn, file=trim(fname), status='UNKNOWN', access='SEQUENTIAL' ) + endif + + if (perf_ovhd_measurement) then + write( unitn, 101) me, gme + 101 format(/,"************ PROCESS ",I6," (",I6,") ************") + write( unitn, 102) perf_timing_ovhd + 102 format("** TIMING OVERHEAD ",E20.10," SECONDS *",/) + else + write( unitn, 103) me, gme + 103 format(/,"************ PROCESS ",I6," (",I6,") ************",/) + endif + + close( unitn ) + + ierr = GPTLpr_file(trim(fname)) + endif + + else + + if (glb_stats) then + ierr = GPTLpr_summary_file(mpicom2, trim(fname)) + endif + + call mpi_recv (signal, 1, mpi_integer, me-1, me-1, mpicom2, status, ierr) + if (ierr /= mpi_success) then + write(p_logunit,*) 'T_PRF: mpi_recv failed ierr=',ierr + call shr_sys_abort() + end if + + if (write_data) then + open( unitn, file=trim(fname), status='OLD', access='SEQUENTIAL', position='APPEND' ) + if (perf_ovhd_measurement) then + write( unitn, 101) me, gme + write( unitn, 102) perf_timing_ovhd + else + write( unitn, 103) me, gme + endif + close( unitn ) + + ierr = GPTLpr_file(trim(fname)) + endif + + endif + + if (me+1 < npes) & + call mpi_send (signal, 1, mpi_integer, me+1, me, mpicom2, ierr) + + else + + if (glb_stats) then + if ( present(filename) ) then + str_length = min(SHR_KIND_CX-6,len_trim(filename)) + fname(1:str_length) = filename(1:str_length) + else + str_length = 6 + fname(1:10) = "timing" + endif + fname(str_length+1:str_length+6) = '_stats' + + if (me .eq. 0) then + open( unitn, file=trim(fname), status='UNKNOWN', access='SEQUENTIAL' ) + write( unitn, 100) npes + close( unitn ) + endif + + ierr = GPTLpr_summary_file(mpicom2, trim(fname)) + fname(str_length+1:str_length+6) = ' ' + endif + + if (write_data) then + if (npes .le. 10) then + write(cme,'(i1.1)') me + cme_adj = 2 + elseif (npes .le. 100) then + write(cme,'(i2.2)') me + cme_adj = 3 + elseif (npes .le. 1000) then + write(cme,'(i3.3)') me + cme_adj = 4 + elseif (npes .le. 10000) then + write(cme,'(i4.4)') me + cme_adj = 5 + elseif (npes .le. 100000) then + write(cme,'(i5.5)') me + cme_adj = 6 + else + write(cme,'(i6.6)') me + cme_adj = 7 + endif + + if ( present(filename) ) then + str_length = min(SHR_KIND_CX-cme_adj,len_trim(filename)) + fname(1:str_length) = filename(1:str_length) + else + str_length = 6 + fname(1:10) = "timing" + endif + fname(str_length+1:str_length+1) = '.' + fname(str_length+2:str_length+cme_adj) = cme + + open( unitn, file=trim(fname), status='UNKNOWN', access='SEQUENTIAL' ) + if (perf_ovhd_measurement) then + write( unitn, 101) me, gme + write( unitn, 102) perf_timing_ovhd + else + write( unitn, 103) me, gme + endif + close( unitn ) + + ierr = GPTLpr_file(trim(fname)) + endif + + endif + + call shr_log_freeUnit( unitn ) + + ! reset GPTL output mode + if (pr_write) then + ierr = GPTLprint_mode_set(GPTLprint_write) + endif + +!$OMP END MASTER + call t_stopf("t_prf") + + return + end subroutine t_prf +! +!======================================================================== +! + subroutine t_initf(NLFilename, LogPrint, LogUnit, mpicom, MasterTask, & + MaxThreads) +!----------------------------------------------------------------------- +! Purpose: Set default values of runtime timing options +! before namelists prof_inparm and papi_inparm are read, +! read namelists (and broadcast, if SPMD), +! then initialize timing library. +! Author: P. Worley (based on shr_inputinfo_mod and runtime_opts) +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + character(len=*), intent(IN) :: NLFilename ! Name-list filename + logical, optional, intent(IN) :: LogPrint ! If print out to log file + integer, optional, intent(IN) :: LogUnit ! Unit number for log output + integer, optional, intent(IN) :: mpicom ! MPI communicator + logical, optional, intent(IN) :: MasterTask ! If MPI master task + integer, optional, intent(IN) :: MaxThreads ! maximum number of threads + ! used by components +! +!---------------------------Local workspace----------------------------- +! + character(len=*), parameter :: subname = '(T_INITF) ' + logical :: MasterTask2 ! If MPI master task + logical :: LogPrint2 ! If print to log + + integer me ! communicator local process id + integer ierr ! error return + integer unitn ! file unit number + integer papi_ctr1_id ! PAPI counter id + integer papi_ctr2_id ! PAPI counter id + integer papi_ctr3_id ! PAPI counter id + integer papi_ctr4_id ! PAPI counter id +! +!---------------------------Namelists ---------------------------------- +! + logical profile_disable + logical profile_barrier + logical profile_single_file + logical profile_global_stats + integer profile_depth_limit + integer profile_detail_limit + integer profile_outpe_num + integer profile_outpe_stride + integer profile_timer + logical profile_papi_enable + logical profile_ovhd_measurement + logical profile_add_detail + namelist /prof_inparm/ profile_disable, profile_barrier, & + profile_single_file, profile_global_stats, & + profile_depth_limit, & + profile_detail_limit, profile_outpe_num, & + profile_outpe_stride, profile_timer, & + profile_papi_enable, profile_ovhd_measurement, & + profile_add_detail + + character(len=16) papi_ctr1_str + character(len=16) papi_ctr2_str + character(len=16) papi_ctr3_str + character(len=16) papi_ctr4_str + namelist /papi_inparm/ papi_ctr1_str, papi_ctr2_str, & + papi_ctr3_str, papi_ctr4_str +! +!---------------------------Externals----------------------------------- +! +#if ( defined _OPENMP ) + integer omp_get_max_threads + external omp_get_max_threads +#endif +!----------------------------------------------------------------------- + if ( timing_initialized ) then +#ifdef DEBUG + write(p_logunit,*) 'T_INITF: timing library already initialized. Request ignored.' +#endif + return + endif + +!$OMP MASTER + if ( present(MaxThreads) ) then + num_threads = MaxThreads + else +#ifdef _OPENMP +!$omp parallel + num_threads = omp_get_max_threads() +!$omp end parallel +#else + num_threads = 1 +#endif + endif + + if ( present(LogUnit) ) then + call t_setLogUnit(LogUnit) + else + call t_setLogUnit(def_p_logunit) + endif + + if ( present(MasterTask) .and. present(mpicom) )then + call mpi_comm_rank(mpicom, me, ierr) + if (ierr .eq. MPI_ERR_COMM) then + call shr_sys_abort('T_INITF: bad mpi communicator') + endif + if (me .eq. 0) then + MasterTask2 = .true. + else + MasterTask2 = .false. + endif + else + MasterTask2 = .true. + end if + + if ( present(LogPrint) ) then + LogPrint2 = LogPrint + else + LogPrint2 = .true. + endif + + ! Set PERF defaults, then override with user-specified input + call perf_defaultopts(timing_disable_out=profile_disable, & + perf_timer_out=profile_timer, & + timer_depth_limit_out=profile_depth_limit, & + timing_detail_limit_out=profile_detail_limit, & + timing_barrier_out=profile_barrier, & + perf_outpe_num_out = profile_outpe_num, & + perf_outpe_stride_out = profile_outpe_stride, & + perf_single_file_out=profile_single_file, & + perf_global_stats_out=profile_global_stats, & + perf_papi_enable_out=profile_papi_enable, & + perf_ovhd_measurement_out=profile_ovhd_measurement, & + perf_add_detail_out=profile_add_detail ) + if ( MasterTask2 ) then + + ! Read in the prof_inparm namelist from NLFilename if it exists + + write(p_logunit,*) '(t_initf) Read in prof_inparm namelist from: '//trim(NLFilename) + unitn = shr_log_getUnit() + + ierr = 1 + open( unitn, file=trim(NLFilename), status="OLD", form="FORMATTED", access="SEQUENTIAL", iostat=ierr ) + if (ierr .eq. 0) then + + ! Look for prof_inparm group name in the input file. + ! If found, leave the file positioned at that namelist group. + call find_group_name(unitn, 'prof_inparm', status=ierr) + + if (ierr == 0) then ! found prof_inparm + read(unitn, nml=prof_inparm, iostat=ierr) + if (ierr /= 0) then + call shr_sys_abort( subname//':: namelist read returns an'// & + ' error condition for prof_inparm' ) + end if + end if + + close(unitn) + + endif + call shr_log_freeUnit( unitn ) + + endif + + ! This logic assumes that there will be only one MasterTask + ! per communicator, and that this MasterTask is process 0. + if ( present(MasterTask) .and. present(mpicom) )then + call shr_mpi_bcast( profile_disable, MPICom ) + call shr_mpi_bcast( profile_barrier, MPICom ) + call shr_mpi_bcast( profile_single_file, MPICom ) + call shr_mpi_bcast( profile_global_stats, MPICom ) + call shr_mpi_bcast( profile_papi_enable, MPICom ) + call shr_mpi_bcast( profile_ovhd_measurement, MPICom ) + call shr_mpi_bcast( profile_add_detail, MPICom ) + call shr_mpi_bcast( profile_depth_limit, MPICom ) + call shr_mpi_bcast( profile_detail_limit, MPICom ) + call shr_mpi_bcast( profile_outpe_num, MPICom ) + call shr_mpi_bcast( profile_outpe_stride, MPICom ) + call shr_mpi_bcast( profile_timer, MPICom ) + end if + call perf_setopts (MasterTask2, LogPrint2, & + timing_disable_in=profile_disable, & + perf_timer_in=profile_timer, & + timer_depth_limit_in=profile_depth_limit, & + timing_detail_limit_in=profile_detail_limit, & + timing_barrier_in=profile_barrier, & + perf_outpe_num_in=profile_outpe_num, & + perf_outpe_stride_in=profile_outpe_stride, & + perf_single_file_in=profile_single_file, & + perf_global_stats_in=profile_global_stats, & + perf_papi_enable_in=profile_papi_enable, & + perf_ovhd_measurement_in=profile_ovhd_measurement, & + perf_add_detail_in=profile_add_detail ) + + ! Set PAPI defaults, then override with user-specified input + if (perf_papi_enable) then + call papi_defaultopts(papi_ctr1_out=papi_ctr1_id, & + papi_ctr2_out=papi_ctr2_id, & + papi_ctr3_out=papi_ctr3_id, & + papi_ctr4_out=papi_ctr4_id ) + + if ( MasterTask2 ) then + papi_ctr1_str = "PAPI_NO_CTR" + papi_ctr2_str = "PAPI_NO_CTR" + papi_ctr3_str = "PAPI_NO_CTR" + papi_ctr4_str = "PAPI_NO_CTR" + + + ! Read in the papi_inparm namelist from NLFilename if it exists + + write(p_logunit,*) '(t_initf) Read in papi_inparm namelist from: '//trim(NLFilename) + unitn = shr_log_getUnit() + + ierr = 1 + open( unitn, file=trim(NLFilename), status="OLD", form="FORMATTED", access="SEQUENTIAL", iostat=ierr ) + if (ierr .eq. 0) then + ! Look for papi_inparm group name in the input file. + ! If found, leave the file positioned at that namelist group. + call find_group_name(unitn, 'papi_inparm', status=ierr) + + if (ierr == 0) then ! found papi_inparm + read(unitn, nml=papi_inparm, iostat=ierr) + if (ierr /= 0) then + call shr_sys_abort( subname//':: namelist read returns an'// & + ' error condition for papi_inparm' ) + end if + end if + + close(unitn) + + endif + call shr_log_freeUnit( unitn ) + + ! if enabled and nothing set, use "defaults" + if ((papi_ctr1_str(1:11) .eq. "PAPI_NO_CTR") .and. & + (papi_ctr2_str(1:11) .eq. "PAPI_NO_CTR") .and. & + (papi_ctr3_str(1:11) .eq. "PAPI_NO_CTR") .and. & + (papi_ctr4_str(1:11) .eq. "PAPI_NO_CTR")) then + papi_ctr1_str = "PAPI_FP_OPS" + endif + + if (papi_ctr1_str(1:11) /= "PAPI_NO_CTR") then + ierr = gptlevent_name_to_code(trim(papi_ctr1_str), papi_ctr1_id) + endif + if (papi_ctr2_str(1:11) /= "PAPI_NO_CTR") then + ierr = gptlevent_name_to_code(trim(papi_ctr2_str), papi_ctr2_id) + endif + if (papi_ctr3_str(1:11) /= "PAPI_NO_CTR") then + ierr = gptlevent_name_to_code(trim(papi_ctr3_str), papi_ctr3_id) + endif + if (papi_ctr4_str(1:11) /= "PAPI_NO_CTR") then + ierr = gptlevent_name_to_code(trim(papi_ctr4_str), papi_ctr4_id) + endif + + endif + ! This logic assumes that there will be only one MasterTask + ! per communicator, and that this MasterTask is process 0. + if ( present(MasterTask) .and. present(mpicom) )then + call shr_mpi_bcast( papi_ctr1_id, MPICom ) + call shr_mpi_bcast( papi_ctr2_id, MPICom ) + call shr_mpi_bcast( papi_ctr3_id, MPICom ) + call shr_mpi_bcast( papi_ctr4_id, MPICom ) + end if + + call papi_setopts (papi_ctr1_in=papi_ctr1_id, & + papi_ctr2_in=papi_ctr2_id, & + papi_ctr3_in=papi_ctr3_id, & + papi_ctr4_in=papi_ctr4_id ) + endif +!$OMP END MASTER +!$OMP BARRIER + + if (timing_disable) return + +!$OMP MASTER + ! + ! Set options and initialize timing library. + ! + ! Set timer + if (gptlsetutr (perf_timer) < 0) call shr_sys_abort (subname//':: gptlsetutr') + ! + ! For logical settings, 2nd arg 0 + ! to gptlsetoption means disable, non-zero means enable + ! + ! Turn off CPU timing (expensive) + ! + if (gptlsetoption (gptlcpu, 0) < 0) call shr_sys_abort (subname//':: gptlsetoption') + ! + ! Enable addition of double quotes to the output of timer names + ! + if (gptlsetoption (gptldopr_quotes, 1) < 0) & + call shr_sys_abort (subname//':: gptlsetoption') + ! + ! Set maximum number of threads + ! + if ( present(MaxThreads) ) then + if (gptlsetoption (gptlmaxthreads, MaxThreads) < 0) & + call shr_sys_abort (subname//':: gptlsetoption') + endif + ! + ! Set max timer depth + ! + if (gptlsetoption (gptldepthlimit, timer_depth_limit) < 0) & + call shr_sys_abort (subname//':: gptlsetoption') + ! + ! Set profile ovhd measurement (default is false) + ! + if (perf_ovhd_measurement) then + if (gptlsetoption (gptlprofile_ovhd, 1) < 0) & + call shr_sys_abort (subname//':: gptlsetoption') + endif + ! + ! Next 2 calls only work if PAPI is enabled. These examples enable counting + ! of total cycles and floating point ops, respectively + ! + if (perf_papi_enable) then + if (papi_ctr1 /= PAPI_NULL) then + if (gptlsetoption (papi_ctr1, 1) < 0) call shr_sys_abort (subname//':: gptlsetoption') + endif + if (papi_ctr2 /= PAPI_NULL) then + if (gptlsetoption (papi_ctr2, 1) < 0) call shr_sys_abort (subname//':: gptlsetoption') + endif + if (papi_ctr3 /= PAPI_NULL) then + if (gptlsetoption (papi_ctr3, 1) < 0) call shr_sys_abort (subname//':: gptlsetoption') + endif + if (papi_ctr4 /= PAPI_NULL) then + if (gptlsetoption (papi_ctr4, 1) < 0) call shr_sys_abort (subname//':: gptlsetoption') + endif + endif + ! + ! Initialize the timing lib. This call must occur after all gptlsetoption + ! calls and before all other timing lib calls. + ! + if (gptlinitialize () < 0) call shr_sys_abort (subname//':: gptlinitialize') + timing_initialized = .true. +!$OMP END MASTER +!$OMP BARRIER + + return + end subroutine t_initf +! +!======================================================================== +! + subroutine t_finalizef() +!----------------------------------------------------------------------- +! Purpose: shut down timing library +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Local workspace----------------------------- +! + integer ierr ! GPTL error return +! +!----------------------------------------------------------------------- +! + if (.not. timing_initialized) return + +!$OMP MASTER + ierr = GPTLfinalize() + timing_initialized = .false. +!$OMP END MASTER +!$OMP BARRIER + + return + end subroutine t_finalizef + +!=============================================================================== + +end module perf_mod diff --git a/share/timing/perf_utils.F90 b/share/timing/perf_utils.F90 new file mode 100644 index 0000000..96d08ff --- /dev/null +++ b/share/timing/perf_utils.F90 @@ -0,0 +1,535 @@ +module perf_utils + +!----------------------------------------------------------------------- +! +! Purpose: This module supplies the csm_share and CAM utilities +! needed by perf_mod.F90 (when the csm_share and CAM utilities +! are not available). +! +! Author: P. Worley, October 2007 +! +! $Id$ +! +!----------------------------------------------------------------------- + +!----------------------------------------------------------------------- +!- module boilerplate -------------------------------------------------- +!----------------------------------------------------------------------- + implicit none + private ! Make the default access private +#include + save + +!----------------------------------------------------------------------- +! Public interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + public perfutils_setunit + public shr_sys_abort + public shr_mpi_barrier + public shr_log_getUnit + public shr_log_freeUnit + public find_group_name + public to_lower + public shr_mpi_bcast + + interface shr_mpi_bcast ; module procedure & + shr_mpi_bcastl0, & + shr_mpi_bcasti0 + end interface + +!----------------------------------------------------------------------- +! Private interfaces --------------------------------------------------- +!----------------------------------------------------------------------- + private shr_sys_flush + private shr_mpi_chkerr + private shr_mpi_abort + +!----------------------------------------------------------------------- +!- include statements -------------------------------------------------- +!----------------------------------------------------------------------- +#include "gptl.inc" + +!----------------------------------------------------------------------- +! Public data --------------------------------------------------------- +!----------------------------------------------------------------------- + + !---------------------------------------------------------------------------- + ! precision/kind constants (from csm_share/shr/shr_kind_mod.F90) + !---------------------------------------------------------------------------- + integer,parameter,public :: SHR_KIND_R8 = selected_real_kind(12) ! 8 byte real + integer,parameter,public :: SHR_KIND_I8 = selected_int_kind (13) ! 8 byte integer + integer,parameter,public :: SHR_KIND_IN = kind(1) ! native integer + integer,parameter,public :: SHR_KIND_CS = 80 ! short char + integer,parameter,public :: SHR_KIND_CM = 160 ! mid-sized char + integer,parameter,public :: SHR_KIND_CL = 256 ! long char + integer,parameter,public :: SHR_KIND_CX = 512 ! extra-long char + +!----------------------------------------------------------------------- +! Private data --------------------------------------------------------- +!----------------------------------------------------------------------- + + integer, parameter :: def_pu_logunit = 6 ! default + integer, private :: pu_logunit = def_pu_logunit + ! unit number for log output + +!======================================================================= +contains +!======================================================================= + +! +!======================================================================== +! + subroutine perfutils_setunit(LogUnit) +!----------------------------------------------------------------------- +! Purpose: Set log unit number. +! Author: P. Worley +!----------------------------------------------------------------------- +!---------------------------Input arguments----------------------------- +! + integer(SHR_KIND_IN), intent(IN) :: LogUnit ! Unit number for log output +!----------------------------------------------------------------------- + pu_logunit = LogUnit +! + return +! + end subroutine perfutils_setunit + +!============== Routines from csm_share/shr/shr_sys_mod.F90 ============ +!======================================================================= + +SUBROUTINE shr_sys_abort(string) + + IMPLICIT none + + character(*) ,optional :: string ! error message string + + !----- local ----- + integer(SHR_KIND_IN) :: ierr + logical :: flag + + !----- formats ----- + character(*),parameter :: subName = '(shr_sys_abort) ' + character(*),parameter :: F00 = "('(shr_sys_abort) ',4a)" + +!------------------------------------------------------------------------------- +! PURPOSE: consistent stopping mechanism +! (dumbed down from original shr_sys_mod.F90 version for use in perf_mod) +!------------------------------------------------------------------------------- + + call shr_sys_flush(pu_logunit) + + if ( present(string) ) then + if (len_trim(string) > 0) then + write(pu_logunit,*) trim(subName),' ERROR: ',trim(string) + else + write(pu_logunit,*) trim(subName),' ERROR ' + endif + else + write(pu_logunit,*) trim(subName),' ERROR ' + endif + + write(pu_logunit,F00) 'WARNING: calling mpi_abort() and stopping' + call shr_sys_flush(pu_logunit) + call mpi_abort(MPI_COMM_WORLD,0,ierr) + call shr_sys_flush(pu_logunit) +#ifndef CPRNAG + call abort() +#endif + + stop + +END SUBROUTINE shr_sys_abort + +!=============================================================================== +!=============================================================================== + +SUBROUTINE shr_sys_flush(unit) + + IMPLICIT none + + !----- arguments ----- + integer(SHR_KIND_IN) :: unit ! flush output buffer for this unit + + !----- formats ----- + character(*),parameter :: subName = '(shr_sys_flush) ' + character(*),parameter :: F00 = "('(shr_sys_flush) ',4a)" + +!------------------------------------------------------------------------------- +! PURPOSE: an architecture independant system call +!------------------------------------------------------------------------------- + +#if (defined IRIX64 || defined CRAY || defined OSF1 || defined SUNOS || defined LINUX || defined NEC_SX || defined UNICOSMP) +#ifdef CPRNAG + flush(unit) +#else + call flush(unit) +#endif +#endif +#if (defined AIX) + call flush_(unit) +#endif + +END SUBROUTINE shr_sys_flush + +!=============================================================================== + +!================== Routines from csm_share/shr/shr_mpi_mod.F90 =============== +!=============================================================================== + +SUBROUTINE shr_mpi_chkerr(rcode,string) + + IMPLICIT none + + !----- arguments --- + integer(SHR_KIND_IN), intent(in) :: rcode ! input MPI error code + character(*), intent(in) :: string ! message + + !----- local --- + character(*),parameter :: subName = '(shr_mpi_chkerr) ' + character(MPI_MAX_ERROR_STRING) :: lstring + integer(SHR_KIND_IN) :: len + integer(SHR_KIND_IN) :: ierr + +!------------------------------------------------------------------------------- +! PURPOSE: layer on MPI error checking +!------------------------------------------------------------------------------- + + if (rcode /= MPI_SUCCESS) then + call MPI_ERROR_STRING(rcode,lstring,len,ierr) + write(pu_logunit,*) trim(subName),":",lstring(1:len) + call shr_mpi_abort(string,rcode) + endif + +END SUBROUTINE shr_mpi_chkerr + +!=============================================================================== +!=============================================================================== + +SUBROUTINE shr_mpi_abort(string,rcode) + + IMPLICIT none + + !----- arguments --- + character(*),optional,intent(in) :: string ! message + integer,optional,intent(in) :: rcode ! optional code + + !----- local --- + character(*),parameter :: subName = '(shr_mpi_abort) ' + integer(SHR_KIND_IN) :: ierr + +!------------------------------------------------------------------------------- +! PURPOSE: MPI abort +!------------------------------------------------------------------------------- + + if ( present(string) .and. present(rcode) ) then + write(pu_logunit,*) trim(subName),":",trim(string),rcode + endif + call MPI_ABORT(MPI_COMM_WORLD,rcode,ierr) + +END SUBROUTINE shr_mpi_abort + +!=============================================================================== +!=============================================================================== + +SUBROUTINE shr_mpi_barrier(comm,string) + + IMPLICIT none + + !----- arguments --- + integer,intent(in) :: comm + character(*),optional,intent(in) :: string ! message + + !----- local --- + character(*),parameter :: subName = '(shr_mpi_barrier) ' + integer(SHR_KIND_IN) :: ierr + +!------------------------------------------------------------------------------- +! PURPOSE: MPI barrier +!------------------------------------------------------------------------------- + + call MPI_BARRIER(comm,ierr) + if (present(string)) then + call shr_mpi_chkerr(ierr,subName//trim(string)) + else + call shr_mpi_chkerr(ierr,subName) + endif + +END SUBROUTINE shr_mpi_barrier + +!=============================================================================== +!=============================================================================== + +SUBROUTINE shr_mpi_bcasti0(vec,comm,string) + + IMPLICIT none + + !----- arguments --- + integer(SHR_KIND_IN), intent(inout):: vec ! vector of 1 + integer(SHR_KIND_IN), intent(in) :: comm ! mpi communicator + character(*),optional,intent(in) :: string ! message + + !----- local --- + character(*),parameter :: subName = '(shr_mpi_bcasti0) ' + integer(SHR_KIND_IN) :: ierr + integer(SHR_KIND_IN) :: lsize + +!------------------------------------------------------------------------------- +! PURPOSE: Broadcast an integer +!------------------------------------------------------------------------------- + + lsize = 1 + + call MPI_BCAST(vec,lsize,MPI_INTEGER,0,comm,ierr) + if (present(string)) then + call shr_mpi_chkerr(ierr,subName//trim(string)) + else + call shr_mpi_chkerr(ierr,subName) + endif + +END SUBROUTINE shr_mpi_bcasti0 + +!=============================================================================== +!=============================================================================== + +SUBROUTINE shr_mpi_bcastl0(vec,comm,string) + + IMPLICIT none + + !----- arguments --- + logical, intent(inout):: vec ! vector of 1 + integer(SHR_KIND_IN), intent(in) :: comm ! mpi communicator + character(*),optional,intent(in) :: string ! message + + !----- local --- + character(*),parameter :: subName = '(shr_mpi_bcastl0) ' + integer(SHR_KIND_IN) :: ierr + integer(SHR_KIND_IN) :: lsize + +!------------------------------------------------------------------------------- +! PURPOSE: Broadcast a logical +!------------------------------------------------------------------------------- + + lsize = 1 + + call MPI_BCAST(vec,lsize,MPI_LOGICAL,0,comm,ierr) + if (present(string)) then + call shr_mpi_chkerr(ierr,subName//trim(string)) + else + call shr_mpi_chkerr(ierr,subName) + endif + +END SUBROUTINE shr_mpi_bcastl0 + +!=============================================================================== + +!================== Routines from csm_share/shr/shr_log_mod.F90 =============== +!=============================================================================== +!BOP =========================================================================== +! +! !IROUTINE: shr_log_getUnit -- Get a free FORTRAN unit number +! +! !DESCRIPTION: Get the next free FORTRAN unit number. +! +! !REVISION HISTORY: +! 2005-Dec-14 - E. Kluzek - creation +! 2007-Oct-21 - P. Worley - dumbed down for use in perf_mod +! +! !INTERFACE: ------------------------------------------------------------------ + +INTEGER FUNCTION shr_log_getUnit () + + implicit none + +!EOP + + !----- local parameters ----- + integer(SHR_KIND_IN),parameter :: shr_log_minUnit = 10 ! Min unit number to give + integer(SHR_KIND_IN),parameter :: shr_log_maxUnit = 99 ! Max unit number to give + + !----- local variables ----- + integer(SHR_KIND_IN) :: n ! loop index + logical :: opened ! If unit opened or not + + !----- formats ----- + character(*),parameter :: subName = '(shr_log_getUnit) ' + character(*),parameter :: F00 = "('(shr_log_getUnit) ',A,I4,A)" + +!------------------------------------------------------------------------------- +! Notes: +!------------------------------------------------------------------------------- + + ! --- Choose first available unit other than 0, 5, or 6 ------ + do n=shr_log_minUnit, shr_log_maxUnit + inquire( n, opened=opened ) + if (n == 5 .or. n == 6 .or. opened) then + cycle + end if + shr_log_getUnit = n + return + end do + + call shr_sys_abort( subName//': Error: no available units found' ) + +END FUNCTION shr_log_getUnit +!=============================================================================== + +!=============================================================================== +!BOP =========================================================================== +! +! !IROUTINE: shr_log_freeUnit -- Free up a FORTRAN unit number +! +! !DESCRIPTION: Free up the given unit number +! +! !REVISION HISTORY: +! 2005-Dec-14 - E. Kluzek - creation +! 2007-Oct-21 - P. Worley - dumbed down for use in perf_mod +! +! !INTERFACE: ------------------------------------------------------------------ + +SUBROUTINE shr_log_freeUnit ( unit) + + implicit none + +! !INPUT/OUTPUT PARAMETERS: + + integer(SHR_KIND_IN),intent(in) :: unit ! unit number to be freed + +!EOP + + !----- local parameters ----- + integer(SHR_KIND_IN),parameter :: shr_log_minUnit = 10 ! Min unit number to give + integer(SHR_KIND_IN),parameter :: shr_log_maxUnit = 99 ! Max unit number to give + + !----- formats ----- + character(*), parameter :: subName = '(shr_log_freeUnit) ' + character(*), parameter :: F00 = "('(shr_log_freeUnit) ',A,I4,A)" + +!------------------------------------------------------------------------------- +! Notes: +!------------------------------------------------------------------------------- + + if (unit < 0 .or. unit > shr_log_maxUnit) then +!pw if (s_loglev > 0) write(pu_logunit,F00) 'invalid unit number request:', unit + else if (unit == 0 .or. unit == 5 .or. unit == 6) then + call shr_sys_abort( subName//': Error: units 0, 5, and 6 must not be freed' ) + end if + + return + +END SUBROUTINE shr_log_freeUnit +!=============================================================================== + +!============= Routines from atm/cam/src/utils/namelist_utils.F90 ============== +!=============================================================================== + +subroutine find_group_name(unit, group, status) + +!--------------------------------------------------------------------------------------- +! Purpose: +! Search a file that contains namelist input for the specified namelist group name. +! Leave the file positioned so that the current record is the first record of the +! input for the specified group. +! +! Method: +! Read the file line by line. Each line is searched for an '&' which may only +! be preceded by blanks, immediately followed by the group name which is case +! insensitive. If found then backspace the file so the current record is the +! one containing the group name and return success. Otherwise return -1. +! +! Author: B. Eaton, August 2007 +!--------------------------------------------------------------------------------------- + + integer, intent(in) :: unit ! fortran unit attached to file + character(len=*), intent(in) :: group ! namelist group name + integer, intent(out) :: status ! 0 for success, -1 if group name not found + + ! Local variables + + integer :: len_grp + integer :: ios ! io status + character(len=80) :: inrec ! first 80 characters of input record + character(len=80) :: inrec2 ! left adjusted input record + character(len=len(group)) :: lc_group + + !--------------------------------------------------------------------------- + + len_grp = len_trim(group) + lc_group = to_lower(group) + + ios = 0 + do while (ios <= 0) + + read(unit, '(a)', iostat=ios, end=102) inrec + + if (ios <= 0) then ! ios < 0 indicates an end of record condition + + ! look for group name in this record + + ! remove leading blanks + inrec2 = to_lower(adjustl(inrec)) + + ! check for leading '&' + if (inrec2(1:1) == '&') then + + ! check for case insensitive group name + if (trim(lc_group) == inrec2(2:len_grp+1)) then + + ! found group name. backspace to leave file position at this record + backspace(unit) + status = 0 + return + + end if + end if + end if + + end do + + 102 continue ! end of file processing + status = -1 + +end subroutine find_group_name +!=============================================================================== + +!================ Routines from atm/cam/src/utils/string_utils.F90 ============= +!=============================================================================== + +function to_lower(str) + +!----------------------------------------------------------------------- +! Purpose: +! Convert character string to lower case. +! +! Method: +! Use achar and iachar intrinsics to ensure use of ascii collating sequence. +! +! Author: B. Eaton, July 2001 +! +! $Id$ +!----------------------------------------------------------------------- + implicit none + + character(len=*), intent(in) :: str ! String to convert to lower case + character(len=len(str)) :: to_lower + +! Local variables + + integer :: i ! Index + integer :: aseq ! ascii collating sequence + integer :: upper_to_lower ! integer to convert case + character(len=1) :: ctmp ! Character temporary +!----------------------------------------------------------------------- + upper_to_lower = iachar("a") - iachar("A") + + do i = 1, len(str) + ctmp = str(i:i) + aseq = iachar(ctmp) + if ( aseq >= iachar("A") .and. aseq <= iachar("Z") ) & + ctmp = achar(aseq + upper_to_lower) + to_lower(i:i) = ctmp + end do + +end function to_lower +!=============================================================================== + +end module perf_utils diff --git a/share/timing/private.h b/share/timing/private.h new file mode 100644 index 0000000..a255224 --- /dev/null +++ b/share/timing/private.h @@ -0,0 +1,165 @@ +/* +** $Id: private.h,v 1.74 2011-03-28 20:55:19 rosinski Exp $ +** +** Author: Jim Rosinski +** +** Contains definitions private to GPTL and inaccessible to invoking user environment +*/ + +#include +#include + +#ifndef NO_COMM_F2C +#ifndef HAVE_COMM_F2C +#define HAVE_COMM_F2C +#endif +#endif + +#ifndef MIN +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#endif + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) +#endif + +#define STRMATCH(X,Y) (strcmp((X),(Y)) == 0) + +#define STRNMATCH(X,Y,N) (strncmp((X),(Y),(N)) == 0) + +/* Output counts less than PRTHRESH will be printed as integers */ +#define PRTHRESH 1000000L + +/* Maximum allowed callstack depth */ +#define MAX_STACK 128 + +/* longest timer name allowed (probably safe to just change) */ +#define MAX_CHARS 127 + +/* +** max allowable number of PAPI counters, or derived events. For convenience, +** set to max (# derived events, # papi counters required) so "avail" lists +** all available options. +*/ +#define MAX_AUX 9 + +#ifndef __cplusplus +typedef enum {false = 0, true = 1} bool; /* mimic C++ */ +#endif + +typedef struct { + long last_utime; /* saved usr time from "start" */ + long last_stime; /* saved sys time from "start" */ + long accum_utime; /* accumulator for usr time */ + long accum_stime; /* accumulator for sys time */ +} Cpustats; + +typedef struct { + double last; /* timestamp from last call */ + double latest; /* most recent delta */ + double accum; /* accumulated time */ + float max; /* longest time for start/stop pair */ + float min; /* shortest time for start/stop pair */ + float prev_min; /* previous shortest time for start/stop pair */ + int latest_is_min; /* whether min is current latest (1) or not (0) */ +} Wallstats; + +typedef struct { + long long last[MAX_AUX]; /* array of saved counters from "start" */ + long long accum[MAX_AUX]; /* accumulator for counters */ +} Papistats; + +typedef struct { + int counter; /* PAPI or Derived counter */ + char *namestr; /* PAPI or Derived counter as string */ + char *str8; /* print string for output timers (8 chars) */ + char *str16; /* print string for output timers (16 chars) */ + char *longstr; /* long descriptive print string */ +} Entry; + +typedef struct { + Entry event; + int numidx; /* derived event: PAPI counter array index for numerator */ + int denomidx; /* derived event: PAPI counter array index for denominator */ +} Pr_event; + +typedef struct TIMER { + char name[MAX_CHARS+1]; /* timer name (user input) */ + bool onflg; /* timer currently on or off */ +#ifdef ENABLE_PMPI + double nbytes; /* number of bytes for MPI call */ +#endif +#ifdef HAVE_PAPI + Papistats aux; /* PAPI stats */ +#endif + Wallstats wall; /* wallclock stats */ + Cpustats cpu; /* cpu stats */ + unsigned long count; /* number of start/stop calls */ + unsigned long nrecurse; /* number of recursive start/stop calls */ + void *address; /* address of timer: used only by _instr routines */ + struct TIMER *next; /* next timer in linked list */ + struct TIMER **parent; /* array of parents */ + struct TIMER **children; /* array of children */ + int *parent_count; /* array of call counts, one for each parent */ + unsigned int recurselvl; /* recursion level */ + unsigned int nchildren; /* number of children */ + unsigned int nparent; /* number of parents */ + unsigned int norphan; /* number of times this timer was an orphan */ + int num_desc; /* number of descendants */ +} Timer; + +typedef struct { + Timer **entries; /* array of timers hashed to the same value */ + unsigned int nument; /* number of entries hashed to the same value */ +} Hashentry; + +/* Function prototypes */ + +extern int GPTLerror (const char *, ...); /* print error msg and return */ +extern void GPTLset_abort_on_error (bool val); /* set flag to abort on error */ +extern void *GPTLallocate (const int); /* malloc wrapper */ + +extern int GPTLstart_instr (void *); /* auto-instrumented start */ +extern int GPTLstop_instr (void *); /* auto-instrumented stop */ +extern int GPTLis_initialized (void); /* needed by MPI_Init wrapper */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef AUTO_INST +extern void __cyg_profile_func_enter (void *, void *); +extern void __cyg_profile_func_exit (void *, void *); +#endif + +#ifdef __cplusplus +}; +#endif + +/* +** These are needed for communication between gptl.c and gptl_papi.c +*/ + +#ifdef HAVE_PAPI +extern int GPTL_PAPIsetoption (const int, const int); +extern int GPTL_PAPIinitialize (const int, const bool, int *, Entry *); +extern int GPTL_PAPIstart (const int, Papistats *); +extern int GPTL_PAPIstop (const int, Papistats *); +extern void GPTL_PAPIprstr (FILE *); +extern void GPTL_PAPIpr (FILE *, const Papistats *, const int, const int, const double); +extern void GPTL_PAPIadd (Papistats *, const Papistats *); +extern void GPTL_PAPIfinalize (int); +extern void GPTL_PAPIquery (const Papistats *, long long *, int); +extern int GPTL_PAPIget_eventvalue (const char *, const Papistats *, double *); +extern bool GPTL_PAPIis_multiplexed (void); +extern void GPTL_PAPIprintenabled (FILE *); +extern void read_counters100 (void); +extern int GPTLget_npapievents (void); +extern int GPTLcreate_and_start_events (const int); +#endif + +#ifdef ENABLE_PMPI +extern Timer *GPTLgetentry (const char *); +extern int GPTLpmpi_setoption (const int, const int); +extern int GPTLpr_has_been_called (void); /* needed by MPI_Finalize wrapper*/ +#endif