From 5096009ff34d515252ecb97c8bf508610b092acd Mon Sep 17 00:00:00 2001 From: Scott Hemmert Date: Wed, 10 Jan 2024 14:52:45 -0700 Subject: [PATCH] Time fault (#1029) * Made links build serially across threads on Mac due to slow deletes if they are build in parallel. * Added check to main run loop to detect time faults, i.e. time moving backward or rolling over the 64-bit core time variable. --- src/sst/core/config.cc | 24 ++++++- src/sst/core/main.cc | 16 ++++- src/sst/core/simulation.cc | 29 ++++++++- .../core/testElements/coreTest_Component.cc | 5 +- .../core/testElements/coreTest_Component.h | 5 +- .../testingframework/sst_unittest_support.py | 5 +- tests/Makefile.inc | 2 + .../refFiles/test_Component_time_overflow.out | 15 +++++ tests/test_Component_time_overflow.py | 65 +++++++++++++++++++ tests/testsuite_default_Component.py | 32 ++++++--- 10 files changed, 180 insertions(+), 18 deletions(-) create mode 100644 tests/refFiles/test_Component_time_overflow.out create mode 100644 tests/test_Component_time_overflow.py diff --git a/src/sst/core/config.cc b/src/sst/core/config.cc index 4e97b3465..8b6b5019b 100644 --- a/src/sst/core/config.cc +++ b/src/sst/core/config.cc @@ -362,6 +362,26 @@ class ConfigHelper return 0; } + static std::string getTimebaseExtHelp() + { + std::string msg = "Timebase:\n\n"; + msg.append("Time in SST core is represented by a 64-bit unsigned integer. By default, each count of that " + "value represents 1ps of time. The timebase option allows you to set that atomic core timebase to " + "a different value.\n "); + msg.append("There are two things to balance when determing a timebase to use:\n\n"); + msg.append("1) The shortest time period or fastest clock frequency you want to represent:\n"); + msg.append(" It is recommended that the core timebase be set to ~1000x smaller than the shortest time period " + "(fastest frequency) in your simulation. For the default 1ps timebase, clocks in the 1-10GHz range " + "are easily represented. If you want to have higher frequency clocks, you can set the timebase to " + "a smaller value, at the cost of decreasing the amount of time you can simulate.\n\n"); + msg.append("2) How much simulated time you need to support:\n"); + msg.append(" The default timebase of 1ps will support ~215.5 days (5124 hours) of simulated time. If you are " + "using SST to simulate longer term phenomena, you will need to make the core timebase longer. A " + "consequence of increaing the timebase is that the minimum time period that can be represented will " + "increase (conversely, the maximum frequency that can be represented will increase)."); + return msg; + } + static std::string getProfilingExtHelp() { std::string msg = "Profiling Points [EXPERIMENTAL]:\n\n"; @@ -696,9 +716,9 @@ Config::insertOptions() /* Advanced Features */ DEF_SECTION_HEADING("Advanced Options"); - DEF_ARG( + DEF_ARG_EH( "timebase", 0, "TIMEBASE", "Set the base time step of the simulation (default: 1ps)", - std::bind(&ConfigHelper::setTimebase, this, _1), true); + std::bind(&ConfigHelper::setTimebase, this, _1), std::bind(&ConfigHelper::getTimebaseExtHelp), true); #ifdef SST_CONFIG_HAVE_MPI DEF_ARG_OPTVAL( "parallel-load", 0, "MODE", diff --git a/src/sst/core/main.cc b/src/sst/core/main.cc index aac9e2843..9c4d14b31 100644 --- a/src/sst/core/main.cc +++ b/src/sst/core/main.cc @@ -240,7 +240,6 @@ do_statengine_initialization(ConfigGraph* graph, SST::Simulation_impl* sim, cons static void do_link_preparation(ConfigGraph* graph, SST::Simulation_impl* sim, const RankInfo& myRank, SimTime_t min_part) { - sim->prepareLinks(*graph, myRank, min_part); } @@ -362,7 +361,22 @@ start_simulation(uint32_t tid, SimThreadInfo_t& info, Core::ThreadSafe::Barrier& // Prepare the links, which creates the ComponentInfo objects and // Link and puts the links in the LinkMap for each ComponentInfo. +#ifdef SST_COMPILE_MACOSX + // Some versions of clang on mac have an issue with deleting links + // that were created interleaved between threads, so force + // serialization of threads during link creation. This has been + // confirmed on both Intel and ARM for Xcode 14 and 15. We should + // revisit this in the future. This is easy to see when running + // sst-benchmark with 1024 components and multiple threads. At + // time of adding this code, the difference in delete times was + // 3-5 minutes versuses less than a second. + for ( uint32_t i = 0; i < info.world_size.thread; ++i ) { + if ( i == info.myRank.thread ) { do_link_preparation(info.graph, sim, info.myRank, info.min_part); } + barrier.wait(); + } +#else do_link_preparation(info.graph, sim, info.myRank, info.min_part); +#endif barrier.wait(); // Create all the simulation components diff --git a/src/sst/core/simulation.cc b/src/sst/core/simulation.cc index a22a5551c..ed3be19fe 100644 --- a/src/sst/core/simulation.cc +++ b/src/sst/core/simulation.cc @@ -661,10 +661,22 @@ Simulation_impl::run() run_phase_start_time = sst_get_cpu_time(); - while ( LIKELY(!endSim) ) { + // Will check to make sure time doesn't "go backwards". This will + // also catch the case of rollover (exceeding the 64-bit value + // space of SimTime_t). To avoid yet another branch in the main + // run loop, we will check for a time fault, but will execute the + // next event and only exit the run loop on the next iteration. + // If there was a fault, a message will be printed. + bool time_fault = false; + while ( LIKELY(!endSim && !time_fault) ) { current_activity = timeVortex->pop(); - currentSimCycle = current_activity->getDeliveryTime(); - currentPriority = current_activity->getPriority(); + + // Check for time fault + SimTime_t event_time = current_activity->getDeliveryTime(); + time_fault = event_time < currentSimCycle; + + currentSimCycle = event_time; + currentPriority = current_activity->getPriority(); current_activity->execute(); #if SST_PERIODIC_PRINT @@ -711,6 +723,17 @@ Simulation_impl::run() } #endif } + + // Check to see if there was a time fault + if ( time_fault ) { + sim_output.fatal( + CALL_INFO, 1, + "ERROR: SST Core detected a time fault (an event had an earlier time than the previous event). The most " + "likely cause of this is that the 64-bit core time had an overflow condition. This is typically caused by " + "having low frequency events with too low of a timebase. See the extended help for --timebase option (sst " + "--help timebase)\n"); + } + /* We shouldn't need to do this, but to be safe... */ runBarrier.wait(); // TODO<- Is this needed? diff --git a/src/sst/core/testElements/coreTest_Component.cc b/src/sst/core/testElements/coreTest_Component.cc index 7c13496f0..3aa48c076 100644 --- a/src/sst/core/testElements/coreTest_Component.cc +++ b/src/sst/core/testElements/coreTest_Component.cc @@ -37,6 +37,9 @@ coreTestComponent::coreTestComponent(ComponentId_t id, Params& params) : coreTes commSize = params.find("commSize", 16); + + std::string clockFrequency = params.find("clockFrequency", "1GHz"); + // init randomness srand(1); neighbor = rng->generateNextInt32() % 4; @@ -62,7 +65,7 @@ coreTestComponent::coreTestComponent(ComponentId_t id, Params& params) : coreTes assert(W); // set our clock - registerClock("1GHz", new Clock::Handler(this, &coreTestComponent::clockTic)); + registerClock(clockFrequency, new Clock::Handler(this, &coreTestComponent::clockTic)); } coreTestComponent::~coreTestComponent() diff --git a/src/sst/core/testElements/coreTest_Component.h b/src/sst/core/testElements/coreTest_Component.h index 5d07f044f..3281d26b7 100644 --- a/src/sst/core/testElements/coreTest_Component.h +++ b/src/sst/core/testElements/coreTest_Component.h @@ -29,7 +29,8 @@ class coreTestComponentBase : public SST::Component SST_ELI_REGISTER_COMPONENT_BASE(SST::CoreTestComponent::coreTestComponentBase) SST_ELI_DOCUMENT_PARAMS( - { "workPerCycle", "Count of busy work to do during a clock tick.", NULL} + { "workPerCycle", "Count of busy work to do during a clock tick.", NULL}, + { "clockFrequency", "Frequency of the clock", "1GHz"} ) SST_ELI_DOCUMENT_STATISTICS( @@ -55,7 +56,7 @@ class coreTestComponentBase2 : public coreTestComponentBase SST::CoreTestComponent::coreTestComponentBase2, SST::CoreTestComponent::coreTestComponentBase) SST_ELI_DOCUMENT_PARAMS( - { "commFreq", "Approximate frequency of sending an event during a clock tick.", NULL}, + { "commFreq", "There is a 1/commFreq chance each clock cycle of sending an event to a neighbor", NULL} ) SST_ELI_DOCUMENT_STATISTICS( diff --git a/src/sst/core/testingframework/sst_unittest_support.py b/src/sst/core/testingframework/sst_unittest_support.py index 0d48aba64..beb39a160 100644 --- a/src/sst/core/testingframework/sst_unittest_support.py +++ b/src/sst/core/testingframework/sst_unittest_support.py @@ -1070,8 +1070,9 @@ def filter(self, line): class IgnoreAllAfterFilter(LineFilter): """ Filters out any line that starts with a specified string and all lines after it """ - def __init__(self, prefix): + def __init__(self, prefix, keep_line = False): self._prefix = prefix; + self._keep_line = keep_line self._found = False def reset(self): @@ -1091,6 +1092,8 @@ def filter(self, line): if self._found: return None if ( line.startswith(self._prefix) ): self._found = True + if self._keep_line: + return line return None return line diff --git a/tests/Makefile.inc b/tests/Makefile.inc index ee97267d7..6fb0ada22 100644 --- a/tests/Makefile.inc +++ b/tests/Makefile.inc @@ -17,6 +17,7 @@ EXTRA_DIST += \ tests/testsuite_default_MemPoolTest.py \ tests/testsuite_testengine_testing.py \ tests/test_Component.py \ + tests/test_Component_time_overflow.py \ tests/test_ClockerComponent.py \ tests/test_DistribComponent_discrete.py \ tests/test_DistribComponent_expon.py \ @@ -44,6 +45,7 @@ EXTRA_DIST += \ tests/test_PythonUnitAlgebra.py \ tests/test_PerfComponent.py \ tests/refFiles/test_Component.out \ + tests/refFiles/test_Component_time_overflow.out \ tests/refFiles/test_PerfComponent.out \ tests/refFiles/test_DistribComponent_discrete.out \ tests/refFiles/test_DistribComponent_expon.out \ diff --git a/tests/refFiles/test_Component_time_overflow.out b/tests/refFiles/test_Component_time_overflow.out new file mode 100644 index 000000000..aef7f3504 --- /dev/null +++ b/tests/refFiles/test_Component_time_overflow.out @@ -0,0 +1,15 @@ +FATAL: SST Core: ERROR: SST Core detected a time fault (an event had an earlier time than the previous event). The most likely cause of this is that the 64-bit core time had an overflow condition. This is typically caused by having low frequency events with too low of a timebase. See the extended help for --timebase option (sst --help timebase) +SST Fatal Backtrace Information: + 0 : 0 sstsim.x 0x000000010024ea3f _ZNK3SST6Output5fatalEjPKcS2_iS2_z + 879 + 1 : 1 sstsim.x 0x0000000100269b82 _ZN3SST15Simulation_impl3runEv + 866 + 2 : 2 sstsim.x 0x00000001001f1b20 _ZL16start_simulationjR15SimThreadInfo_tRN3SST4Core10ThreadSafe7BarrierE + 3200 + 3 : 3 sstsim.x 0x00000001001ed044 main + 7492 + 4 : 4 dyld 0x000000010916852e start + 462 +-------------------------------------------------------------------------- +MPI_ABORT was invoked on rank 0 in communicator MPI_COMM_WORLD +with errorcode 5. + +NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes. +You may or may not see output from other processes, depending on +exactly when Open MPI kills them. +-------------------------------------------------------------------------- diff --git a/tests/test_Component_time_overflow.py b/tests/test_Component_time_overflow.py new file mode 100644 index 000000000..a64975367 --- /dev/null +++ b/tests/test_Component_time_overflow.py @@ -0,0 +1,65 @@ +import sst + +# Define SST core options +#sst.setProgramOption("stop-at", "25us") + +# Define the simulation components +comp_c0_0 = sst.Component("c0_0", "coreTestElement.coreTestComponent") +comp_c0_0.addParams({ + "workPerCycle" : "100", + "commSize" : "100", + "commFreq" : "1000", + "clockFrequency" : "60s" +}) + +comp_c0_1 = sst.Component("c0_1", "coreTestElement.coreTestComponent") +comp_c0_1.addParams({ + "workPerCycle" : "100", + "commSize" : "100", + "commFreq" : "1000", + "clockFrequency" : "60s" +}) + +comp_c1_0 = sst.Component("c1_0", "coreTestElement.coreTestComponent") +comp_c1_0.addParams({ + "workPerCycle" : "100", + "commSize" : "100", + "commFreq" : "1000", + "clockFrequency" : "60s" +}) + +comp_c1_1 = sst.Component("c1_1", "coreTestElement.coreTestComponent") +comp_c1_1.addParams({ + "workPerCycle" : "100", + "commSize" : "100", + "commFreq" : "1000", + "clockFrequency" : "60s" +}) + +# Define the simulation links + +# North/South links +link_ns_0_01 = sst.Link("link_ns_0_01") +link_ns_0_01.connect( (comp_c0_0, "Nlink", "10000ps"), (comp_c0_1, "Slink", "10000ps") ) + +link_ns_0_10 = sst.Link("link_ns_0_10") +link_ns_0_10.connect( (comp_c0_0, "Slink", "10000ps"), (comp_c0_1, "Nlink", "10000ps") ) + +link_ns_1_01 = sst.Link("link_ns_1_01") +link_ns_1_01.connect( (comp_c1_0, "Nlink", "10000ps"), (comp_c1_1, "Slink", "10000ps") ) + +link_ns_0_10 = sst.Link("link_ns_1_10") +link_ns_0_10.connect( (comp_c1_0, "Slink", "10000ps"), (comp_c1_1, "Nlink", "10000ps") ) + +# East/West links +link_ew_0_01 = sst.Link("link_ew_0_01") +link_ew_0_01.connect( (comp_c0_0, "Elink", "10000ps"), (comp_c1_0, "Wlink", "10000ps") ) + +link_ew_0_10 = sst.Link("link_ew_0_10") +link_ew_0_10.connect( (comp_c0_0, "Wlink", "10000ps"), (comp_c1_0, "Elink", "10000ps") ) + +link_ew_1_01 = sst.Link("link_ew_1_01") +link_ew_1_01.connect( (comp_c0_1, "Elink", "10000ps"), (comp_c1_1, "Wlink", "10000ps") ) + +link_ew_1_10 = sst.Link("link_ew_1_10") +link_ew_1_10.connect( (comp_c0_1, "Wlink", "10000ps"), (comp_c1_1, "Elink", "10000ps") ) diff --git a/tests/testsuite_default_Component.py b/tests/testsuite_default_Component.py index 2631c47a1..dd978f6f3 100644 --- a/tests/testsuite_default_Component.py +++ b/tests/testsuite_default_Component.py @@ -51,21 +51,37 @@ def tearDown(self): ##### def test_Component(self): - self.component_test_template("component") + self.component_test_template("Component") + + def test_Component_time_overflow(self): + self.component_test_template("Component_time_overflow", 1) ##### - def component_test_template(self, testtype): + def component_test_template(self, testtype, exp_rc = 0): testsuitedir = self.get_testsuite_dir() outdir = test_output_get_run_dir() - sdlfile = "{0}/test_Component.py".format(testsuitedir) - reffile = "{0}/refFiles/test_Component.out".format(testsuitedir) - outfile = "{0}/test_Component.out".format(outdir) + sdlfile = "{0}/test_{1}.py".format(testsuitedir, testtype) + reffile = "{0}/refFiles/test_{1}.out".format(testsuitedir, testtype) + outfile = "{0}/test_{1}.out".format(outdir, testtype) + errfile = "{0}/test_{1}.err".format(outdir, testtype) + + self.run_sst(sdlfile, outfile, errfile, expected_rc = exp_rc) + + # Check the results if exp_rc isn't equal to 0, then we are + # expecting an error and we'll put in a LineFilter to filter + # out everything after the error message before doing the diff + filter1 = LineFilter() + filter2 = LineFilter() + if exp_rc != 0: + filter1 = IgnoreAllAfterFilter("FATAL:", True) + filter2 = RemoveRegexFromLineFilter(r"\[[0-9]+:[0-9]+\]") - self.run_sst(sdlfile, outfile) + testfile = outfile + if exp_rc != 0: + testfile = errfile - # Perform the test - cmp_result = testing_compare_sorted_diff(testtype, outfile, reffile) + cmp_result = testing_compare_filtered_diff(testtype, testfile, reffile, sort=True, filters=[filter1,filter2]) self.assertTrue(cmp_result, "Output/Compare file {0} does not match Reference File {1}".format(outfile, reffile))