Skip to content

Commit

Permalink
Time fault (#1029)
Browse files Browse the repository at this point in the history
* Made links build serially across threads on Mac due to slow deletes if they are build in parallel.

* Added check to main run loop to detect time faults, i.e. time moving backward or rolling over the 64-bit core time variable.
  • Loading branch information
feldergast authored Jan 10, 2024
1 parent eea5b83 commit 5096009
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 18 deletions.
24 changes: 22 additions & 2 deletions src/sst/core/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,26 @@ class ConfigHelper
return 0;
}

static std::string getTimebaseExtHelp()
{
std::string msg = "Timebase:\n\n";
msg.append("Time in SST core is represented by a 64-bit unsigned integer. By default, each count of that "
"value represents 1ps of time. The timebase option allows you to set that atomic core timebase to "
"a different value.\n ");
msg.append("There are two things to balance when determing a timebase to use:\n\n");
msg.append("1) The shortest time period or fastest clock frequency you want to represent:\n");
msg.append(" It is recommended that the core timebase be set to ~1000x smaller than the shortest time period "
"(fastest frequency) in your simulation. For the default 1ps timebase, clocks in the 1-10GHz range "
"are easily represented. If you want to have higher frequency clocks, you can set the timebase to "
"a smaller value, at the cost of decreasing the amount of time you can simulate.\n\n");
msg.append("2) How much simulated time you need to support:\n");
msg.append(" The default timebase of 1ps will support ~215.5 days (5124 hours) of simulated time. If you are "
"using SST to simulate longer term phenomena, you will need to make the core timebase longer. A "
"consequence of increaing the timebase is that the minimum time period that can be represented will "
"increase (conversely, the maximum frequency that can be represented will increase).");
return msg;
}

static std::string getProfilingExtHelp()
{
std::string msg = "Profiling Points [EXPERIMENTAL]:\n\n";
Expand Down Expand Up @@ -696,9 +716,9 @@ Config::insertOptions()

/* Advanced Features */
DEF_SECTION_HEADING("Advanced Options");
DEF_ARG(
DEF_ARG_EH(
"timebase", 0, "TIMEBASE", "Set the base time step of the simulation (default: 1ps)",
std::bind(&ConfigHelper::setTimebase, this, _1), true);
std::bind(&ConfigHelper::setTimebase, this, _1), std::bind(&ConfigHelper::getTimebaseExtHelp), true);
#ifdef SST_CONFIG_HAVE_MPI
DEF_ARG_OPTVAL(
"parallel-load", 0, "MODE",
Expand Down
16 changes: 15 additions & 1 deletion src/sst/core/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ do_statengine_initialization(ConfigGraph* graph, SST::Simulation_impl* sim, cons
static void
do_link_preparation(ConfigGraph* graph, SST::Simulation_impl* sim, const RankInfo& myRank, SimTime_t min_part)
{

sim->prepareLinks(*graph, myRank, min_part);
}

Expand Down Expand Up @@ -362,7 +361,22 @@ start_simulation(uint32_t tid, SimThreadInfo_t& info, Core::ThreadSafe::Barrier&

// Prepare the links, which creates the ComponentInfo objects and
// Link and puts the links in the LinkMap for each ComponentInfo.
#ifdef SST_COMPILE_MACOSX
// Some versions of clang on mac have an issue with deleting links
// that were created interleaved between threads, so force
// serialization of threads during link creation. This has been
// confirmed on both Intel and ARM for Xcode 14 and 15. We should
// revisit this in the future. This is easy to see when running
// sst-benchmark with 1024 components and multiple threads. At
// time of adding this code, the difference in delete times was
// 3-5 minutes versuses less than a second.
for ( uint32_t i = 0; i < info.world_size.thread; ++i ) {
if ( i == info.myRank.thread ) { do_link_preparation(info.graph, sim, info.myRank, info.min_part); }
barrier.wait();
}
#else
do_link_preparation(info.graph, sim, info.myRank, info.min_part);
#endif
barrier.wait();

// Create all the simulation components
Expand Down
29 changes: 26 additions & 3 deletions src/sst/core/simulation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -661,10 +661,22 @@ Simulation_impl::run()

run_phase_start_time = sst_get_cpu_time();

while ( LIKELY(!endSim) ) {
// Will check to make sure time doesn't "go backwards". This will
// also catch the case of rollover (exceeding the 64-bit value
// space of SimTime_t). To avoid yet another branch in the main
// run loop, we will check for a time fault, but will execute the
// next event and only exit the run loop on the next iteration.
// If there was a fault, a message will be printed.
bool time_fault = false;
while ( LIKELY(!endSim && !time_fault) ) {
current_activity = timeVortex->pop();
currentSimCycle = current_activity->getDeliveryTime();
currentPriority = current_activity->getPriority();

// Check for time fault
SimTime_t event_time = current_activity->getDeliveryTime();
time_fault = event_time < currentSimCycle;

currentSimCycle = event_time;
currentPriority = current_activity->getPriority();
current_activity->execute();

#if SST_PERIODIC_PRINT
Expand Down Expand Up @@ -711,6 +723,17 @@ Simulation_impl::run()
}
#endif
}

// Check to see if there was a time fault
if ( time_fault ) {
sim_output.fatal(
CALL_INFO, 1,
"ERROR: SST Core detected a time fault (an event had an earlier time than the previous event). The most "
"likely cause of this is that the 64-bit core time had an overflow condition. This is typically caused by "
"having low frequency events with too low of a timebase. See the extended help for --timebase option (sst "
"--help timebase)\n");
}

/* We shouldn't need to do this, but to be safe... */

runBarrier.wait(); // TODO<- Is this needed?
Expand Down
5 changes: 4 additions & 1 deletion src/sst/core/testElements/coreTest_Component.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ coreTestComponent::coreTestComponent(ComponentId_t id, Params& params) : coreTes

commSize = params.find<int64_t>("commSize", 16);


std::string clockFrequency = params.find<std::string>("clockFrequency", "1GHz");

// init randomness
srand(1);
neighbor = rng->generateNextInt32() % 4;
Expand All @@ -62,7 +65,7 @@ coreTestComponent::coreTestComponent(ComponentId_t id, Params& params) : coreTes
assert(W);

// set our clock
registerClock("1GHz", new Clock::Handler<coreTestComponent>(this, &coreTestComponent::clockTic));
registerClock(clockFrequency, new Clock::Handler<coreTestComponent>(this, &coreTestComponent::clockTic));
}

coreTestComponent::~coreTestComponent()
Expand Down
5 changes: 3 additions & 2 deletions src/sst/core/testElements/coreTest_Component.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ class coreTestComponentBase : public SST::Component
SST_ELI_REGISTER_COMPONENT_BASE(SST::CoreTestComponent::coreTestComponentBase)

SST_ELI_DOCUMENT_PARAMS(
{ "workPerCycle", "Count of busy work to do during a clock tick.", NULL}
{ "workPerCycle", "Count of busy work to do during a clock tick.", NULL},
{ "clockFrequency", "Frequency of the clock", "1GHz"}
)

SST_ELI_DOCUMENT_STATISTICS(
Expand All @@ -55,7 +56,7 @@ class coreTestComponentBase2 : public coreTestComponentBase
SST::CoreTestComponent::coreTestComponentBase2, SST::CoreTestComponent::coreTestComponentBase)

SST_ELI_DOCUMENT_PARAMS(
{ "commFreq", "Approximate frequency of sending an event during a clock tick.", NULL},
{ "commFreq", "There is a 1/commFreq chance each clock cycle of sending an event to a neighbor", NULL}
)

SST_ELI_DOCUMENT_STATISTICS(
Expand Down
5 changes: 4 additions & 1 deletion src/sst/core/testingframework/sst_unittest_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,8 +1070,9 @@ def filter(self, line):
class IgnoreAllAfterFilter(LineFilter):
""" Filters out any line that starts with a specified string and all lines after it
"""
def __init__(self, prefix):
def __init__(self, prefix, keep_line = False):
self._prefix = prefix;
self._keep_line = keep_line
self._found = False

def reset(self):
Expand All @@ -1091,6 +1092,8 @@ def filter(self, line):
if self._found: return None
if ( line.startswith(self._prefix) ):
self._found = True
if self._keep_line:
return line
return None
return line

Expand Down
2 changes: 2 additions & 0 deletions tests/Makefile.inc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ EXTRA_DIST += \
tests/testsuite_default_MemPoolTest.py \
tests/testsuite_testengine_testing.py \
tests/test_Component.py \
tests/test_Component_time_overflow.py \
tests/test_ClockerComponent.py \
tests/test_DistribComponent_discrete.py \
tests/test_DistribComponent_expon.py \
Expand Down Expand Up @@ -44,6 +45,7 @@ EXTRA_DIST += \
tests/test_PythonUnitAlgebra.py \
tests/test_PerfComponent.py \
tests/refFiles/test_Component.out \
tests/refFiles/test_Component_time_overflow.out \
tests/refFiles/test_PerfComponent.out \
tests/refFiles/test_DistribComponent_discrete.out \
tests/refFiles/test_DistribComponent_expon.out \
Expand Down
15 changes: 15 additions & 0 deletions tests/refFiles/test_Component_time_overflow.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FATAL: SST Core: ERROR: SST Core detected a time fault (an event had an earlier time than the previous event). The most likely cause of this is that the 64-bit core time had an overflow condition. This is typically caused by having low frequency events with too low of a timebase. See the extended help for --timebase option (sst --help timebase)
SST Fatal Backtrace Information:
0 : 0 sstsim.x 0x000000010024ea3f _ZNK3SST6Output5fatalEjPKcS2_iS2_z + 879
1 : 1 sstsim.x 0x0000000100269b82 _ZN3SST15Simulation_impl3runEv + 866
2 : 2 sstsim.x 0x00000001001f1b20 _ZL16start_simulationjR15SimThreadInfo_tRN3SST4Core10ThreadSafe7BarrierE + 3200
3 : 3 sstsim.x 0x00000001001ed044 main + 7492
4 : 4 dyld 0x000000010916852e start + 462
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 0 in communicator MPI_COMM_WORLD
with errorcode 5.

NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
--------------------------------------------------------------------------
65 changes: 65 additions & 0 deletions tests/test_Component_time_overflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import sst

# Define SST core options
#sst.setProgramOption("stop-at", "25us")

# Define the simulation components
comp_c0_0 = sst.Component("c0_0", "coreTestElement.coreTestComponent")
comp_c0_0.addParams({
"workPerCycle" : "100",
"commSize" : "100",
"commFreq" : "1000",
"clockFrequency" : "60s"
})

comp_c0_1 = sst.Component("c0_1", "coreTestElement.coreTestComponent")
comp_c0_1.addParams({
"workPerCycle" : "100",
"commSize" : "100",
"commFreq" : "1000",
"clockFrequency" : "60s"
})

comp_c1_0 = sst.Component("c1_0", "coreTestElement.coreTestComponent")
comp_c1_0.addParams({
"workPerCycle" : "100",
"commSize" : "100",
"commFreq" : "1000",
"clockFrequency" : "60s"
})

comp_c1_1 = sst.Component("c1_1", "coreTestElement.coreTestComponent")
comp_c1_1.addParams({
"workPerCycle" : "100",
"commSize" : "100",
"commFreq" : "1000",
"clockFrequency" : "60s"
})

# Define the simulation links

# North/South links
link_ns_0_01 = sst.Link("link_ns_0_01")
link_ns_0_01.connect( (comp_c0_0, "Nlink", "10000ps"), (comp_c0_1, "Slink", "10000ps") )

link_ns_0_10 = sst.Link("link_ns_0_10")
link_ns_0_10.connect( (comp_c0_0, "Slink", "10000ps"), (comp_c0_1, "Nlink", "10000ps") )

link_ns_1_01 = sst.Link("link_ns_1_01")
link_ns_1_01.connect( (comp_c1_0, "Nlink", "10000ps"), (comp_c1_1, "Slink", "10000ps") )

link_ns_0_10 = sst.Link("link_ns_1_10")
link_ns_0_10.connect( (comp_c1_0, "Slink", "10000ps"), (comp_c1_1, "Nlink", "10000ps") )

# East/West links
link_ew_0_01 = sst.Link("link_ew_0_01")
link_ew_0_01.connect( (comp_c0_0, "Elink", "10000ps"), (comp_c1_0, "Wlink", "10000ps") )

link_ew_0_10 = sst.Link("link_ew_0_10")
link_ew_0_10.connect( (comp_c0_0, "Wlink", "10000ps"), (comp_c1_0, "Elink", "10000ps") )

link_ew_1_01 = sst.Link("link_ew_1_01")
link_ew_1_01.connect( (comp_c0_1, "Elink", "10000ps"), (comp_c1_1, "Wlink", "10000ps") )

link_ew_1_10 = sst.Link("link_ew_1_10")
link_ew_1_10.connect( (comp_c0_1, "Wlink", "10000ps"), (comp_c1_1, "Elink", "10000ps") )
32 changes: 24 additions & 8 deletions tests/testsuite_default_Component.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,37 @@ def tearDown(self):
#####

def test_Component(self):
self.component_test_template("component")
self.component_test_template("Component")

def test_Component_time_overflow(self):
self.component_test_template("Component_time_overflow", 1)

#####

def component_test_template(self, testtype):
def component_test_template(self, testtype, exp_rc = 0):
testsuitedir = self.get_testsuite_dir()
outdir = test_output_get_run_dir()

sdlfile = "{0}/test_Component.py".format(testsuitedir)
reffile = "{0}/refFiles/test_Component.out".format(testsuitedir)
outfile = "{0}/test_Component.out".format(outdir)
sdlfile = "{0}/test_{1}.py".format(testsuitedir, testtype)
reffile = "{0}/refFiles/test_{1}.out".format(testsuitedir, testtype)
outfile = "{0}/test_{1}.out".format(outdir, testtype)
errfile = "{0}/test_{1}.err".format(outdir, testtype)

self.run_sst(sdlfile, outfile, errfile, expected_rc = exp_rc)

# Check the results if exp_rc isn't equal to 0, then we are
# expecting an error and we'll put in a LineFilter to filter
# out everything after the error message before doing the diff
filter1 = LineFilter()
filter2 = LineFilter()
if exp_rc != 0:
filter1 = IgnoreAllAfterFilter("FATAL:", True)
filter2 = RemoveRegexFromLineFilter(r"\[[0-9]+:[0-9]+\]")

self.run_sst(sdlfile, outfile)
testfile = outfile
if exp_rc != 0:
testfile = errfile

# Perform the test
cmp_result = testing_compare_sorted_diff(testtype, outfile, reffile)
cmp_result = testing_compare_filtered_diff(testtype, testfile, reffile, sort=True, filters=[filter1,filter2])
self.assertTrue(cmp_result, "Output/Compare file {0} does not match Reference File {1}".format(outfile, reffile))

0 comments on commit 5096009

Please sign in to comment.