Skip to content

Commit

Permalink
Unify signal handling infrastructure (#1108)
Browse files Browse the repository at this point in the history
* Unify signal handling infrastructure
Fix bugs in parallel signal handling
Add real-time checkpoint capability
Add real-time heartbeat capability
Fix bug when checkpointing a restarted run
  • Loading branch information
gvoskuilen authored Jul 29, 2024
1 parent a4dbc4a commit 95aed59
Show file tree
Hide file tree
Showing 43 changed files with 3,247 additions and 460 deletions.
3 changes: 3 additions & 0 deletions src/sst/core/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ nobase_dist_sst_HEADERS = \
profile/syncProfileTool.h \
profile/componentProfileTool.h \
rankInfo.h \
realtime.h \
realtimeAction.h \
sparseVectorMap.h \
sst_types.h \
sstpart.h \
Expand Down Expand Up @@ -207,6 +209,7 @@ sst_core_sources = \
uninitializedQueue.cc \
unitAlgebra.cc \
module.cc \
realtime.cc \
ssthandler.cc \
sstpart.cc \
timeVortex.cc \
Expand Down
22 changes: 12 additions & 10 deletions src/sst/core/cfgoutput/jsonConfigOutput.cc
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,20 @@ JSONConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
json::ordered_json outputJson;

// Put in the program options
outputJson["program_options"]["verbose"] = std::to_string(cfg->verbose());
outputJson["program_options"]["stop-at"] = cfg->stop_at();
outputJson["program_options"]["print-timing-info"] = cfg->print_timing() ? "true" : "false";
outputJson["program_options"]["verbose"] = std::to_string(cfg->verbose());
outputJson["program_options"]["stop-at"] = cfg->stop_at();
outputJson["program_options"]["print-timing-info"] = cfg->print_timing() ? "true" : "false";
// Ignore stopAfter for now
// outputJson["program_options"]["stopAfter"] = cfg->stopAfterSec();
outputJson["program_options"]["heartbeat-period"] = cfg->heartbeat_period();
outputJson["program_options"]["timebase"] = cfg->timeBase();
outputJson["program_options"]["partitioner"] = cfg->partitioner();
outputJson["program_options"]["timeVortex"] = cfg->timeVortex();
outputJson["program_options"]["interthread-links"] = cfg->interthread_links() ? "true" : "false";
outputJson["program_options"]["output-prefix-core"] = cfg->output_core_prefix();
outputJson["program_options"]["checkpoint-period"] = cfg->checkpoint_period();
outputJson["program_options"]["heartbeat-sim-period"] = cfg->heartbeat_sim_period();
outputJson["program_options"]["heartbeat-wall-period"] = std::to_string(cfg->heartbeat_wall_period());
outputJson["program_options"]["timebase"] = cfg->timeBase();
outputJson["program_options"]["partitioner"] = cfg->partitioner();
outputJson["program_options"]["timeVortex"] = cfg->timeVortex();
outputJson["program_options"]["interthread-links"] = cfg->interthread_links() ? "true" : "false";
outputJson["program_options"]["output-prefix-core"] = cfg->output_core_prefix();
outputJson["program_options"]["checkpoint-sim-period"] = cfg->checkpoint_sim_period();
outputJson["program_options"]["checkpoint-wall-period"] = std::to_string(cfg->checkpoint_wall_period());


// Put in the global param sets
Expand Down
11 changes: 9 additions & 2 deletions src/sst/core/cfgoutput/pythonConfigOutput.cc
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,10 @@ PythonConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
outputFile, "sst.setProgramOption(\"print-timing-info\", \"%s\")\n", cfg->print_timing() ? "true" : "false");
// Ignore stopAfter for now
// fprintf(outputFile, "sst.setProgramOption(\"stopAfter\", \"%" PRIu32 "\")\n", cfg->stopAfterSec);
fprintf(outputFile, "sst.setProgramOption(\"heartbeat-period\", \"%s\")\n", cfg->heartbeat_period().c_str());
fprintf(
outputFile, "sst.setProgramOption(\"heartbeat-sim-period\", \"%s\")\n", cfg->heartbeat_sim_period().c_str());
fprintf(
outputFile, "sst.setProgramOption(\"heartbeat-wall-period\", \"%" PRIu32 "\")\n", cfg->heartbeat_wall_period());
fprintf(outputFile, "sst.setProgramOption(\"timebase\", \"%s\")\n", cfg->timeBase().c_str());
fprintf(outputFile, "sst.setProgramOption(\"partitioner\", \"%s\")\n", cfg->partitioner().c_str());
fprintf(outputFile, "sst.setProgramOption(\"timeVortex\", \"%s\")\n", cfg->timeVortex().c_str());
Expand All @@ -238,7 +241,11 @@ PythonConfigGraphOutput::generate(const Config* cfg, ConfigGraph* graph)
cfg->interthread_links() ? "true" : "false");
fprintf(outputFile, "sst.setProgramOption(\"output-prefix-core\", \"%s\")\n", cfg->output_core_prefix().c_str());

fprintf(outputFile, "sst.setProgramOption(\"checkpoint-period\", \"%s\")\n", cfg->checkpoint_period().c_str());
fprintf(
outputFile, "sst.setProgramOption(\"checkpoint-sim-period\", \"%s\")\n", cfg->checkpoint_sim_period().c_str());
fprintf(
outputFile, "sst.setProgramOption(\"checkpoint-wall-period\", \"%" PRIu32 "\")\n",
cfg->checkpoint_wall_period());

// Output the global params
fprintf(outputFile, "# Define the global parameter sets:\n");
Expand Down
59 changes: 48 additions & 11 deletions src/sst/core/checkpointAction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,23 @@ REENABLE_WARNING

namespace SST {

// NOTE(review): this region is a rendered diff without +/- markers, so lines from the
// previous revision (int this_rank, rank, m_period, lastTime) appear next to their
// replacements (RankInfo this_rank, rank_, period_, last_cpu_time_). The comments below
// describe the replacement lines.
CheckpointAction::CheckpointAction(Config* UNUSED(cfg), int this_rank, Simulation_impl* sim, TimeConverter* period) :
CheckpointAction::CheckpointAction(
Config* UNUSED(cfg), RankInfo this_rank, Simulation_impl* sim, TimeConverter* period) :
Action(),
rank(this_rank),
m_period(period)
rank_(this_rank),
period_(period),
generate_(false)
{
sim->insertActivity(period->getFactor(), this);
if ( (0 == this_rank) ) { lastTime = sst_get_cpu_time(); }
// Defaults used when no simulation-time period is supplied.
next_sim_time_ = 0;
last_cpu_time_ = 0;

// A null period means no simulation-time checkpointing is scheduled here.
if ( period_ ) {
// Round the current cycle down to a multiple of the period factor, then add one
// period (assumes the division truncates, i.e. integer cycle counts -- TODO confirm).
next_sim_time_ =
(period_->getFactor() * (sim->getCurrentSimCycle() / period_->getFactor())) + period_->getFactor();
sim->insertActivity(next_sim_time_, this);
}

// Only rank 0 records a starting CPU timestamp; createCheckpoint() reports the elapsed
// CPU time on rank 0 only.
if ( (0 == this_rank.rank) ) { last_cpu_time_ = sst_get_cpu_time(); }
}

CheckpointAction::~CheckpointAction() {}
Expand All @@ -43,20 +53,47 @@ void
CheckpointAction::execute(void)
{
// Write a checkpoint now, then re-insert this action one period later.
Simulation_impl* sim = Simulation_impl::getSimulation();
const double now = sst_get_cpu_time();
createCheckpoint(sim);

// period_ is dereferenced without a null check; the constructor only inserts this
// action into the simulation when period_ is non-null.
next_sim_time_ += period_->getFactor();
sim->insertActivity(next_sim_time_, this);
}

if ( 0 == rank ) {
// Perform the checkpoint operation on the given simulation.
// On rank 0 a status line is printed first, giving the elapsed simulated time and the
// CPU time since the previously recorded timestamp; that timestamp is then refreshed.
// Every caller, regardless of rank, then invokes sim->checkpoint().
void
CheckpointAction::createCheckpoint(Simulation_impl* sim)
{

if ( 0 == rank_.rank ) {
const double now = sst_get_cpu_time();
sim->getSimulationOutput().output(
"# Simulation Checkpoint: Simulated Time %s (Real CPU time since last checkpoint %.5f seconds)\n",
sim->getElapsedSimTime().toStringBestSI().c_str(), (now - lastTime));
sim->getElapsedSimTime().toStringBestSI().c_str(), (now - last_cpu_time_));

// Remember this report's CPU time so the next report shows the delta.
lastTime = now;
last_cpu_time_ = now;
}

sim->checkpoint();
}

SimTime_t next = sim->getCurrentSimCycle() + m_period->getFactor();
sim->insertActivity(next, this);
void
CheckpointAction::check()
{
// TODO add logic for simulation-interval checkpoints in parallel
Simulation_impl* sim = Simulation_impl::getSimulation();
if ( generate_ ) { createCheckpoint(sim); }
generate_ = false;
}

void
CheckpointAction::setCheckpoint()
{
generate_ = true;
}

/**
   Report the simulation time stored for the next scheduled checkpoint.
   The constructor leaves this at 0 when no period was supplied.
 */
SimTime_t
CheckpointAction::getNextCheckpointSimTime()
{
    const SimTime_t scheduled = next_sim_time_;
    return scheduled;
}

} // namespace SST
34 changes: 24 additions & 10 deletions src/sst/core/checkpointAction.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "sst/core/config.h"
#include "sst/core/cputimer.h"
#include "sst/core/output.h"
#include "sst/core/rankInfo.h"
#include "sst/core/sst_types.h"

#include <set>
Expand All @@ -35,22 +36,35 @@ class CheckpointAction : public Action
/**
Create a new checkpoint object for the simulation core to initiate checkpoints
*/
CheckpointAction(Config* cfg, int this_rank, Simulation_impl* sim, TimeConverter* period);
CheckpointAction(Config* cfg, RankInfo this_rank, Simulation_impl* sim, TimeConverter* period);
~CheckpointAction();

NotSerializable(SST::CheckpointAction) // Going to have to fix this
/** Generate a checkpoint next time check() is called */
void setCheckpoint();

private :
/** Called by TimeVortex to trigger checkpoint on simulation clock interval - not used in parallel simulation */
void execute(void) override;

CheckpointAction()
{}
/** Called by SyncManager to check whether a checkpoint should be generated */
void check();

/** Return next checkpoint time */
SimTime_t getNextCheckpointSimTime();

NotSerializable(SST::CheckpointAction);

private:
CheckpointAction() {}
CheckpointAction(const CheckpointAction&);

void operator=(CheckpointAction const&);
void execute(void) override;
int rank;
TimeConverter* m_period;
double lastTime;
void operator=(CheckpointAction const&);
void createCheckpoint(Simulation_impl* sim); // The actual checkpoint operation

RankInfo rank_; // RankInfo for this thread/rank
TimeConverter* period_; // Simulation time interval for scheduling, or nullptr if not set
double last_cpu_time_; // CPU time (sst_get_cpu_time()) recorded at the last checkpoint report; only updated on rank 0
bool generate_; // Whether a checkpoint should be done next time check() is called
SimTime_t next_sim_time_; // Next simulation time a checkpoint should trigger at, or 0 if not applicable
};

} // namespace SST
Expand Down
Loading

0 comments on commit 95aed59

Please sign in to comment.