From ef0e5250be40eae817deb2cc82e2e27ab9467ef4 Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Sat, 6 Jun 2020 14:56:13 -0700 Subject: [PATCH 1/3] add output flag to complete, drop start/complete checkpoint --- src/scr.c | 76 ++++++++------------------------------------- src/scr.h | 16 ++-------- src/scr_interpose.c | 5 +-- src/scrf.c | 27 +++------------- 4 files changed, 24 insertions(+), 100 deletions(-) diff --git a/src/scr.c b/src/scr.c index 1323bd83..80d80e0e 100644 --- a/src/scr.c +++ b/src/scr.c @@ -1337,7 +1337,7 @@ static int scr_assign_ownership(scr_filemap* map, int bypass) } /* end phase for current output dataset */ -static int scr_complete_output(int valid) +static int scr_complete_output(int valid, int* allvalid) { /* bail out if there is no active call to Start_output */ if (! scr_in_output) { @@ -1596,6 +1596,9 @@ static int scr_complete_output(int valid) } } + /* set flag based on return value for now */ + *allvalid = (rc == SCR_SUCCESS); + return rc; } @@ -2407,32 +2410,6 @@ int SCR_Start_output(const char* name, int flags) return scr_start_output(name, flags); } -/* informs SCR that a fresh checkpoint set is about to start */ -int SCR_Start_checkpoint() -{ - /* manage state transition */ - if (scr_state != SCR_STATE_IDLE) { - scr_state_transition_error(scr_state, "SCR_Start_checkpoint()", __FILE__, __LINE__); - } - scr_state = SCR_STATE_CHECKPOINT; - - /* if not enabled, bail with an error */ - if (! scr_enabled) { - return SCR_FAILURE; - } - - /* bail out if not initialized -- will get bad results */ - if (! scr_initialized) { - scr_abort(-1, "SCR has not been initialized @ %s:%d", - __FILE__, __LINE__ - ); - return SCR_FAILURE; - } - - /* delegate the rest to start_output */ - return scr_start_output(NULL, SCR_FLAG_CHECKPOINT); -} - /* given a filename, return the full path to the file which the user should write to */ int SCR_Route_file(const char* file, char* newfile) { @@ -2656,7 +2633,7 @@ int SCR_Route_file(const char* file, char* newfile) } /* inform library that the current dataset is complete */ -int SCR_Complete_output(int valid) +int SCR_Complete_output(int valid, int* allvalid) { /* manage state transition */ if (scr_state != SCR_STATE_OUTPUT) { @@ -2679,34 +2656,7 @@ int SCR_Complete_output(int valid) return SCR_FAILURE; } - return scr_complete_output(valid); -} - -/* completes the checkpoint set and marks it as valid or not */ -int SCR_Complete_checkpoint(int valid) -{ - /* manage state transition */ - if (scr_state != SCR_STATE_CHECKPOINT) { - scr_abort(-1, "Must call SCR_Start_checkpoint() before SCR_Complete_checkpoint() @ %s:%d", - __FILE__, __LINE__ - ); - } - scr_state = SCR_STATE_IDLE; - - /* if not enabled, bail with an error */ - if (! scr_enabled) { - return SCR_FAILURE; - } - - /* bail out if not initialized -- will get bad results */ - if (! scr_initialized) { - scr_abort(-1, "SCR has not been initialized @ %s:%d", - __FILE__, __LINE__ - ); - return SCR_FAILURE; - } - - return scr_complete_output(valid); + return scr_complete_output(valid, allvalid); } /* determine whether SCR has a restart available to read, @@ -2813,7 +2763,7 @@ int SCR_Start_restart(char* name) } /* inform library that the current restart is complete */ -int SCR_Complete_restart(int valid) +int SCR_Complete_restart(int valid, int* allvalid) { /* manage state transition */ if (scr_state != SCR_STATE_RESTART) { @@ -2839,11 +2789,12 @@ int SCR_Complete_restart(int valid) /* turn off our restart flag */ scr_have_restart = 0; - /* since we have no output flag to return to user whether all procs - * passed in valid=1, we'll overload the return code for that purpose, - * this should eventually be changed to use an output flag instead */ + /* assume we succeeded */ int rc = SCR_SUCCESS; + /* assume restart was valid */ + *allvalid = 1; + /* check that all procs read valid data */ if (! scr_alltrue(valid, scr_comm_world)) { /* if some process fails, attempt to restart from @@ -2852,9 +2803,8 @@ int SCR_Complete_restart(int valid) * we should also record this current checkpoint as failed in the * index file so that we don't fetch it again*/ - /* use the return code to indicate that some process failed to - * read its checkpoint file */ - rc = SCR_FAILURE; + /* indicate that some process failed to read its checkpoint file */ + *allvalid = 0; /* mark current checkpoint as bad in our index file so that * we don't attempt to fetch it again */ diff --git a/src/scr.h b/src/scr.h index abab93d1..6280250d 100644 --- a/src/scr.h +++ b/src/scr.h @@ -63,30 +63,20 @@ int SCR_Have_restart(int* flag, char* name); int SCR_Start_restart(char* name); /* inform library that the current restart is complete */ -int SCR_Complete_restart(int valid); +int SCR_Complete_restart(int valid, int* allvalid); /***************** - * Checkpoint routines (backwards compatibility) + * Output routines ****************/ /* determine whether a checkpoint should be taken at the current time */ int SCR_Need_checkpoint(int* flag); -/* inform library that a new checkpoint is starting */ -int SCR_Start_checkpoint(void); - -/* inform library that the current checkpoint is complete */ -int SCR_Complete_checkpoint(int valid); - -/***************** - * Output routines - ****************/ - /* inform library that a new output dataset is starting */ int SCR_Start_output(const char* name, int flags); /* inform library that the current dataset is complete */ -int SCR_Complete_output(int valid); +int SCR_Complete_output(int valid, int* allvalid); /***************** * Environment and configuration routines diff --git a/src/scr_interpose.c b/src/scr_interpose.c index d71e95b5..cf6f074b 100644 --- a/src/scr_interpose.c +++ b/src/scr_interpose.c @@ -163,7 +163,7 @@ static int scri_start_checkpoint() /* start the checkpoint */ scri_interpose_enabled = 0; - SCR_Start_checkpoint(); + SCR_Start_output(NULL, SCR_FLAG_CHECKPOINT); scri_interpose_enabled = 1; /* mark us inside a checkpoint */ @@ -196,7 +196,8 @@ static int scri_complete_checkpoint(int index) if (!still_open) { /* disable the interposer since SCR_Complete_checkpoint calls open/close */ scri_interpose_enabled = 0; - SCR_Complete_checkpoint(1); + int allvalid; + SCR_Complete_output(1, &allvalid); scri_interpose_enabled = 1; /* mark us out of the checkpoint */ diff --git a/src/scrf.c b/src/scrf.c index 1049f554..87e24804 100644 --- a/src/scrf.c +++ b/src/scrf.c @@ -183,15 +183,15 @@ FORTRAN_API void FORT_CALL scr_start_restart_(char* name FORT_MIXED_LEN(name_len return; } -FORTRAN_API void FORT_CALL scr_complete_restart_(int* valid, int* ierror) +FORTRAN_API void FORT_CALL scr_complete_restart_(int* valid, int* allvalid, int* ierror) { int valid_tmp = *valid; - *ierror = SCR_Complete_restart(valid_tmp); + *ierror = SCR_Complete_restart(valid_tmp, allvalid); return; } /*================================================ - * Checkpoint functions + * Output functions *================================================*/ FORTRAN_API void FORT_CALL scr_need_checkpoint_(int* flag, int* ierror) @@ -200,23 +200,6 @@ FORTRAN_API void FORT_CALL scr_need_checkpoint_(int* flag, int* ierror) return; } -FORTRAN_API void FORT_CALL scr_start_checkpoint_(int* ierror) -{ - *ierror = SCR_Start_checkpoint(); - return; -} - -FORTRAN_API void FORT_CALL scr_complete_checkpoint_(int* valid, int* ierror) -{ - int valid_tmp = *valid; - *ierror = SCR_Complete_checkpoint(valid_tmp); - return; -} - -/*================================================ - * Output functions - *================================================*/ - FORTRAN_API void FORT_CALL scr_start_output_(char* name FORT_MIXED_LEN(name_len), int* flags, int* ierror FORT_END_LEN(name_len)) { /* convert name from a Fortran string to C string */ @@ -232,10 +215,10 @@ FORTRAN_API void FORT_CALL scr_start_output_(char* name FORT_MIXED_LEN(name_len) return; } -FORTRAN_API void FORT_CALL scr_complete_output_(int* valid, int* ierror) +FORTRAN_API void FORT_CALL scr_complete_output_(int* valid, int* allvalid, int* ierror) { int valid_tmp = *valid; - *ierror = SCR_Complete_output(valid_tmp); + *ierror = SCR_Complete_output(valid_tmp, allvalid); return; } From b86a764a326234fa93797d7dcdb73be4b553e7cd Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Sat, 6 Jun 2020 14:56:59 -0700 Subject: [PATCH 2/3] convert examples to output interface --- examples/test_api.c | 13 +++++------- examples/test_api_file.c | 9 +++++---- examples/test_api_multiple.c | 28 +++++++++++--------------- examples/test_api_multiple_file.c | 33 +++++++++++++++---------------- examples/test_ckpt.F | 24 ++-------------------- examples/test_ckpt.cpp | 5 +++-- 6 files changed, 43 insertions(+), 69 deletions(-) diff --git a/examples/test_api.c b/examples/test_api.c index 6b9bc62b..bb13a4c1 100644 --- a/examples/test_api.c +++ b/examples/test_api.c @@ -231,7 +231,7 @@ double getbw(char* name, char* buf, size_t size, int times) /* using scr, start our output */ scr_retval = SCR_Start_output(label, flags); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Start_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Start_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } @@ -301,7 +301,8 @@ double getbw(char* name, char* buf, size_t size, int times) /* mark this checkpoint as complete */ if (use_scr) { - scr_retval = SCR_Complete_output(valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { printf("%d: failed calling SCR_Complete_output: %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ @@ -510,12 +511,8 @@ int main (int argc, char* argv[]) } /* indicate to library that we're done with restart, tell it whether we read our data ok */ - scr_retval = SCR_Complete_restart(found_checkpoint); - if (scr_retval == SCR_SUCCESS) { - /* all procs succeeded in reading their checkpoint file, - * we've successfully restarted */ - restarted = 1; - } else { + scr_retval = SCR_Complete_restart(found_checkpoint, &restarted); + if (scr_retval != SCR_SUCCESS) { printf("%d: failed calling SCR_Complete_restart: %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); diff --git a/examples/test_api_file.c b/examples/test_api_file.c index ac2e9a5a..f5f43720 100644 --- a/examples/test_api_file.c +++ b/examples/test_api_file.c @@ -54,9 +54,9 @@ double getbw(char* name, char* buf, size_t size, int times) */ /* instruct SCR we are starting the next checkpoint */ - scr_retval = SCR_Start_checkpoint(); + scr_retval = SCR_Start_output(NULL, SCR_FLAG_CHECKPOINT); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Start_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Start_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } @@ -107,9 +107,10 @@ double getbw(char* name, char* buf, size_t size, int times) */ /* mark this checkpoint as complete */ - scr_retval = SCR_Complete_checkpoint(valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Complete_checkpoint: %d: @%s:%d\n", + printf("%d: failed calling SCR_Complete_output: %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } diff --git a/examples/test_api_multiple.c b/examples/test_api_multiple.c index ec2ce25d..46903940 100644 --- a/examples/test_api_multiple.c +++ b/examples/test_api_multiple.c @@ -131,11 +131,10 @@ int main (int argc, char* argv[]) } // done reading our checkpoint - SCR_Complete_restart(valid); + int all_valid = 0; + SCR_Complete_restart(valid, &all_valid); // check that everyone found their checkpoint files ok - int all_valid = 0; - MPI_Allreduce(&valid, &all_valid, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD); if (!all_valid && rank == 0) { printf("At least one rank (perhaps all) did not find its checkpoint\n"); } @@ -183,7 +182,7 @@ int main (int argc, char* argv[]) int t; for(t=0; t < 1; t++) { int rc; - int all_valid = 1; + int valid = 1; // define a name for this checkpoint sprintf(ckptname, "timestep.%d", timestep); @@ -198,8 +197,6 @@ int main (int argc, char* argv[]) // write out each of our checkpoint files for (i=0; i < num_files; i++) { - int valid = 0; - // define path to checkpoint file char origpath[1024]; sprintf(origpath, "%s/%s", ckptname, files[i]); @@ -216,8 +213,6 @@ int main (int argc, char* argv[]) // open file and write checkpoint int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); if (fd_me > 0) { - valid = 1; - // write the checkpoint rc = write_checkpoint(fd_me, timestep, bufs[i], filesizes[i]); if (rc < 0) { valid = 0; } @@ -228,12 +223,14 @@ int main (int argc, char* argv[]) // make sure the close is without error rc = close(fd_me); if (rc < 0) { valid = 0; } + } else { + valid = 0; } - if (!valid) { all_valid = 0; } } // complete the checkpoint - scr_retval = SCR_Complete_output(all_valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { printf("%d: failed calling SCR_Complete_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ @@ -250,7 +247,7 @@ int main (int argc, char* argv[]) double time_start = MPI_Wtime(); for(t=0; t < times; t++) { int rc; - int all_valid = 1; + int valid = 1; // define a name for this checkpoint sprintf(ckptname, "timestep.%d", timestep); @@ -265,8 +262,6 @@ int main (int argc, char* argv[]) // write out each of our checkpoint files for (i=0; i < num_files; i++) { - int valid = 0; - // define path to checkpoint file char origpath[1024]; sprintf(origpath, "%s/%s", ckptname, files[i]); @@ -284,7 +279,6 @@ int main (int argc, char* argv[]) int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); if (fd_me > 0) { count++; - valid = 1; // write the checkpoint rc = write_checkpoint(fd_me, timestep, bufs[i], filesizes[i]); @@ -296,12 +290,14 @@ int main (int argc, char* argv[]) // make sure the close is without error rc = close(fd_me); if (rc < 0) { valid = 0; } + } else { + valid = 0; } - if (!valid) { all_valid = 0; } } // complete the checkpoint - scr_retval = SCR_Complete_output(all_valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { printf("%d: failed calling SCR_Complete_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ diff --git a/examples/test_api_multiple_file.c b/examples/test_api_multiple_file.c index 42679f7d..531dc79f 100644 --- a/examples/test_api_multiple_file.c +++ b/examples/test_api_multiple_file.c @@ -170,15 +170,14 @@ int main (int argc, char* argv[]) int t; for(t=0; t < 1; t++) { int rc; - int all_valid = 1; - scr_retval = SCR_Start_checkpoint(); + int valid = 1; + scr_retval = SCR_Start_output(NULL, SCR_FLAG_CHECKPOINT); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Start_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Start_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } for (i=0; i < num_files; i++) { - int valid = 0; char file[2094]; scr_retval = SCR_Route_file(files[i], file); if (scr_retval != SCR_SUCCESS) { @@ -188,8 +187,6 @@ int main (int argc, char* argv[]) } int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); if (fd_me > 0) { - valid = 1; - // write the checkpoint rc = write_checkpoint(fd_me, timestep, bufs[i], filesizes[i]); if (rc < 0) { valid = 0; } @@ -200,12 +197,14 @@ int main (int argc, char* argv[]) // make sure the close is without error rc = close(fd_me); if (rc < 0) { valid = 0; } + } else { + valid = 0; } - if (!valid) { all_valid = 0; } } - scr_retval = SCR_Complete_checkpoint(all_valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Complete_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Complete_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } @@ -220,15 +219,14 @@ int main (int argc, char* argv[]) double time_start = MPI_Wtime(); for(t=0; t < times; t++) { int rc; - int all_valid = 1; - scr_retval = SCR_Start_checkpoint(); + int valid = 1; + scr_retval = SCR_Start_output(NULL, SCR_FLAG_CHECKPOINT); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Start_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Start_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } for (i=0; i < num_files; i++) { - int valid = 0; char file[2094]; scr_retval = SCR_Route_file(files[i], file); if (scr_retval != SCR_SUCCESS) { @@ -239,7 +237,6 @@ int main (int argc, char* argv[]) int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); if (fd_me > 0) { count++; - valid = 1; // write the checkpoint rc = write_checkpoint(fd_me, timestep, bufs[i], filesizes[i]); @@ -251,12 +248,14 @@ int main (int argc, char* argv[]) // make sure the close is without error rc = close(fd_me); if (rc < 0) { valid = 0; } + } else { + valid = 0; } - if (!valid) { all_valid = 0; } } - scr_retval = SCR_Complete_checkpoint(all_valid); + int allvalid; + scr_retval = SCR_Complete_output(valid, &allvalid); if (scr_retval != SCR_SUCCESS) { - printf("%d: failed calling SCR_Complete_checkpoint(): %d: @%s:%d\n", + printf("%d: failed calling SCR_Complete_output(): %d: @%s:%d\n", rank, scr_retval, __FILE__, __LINE__ ); } diff --git a/examples/test_ckpt.F b/examples/test_ckpt.F index 7f0aa272..3fd66112 100644 --- a/examples/test_ckpt.F +++ b/examples/test_ckpt.F @@ -55,7 +55,7 @@ program test_ckpt_F read(readunit,iostat=ios) R1 close(readunit) - call SCR_COMPLETE_RESTART(valid, ierr) + call SCR_COMPLETE_RESTART(valid, flag, ierr) endif if (mynod == 0) then @@ -72,26 +72,6 @@ program test_ckpt_F forall(i=1:ni,j=1:nj,k=1:nk) W1(i,j,k) = + nodeoff*mynod+i+ni*(j-1+nj*(k-1)) -! test checkpoint interface - call SCR_START_CHECKPOINT(ierr) - - write(file_suffix, '(i5.5)') loop - ckptname = "ckpt_" // trim(file_suffix) - - writeunit = mynod - write(file_suffix, '(i5.5)') writeunit - fname = trim(ckptname) // "/" // - + trim(basefname) // trim(file_suffix) // ".ckpt" - call SCR_ROUTE_FILE(fname, fname_scr, ierr) - - valid = 1 - open(unit=writeunit,file=fname_scr,form='unformatted', - + action='write') - write(writeunit,iostat=ios) W1 - close(writeunit) - - call SCR_COMPLETE_CHECKPOINT(valid, ierr) - ! test output interface write(file_suffix, '(i5.5)') loop ckptname = "output_" // trim(file_suffix) @@ -110,7 +90,7 @@ program test_ckpt_F write(writeunit,iostat=ios) W1 close(writeunit) - call SCR_COMPLETE_OUTPUT(valid, ierr) + call SCR_COMPLETE_OUTPUT(valid, flag, ierr) call MPI_BARRIER(MPI_COMM_WORLD, ierr) diff --git a/examples/test_ckpt.cpp b/examples/test_ckpt.cpp index e216b414..8bc465a0 100644 --- a/examples/test_ckpt.cpp +++ b/examples/test_ckpt.cpp @@ -34,7 +34,7 @@ int checkpoint(int size_mb) MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* Inform SCR that we are starting a new checkpoint */ - SCR_Start_checkpoint(); + SCR_Start_output(NULL, SCR_FLAG_CHECKPOINT); /* Build the filename for our checkpoint file */ sprintf(tmp, "rank_%d", rank); @@ -50,7 +50,8 @@ int checkpoint(int size_mb) cout << "Out: " << file << "\n"; /* Tell SCR whether this process wrote its checkpoint files successfully */ - SCR_Complete_checkpoint(1); + int allvalid; + SCR_Complete_output(1, &allvalid); return 0; } From 5259f975129ff9d07ad08dbd315a630c32b0cd2e Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Sat, 6 Jun 2020 15:22:40 -0700 Subject: [PATCH 3/3] show use of valid param in documentation --- doc/rst/users/quick.rst | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/rst/users/quick.rst b/doc/rst/users/quick.rst index cbaef6cd..791e9315 100644 --- a/doc/rst/users/quick.rst +++ b/doc/rst/users/quick.rst @@ -200,14 +200,28 @@ other programs in the examples directory. SCR_Route_file(file, scr_file); /* Use the new file name to perform your checkpoint I/O */ + int valid = 1; FILE* fs = fopen(scr_file, "w"); if (fs != NULL) { - fwrite(state, ..., fs); - fclose(fs); + size_t nwritten = fwrite(state, size, count, fs); + if (nwritten < count) { + /* write failed, tell SCR this process failed */ + valid = 0; + } + + int close_rc = fclose(fs); + if (close_rc != 0) { + /* failed to close file, tell SCR this process failed */ + valid = 0; + } + } else { + /* failed to open file, tell SCR this process failed */ + valid = 0; } /* Tell SCR that you are done with your checkpoint phase */ - SCR_Complete_output(1); + int allvalid; + SCR_Complete_output(valid, &allvalid); return; }