From 60ac754f497c0abba0ce76135322e2a888451570 Mon Sep 17 00:00:00 2001 From: Jaime Frey Date: Wed, 26 May 2021 11:00:45 -0500 Subject: [PATCH] Stop making symlinks in ~/.blah_jobproxy_dir/. HTCONDOR-520 To allow proxy refresh to work when the blahp makes a modified copy of the proxy on submit, it creates a symlink to the "live" proxy file under ~/.blah_jobproxy_dir/, named with the job id. These symlinks are frequently not cleaned up, and it's hard to guarantee that they are always cleaned up. The name for the modified proxy file is always the name of the original file plus one of several well-known suffixes. We already assume that the proxy refresh command will use the same proxy filename as the original submit command. So instead of finding the live proxy file via a symlink under HOME, we can check for the existence of the filename we would have created at submit time. --- src/BLClient.c | 4 + src/BLParserLSF.c | 8 + src/BLParserPBS.c | 8 + src/scripts/blah_common_submit_functions.sh | 11 -- src/scripts/condor_status.sh | 7 - src/scripts/condor_submit.sh | 10 -- src/scripts/lsf_status.sh | 14 +- src/scripts/pbs_status.sh | 13 +- src/scripts/slurm_status.sh | 6 +- src/server.c | 159 ++++---------------- 10 files changed, 54 insertions(+), 186 deletions(-) diff --git a/src/BLClient.c b/src/BLClient.c index 20e3d1b3..897754e1 100644 --- a/src/BLClient.c +++ b/src/BLClient.c @@ -217,6 +217,10 @@ main(int argc, char *argv[]) fgets(buffer, MAX_LINE-1, stdin); + /* The pbs/lsf_status.sh script now ignores the proxy removal + * message after the '/', since we no longer make symlinks in + * ~/.blah_jobproxy_dir. + */ if(strstr(buffer,"/")==NULL){ if ((cp = strrchr (buffer, '\n')) != NULL){ *cp = '\0'; diff --git a/src/BLParserLSF.c b/src/BLParserLSF.c index 8c9353eb..18988277 100644 --- a/src/BLParserLSF.c +++ b/src/BLParserLSF.c @@ -843,6 +843,10 @@ LookupAndSend(int m_sock) }else{ t_wnode=make_message("WorkerNode=%s;",j2wn[id]); } + /* This proxy removal message is now ignored by the + * lsf_status.sh script, since we no longer make symlinks in + * ~/.blah_jobproxy_dir. + */ if(j2js[id] && ((strcmp(j2js[id],"3")==0) || (strcmp(j2js[id],"4")==0))){ pr_removal="Yes"; } else { @@ -887,6 +891,10 @@ LookupAndSend(int m_sock) }else{ t_wnode=make_message("WorkerNode=%s;",j2wn[id]); } + /* This proxy removal message is now ignored by the + * lsf_status.sh script, since we no longer make symlinks in + * ~/.blah_jobproxy_dir. + */ if(j2js[id] && ((strcmp(j2js[id],"3")==0) || (strcmp(j2js[id],"4")==0))){ pr_removal="Yes"; } else { diff --git a/src/BLParserPBS.c b/src/BLParserPBS.c index 522ce1f0..23ec5f5e 100644 --- a/src/BLParserPBS.c +++ b/src/BLParserPBS.c @@ -1000,6 +1000,10 @@ LookupAndSend(int m_sock) pthread_mutex_lock(&write_mutex); if(id>0 && j2js[id]!=NULL){ + /* This proxy removal message is now ignored by the + * lsf_status.sh script, since we no longer make symlinks in + * ~/.blah_jobproxy_dir. + */ if(j2js[id] && ((strcmp(j2js[id],"3")==0) || (strcmp(j2js[id],"4")==0))){ pr_removal="Yes"; } else { @@ -1030,6 +1034,10 @@ LookupAndSend(int m_sock) sysfatal("can't malloc out_buf in LookupAndSend: %r"); } + /* This proxy removal message is now ignored by the + * lsf_status.sh script, since we no longer make symlinks in + * ~/.blah_jobproxy_dir. + */ if(j2js[id] && ((strcmp(j2js[id],"3")==0) || (strcmp(j2js[id],"4")==0))){ pr_removal="Yes"; } else { diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index f5bc051c..d0830067 100755 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -283,8 +283,6 @@ function bls_parse_submit_options () bls_opt_proxyrenew="no" fi - bls_proxy_dir=~/.blah_jobproxy_dir - bls_opt_workdir=$PWD #default values for polling interval and min proxy lifetime @@ -518,7 +516,6 @@ function bls_setup_all_files () if [ "x$bls_opt_stgproxy" == "xyes" ] ; then bls_proxy_local_file=${bls_opt_workdir}"/"`basename "$bls_opt_proxy_string"`; [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] || bls_proxy_local_file="$bls_opt_proxy_string" - [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] || bls_proxy_local_file=/tmp/x509up_u`id -u` if [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] ; then bls_proxy_remote_file=${bls_tmp_name}.proxy bls_test_shared_dir "$bls_proxy_local_file" @@ -888,12 +885,4 @@ function bls_wrap_up_submit () cd $bls_opt_temp_dir # DEBUG: cp $bls_tmp_file /tmp rm -f $bls_tmp_file - - if [ "x$job_registry" == "x" ]; then - # Create a softlink to proxy file for proxy renewal - if [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] ; then - [ -d "$bls_proxy_dir" ] || mkdir $bls_proxy_dir - ln -s $bls_proxy_local_file $bls_proxy_dir/$jobID.proxy - fi - fi } diff --git a/src/scripts/condor_status.sh b/src/scripts/condor_status.sh index 7a6b3044..a8c11b03 100755 --- a/src/scripts/condor_status.sh +++ b/src/scripts/condor_status.sh @@ -22,8 +22,6 @@ # limitations under the License. # -proxy_dir=~/.blah_jobproxy_dir - . `dirname $0`/blah_load_config.sh if [ "x$job_registry" != "x" ] ; then @@ -196,11 +194,6 @@ function make_ad { local exit_by_signal=$(echo $line | awk -F ',' '{print $8}') local code_or_signal=$(echo $line | awk -F ',' '{print $9}') - # Clean up proxy renewal links if applicable - if [ "$status" == "3" -o "$status" == "4" ]; then - /bin/rm -f $proxy_dir/$job.proxy.norenew 2>/dev/null - fi - echo -n "[BatchjobId=\"$job\";JobStatus=$status;RemoteSysCpu=${remote_sys_cpu:-0};RemoteUserCpu=${remote_user_cpu:-0};BytesSent=${bytes_sent:-0};BytesRecvd=${bytes_recvd:-0};RemoteWallClockTime=${remote_wall_clock_time:-0};" if [ "$status" == "4" ] ; then if [ "$exit_by_signal" == "0" ] ; then diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index f95a9357..c7d4a228 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -278,14 +278,4 @@ fi # Clean temporary files -- There only temp file is the one we submit rm -f $submit_file -# Create a softlink to proxy file for proxy renewal - local renewal -# of limited proxy only. - -if [ "x$job_registry" == "x" ]; then - if [ -r "$bls_opt_proxy_string" -a -f "$bls_opt_proxy_string" ] ; then - [ -d "$bls_proxy_dir" ] || mkdir "$bls_proxy_dir" - ln -s "$bls_opt_proxy_string" "$bls_proxy_dir/$jobID.proxy.norenew" - fi -fi - exit $return_code diff --git a/src/scripts/lsf_status.sh b/src/scripts/lsf_status.sh index 79691342..3eba33d2 100755 --- a/src/scripts/lsf_status.sh +++ b/src/scripts/lsf_status.sh @@ -117,7 +117,6 @@ if [ "x$getcreamport" == "xyes" ] ; then exit $retcode fi -proxy_dir=~/.blah_jobproxy_dir pars=$* for reqfull in $pars ; do @@ -185,10 +184,6 @@ END { print "ExitCode=" exitcode ";" } print "]" - if (jobstatus == 3 || jobstatus == 4) { - system("rm " proxyDir "/" jobId ".proxy 2>/dev/null") - } - } ' ` @@ -268,7 +263,7 @@ END { job_data=`grep "$requested" $logs` -result=`echo "$job_data" | awk -v jobId=$requested -v proxyDir=$proxy_dir ' +result=`echo "$job_data" | awk -v jobId=$requested ' BEGIN { rex_queued = "\"JOB_NEW\" \"[0-9\.]+\" [0-9]+ " jobId rex_running = "\"JOB_START\" \"[0-9\.]+\" [0-9]+ " jobId @@ -356,9 +351,6 @@ END { } } print "]" - if (jobstatus == 3 || jobstatus == 4) { - system("rm " proxyDir "/" jobId ".proxy 2>/dev/null") - } } ' ` @@ -371,12 +363,8 @@ END { if [ "x$usedBLParser" == "xyes" ] ; then - pr_removal=`echo $result | sed -e 's/^.*\///'` result=`echo $result | sed 's/\/.*//'` echo "0"$result - if [ "x$pr_removal" == "xYes" ] ; then - rm -f ${proxy_dir}/${requested}.proxy 2>/dev/null - fi usedBLParser="no" fi logs="" diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index 9c850f52..b889e717 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -163,7 +163,6 @@ if [ "x$getcreamport" == "xyes" ] ; then fi pars=$* -proxy_dir=~/.blah_jobproxy_dir for reqfull in $pars ; do requested="" @@ -228,9 +227,6 @@ END { print "ExitCode=" exitcode ";" } print "]" - if (jobstatus == 3 || jobstatus == 4) { - system("rm " proxyDir "/" jobId ".proxy 2>/dev/null") - } } ' @@ -278,7 +274,7 @@ END { usedBLParser="no" logs="$logpath/$logfile `find $logpath -type f -newer $logpath/$logfile`" log_data=`grep "$reqjob" $logs` - result=`echo "$log_data" | awk -v jobId="$reqjob" -v wn="$workernode" -v proxyDir="$proxy_dir" ' + result=`echo "$log_data" | awk -v jobId="$reqjob" -v wn="$workernode" ' BEGIN { rex_queued = jobId ";Job Queued " rex_running = jobId ";Job Run " @@ -335,9 +331,6 @@ END { print "ExitCode = " exitcode ";" } print "]" - if (jobstatus == 3 || jobstatus == 4) { - system("rm " proxyDir "/" jobId ".proxy 2>/dev/null") - } } ' ` @@ -350,7 +343,6 @@ END { fi fi #close if on pbs_BLParser if [ "x$usedBLParser" == "xyes" ] ; then - pr_removal=`echo $result | sed -e 's/^.*\///'` result=`echo $result | sed 's/\/.*//'` resstatus=`echo $result|sed "s/\[.*JobStatus=\([^;]*\).*/\1/"`; @@ -370,9 +362,6 @@ END { echo "0"$result "Workernode=\"$workernode\";]" fi - if [ "x$pr_removal" == "xYes" ] ; then - rm -f ${proxy_dir}/${reqjob}.proxy 2>/dev/null - fi usedBLParser="no" fi fi #close of if-else on $pbs_nologaccess diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh index 1ac617b7..bb5d6c4f 100755 --- a/src/scripts/slurm_status.sh +++ b/src/scripts/slurm_status.sh @@ -48,7 +48,6 @@ done shift `expr $OPTIND - 1` pars=$* -proxy_dir=~/.blah_jobproxy_dir for reqfull in $pars ; do reqjob=`echo $reqfull | sed -e 's/^.*\///'` @@ -63,7 +62,7 @@ for reqfull in $pars ; do result=`${slurm_binpath}/scontrol $cluster_arg show job $reqjob 2>$staterr` stat_exit_code=$? - result=`echo "$result" | awk -v job_id=$reqjob -v proxy_dir=$proxy_dir ' + result=`echo "$result" | awk -v job_id=$reqjob ' BEGIN { blah_status = 4 slurm_status = "" @@ -101,9 +100,6 @@ END { print "ExitCode=" exit_code ";" } print "]\n" - if ( blah_status == 3 || blah_status == 4 ) { - #system( "rm " proxy_dir "/" job_id ".proxy 2>/dev/null" ) - } } ' ` diff --git a/src/server.c b/src/server.c index 2ac7b403..dc2a0346 100644 --- a/src/server.c +++ b/src/server.c @@ -1775,22 +1775,18 @@ cmd_status_job_all(void *args) } int -get_status_and_old_proxy(int use_glexec, char *jobDescr, const char *proxyFileName, +get_status_and_old_proxy(int map_mode, char *jobDescr, const char *proxyFileName, char **status_argv, char **old_proxy, char **workernode, char **error_string) { - char *proxy_link; char *r_old_proxy=NULL; - int readlink_res, retcod; + int retcod; classad_context status_ad[MAX_JOB_NUMBER]; char errstr[MAX_JOB_NUMBER][ERROR_MAX_LEN]; int jobNumber=0, jobStatus; - char *command, *escaped_command; - char error_buffer[ERROR_MAX_LEN]; - job_registry_split_id *spid; job_registry_entry *ren; int i; - exec_cmd_t exe_command = EXEC_CMD_DEFAULT; + const char *proxy_ext = NULL; if (old_proxy == NULL) return(-1); *old_proxy = NULL; @@ -1826,131 +1822,36 @@ get_status_and_old_proxy(int use_glexec, char *jobDescr, const char *proxyFileNa /* FIXMEPRREG: the job proxy and status can be removed once */ /* FIXMEPRREG: the job registry is used throughout. */ - spid = job_registry_split_blah_id(jobDescr); - if (spid == NULL) return(-1); /* Error */ + switch (map_mode) { + case MEXEC_GLEXEC: + proxy_ext = ".glexec"; + break; + case MEXEC_SUDO: + proxy_ext = ".mapped"; + break; + case MEXEC_NO_MAPPING: + default: + proxy_ext = ".lmt"; + } - if (!use_glexec) + if ((r_old_proxy = make_message("%s%s", proxyFileName, proxy_ext)) == NULL) { - if ((r_old_proxy = (char *)malloc(FILENAME_MAX)) == NULL) - { - fprintf(stderr, "Out of memory.\n"); - exit(MALLOC_ERROR); - } - if ((proxy_link = make_message("%s/.blah_jobproxy_dir/%s.proxy", getenv("HOME"), spid->proxy_id)) == NULL) - { - fprintf(stderr, "Out of memory.\n"); - exit(MALLOC_ERROR); - } - if ((readlink_res = readlink(proxy_link, r_old_proxy, FILENAME_MAX - 2)) == -1) - { - if (error_string != NULL) - { - error_buffer[0] = '\000'; - strerror_r(errno, error_buffer, sizeof(error_buffer)); - *error_string = escape_spaces(error_buffer); - } - /* Proxy link for renewal is not accessible */ - /* Try with .norenew */ - free(proxy_link); - if ((proxy_link = make_message("%s/.blah_jobproxy_dir/%s.proxy.norenew", getenv("HOME"), spid->proxy_id)) == NULL) - { - fprintf(stderr, "Out of memory.\n"); - exit(MALLOC_ERROR); - } - if ((readlink_res = readlink(proxy_link, r_old_proxy, FILENAME_MAX - 2)) >= 0) - { - /* No need to check for job status - */ - /* Proxy has to be renewed locally */ - r_old_proxy[readlink_res] = '\000'; /* readlink does not append final NULL */ - *old_proxy = r_old_proxy; - free(proxy_link); - job_registry_free_split_id(spid); - return 1; /* 'local' state */ - } - // Look for the limited proxy next to the new proxy - this is a common case for HTCondor-based submission. - free(proxy_link); - if ((proxy_link = make_message("%s.lmt", proxyFileName)) == NULL) - { - fprintf(stderr, "Out of memory.\n"); - exit(MALLOC_ERROR); - } - if (access(proxy_link, R_OK) == 0) - { - *old_proxy = proxy_link; - // do not free proxy_link in this case. - free(r_old_proxy); - job_registry_free_split_id(spid); - return 1; - } - free(proxy_link); - free(r_old_proxy); - job_registry_free_split_id(spid); - return -1; /* Error */ - } - r_old_proxy[readlink_res] = '\000'; /* readlink does not append final NULL */ - *old_proxy = r_old_proxy; - free(proxy_link); - + fprintf(stderr, "Out of memory.\n"); + exit(MALLOC_ERROR); } - else + if (access(r_old_proxy, R_OK) == -1) { - /* GLEXEC case */ - exe_command.delegation_type = atoi(status_argv[MEXEC_PARAM_DELEGTYPE]); - exe_command.delegation_cred = status_argv[MEXEC_PARAM_DELEGCRED]; - exe_command.command = make_message("/usr/bin/readlink -n .blah_jobproxy_dir/%s.proxy", spid->proxy_id); - if (exe_command.command == NULL) - { - fprintf(stderr, "Out of memory.\n"); - exit(MALLOC_ERROR); - } - retcod = execute_cmd(&exe_command); - r_old_proxy = strdup(exe_command.output); - if (r_old_proxy == NULL || strlen(r_old_proxy) == 0 || retcod != 0 || exe_command.exit_code != 0) + r_old_proxy[strlen(proxyFileName)] = '\0'; + if (access(r_old_proxy, R_OK) == -1) { - if (r_old_proxy != NULL) - { - free(r_old_proxy); - r_old_proxy = NULL; - } - - if (error_string != NULL) - { - escaped_command = escape_spaces(command); - *error_string = make_message("%s\\ returns\\ %d\\ and\\ no\\ proxy.", escaped_command, retcod); - if (BLAH_DYN_ALLOCATED(escaped_command)) free(escaped_command); - } - - /* Proxy link for renewal is not accessible */ - /* Try with .norenew */ - recycle_cmd(&exe_command); - exe_command.append_to_command = ".norenew"; - retcod = execute_cmd(&exe_command); - r_old_proxy = strdup(exe_command.output); - if (r_old_proxy != NULL && strlen(r_old_proxy) > 0 && retcod == 0) - { - /* No need to check for job status - */ - /* Proxy has to be renewed locally */ - *old_proxy = r_old_proxy; - free(exe_command.command); - cleanup_cmd(&exe_command); - job_registry_free_split_id(spid); - return 1; /* 'local' state */ - } - if (r_old_proxy != NULL) free(r_old_proxy); - free(exe_command.command); - cleanup_cmd(&exe_command); - job_registry_free_split_id(spid); - return(-1); + free(r_old_proxy); + return -1; } - *old_proxy = r_old_proxy; - free(exe_command.command); - cleanup_cmd(&exe_command); } + *old_proxy = r_old_proxy; - job_registry_free_split_id(spid); - - /* If we have a proxy link, and proxy renewal on worker nodes */ - /* was disabled, we need to deal with the proxy locally. */ + /* If we have a proxy, and proxy renewal on worker nodes was */ + /* disabled, we only need to deal with the proxy locally. */ /* We don't need to spend time checking on job status. */ if (disable_wn_proxy_renewal) return 1; /* 'Local' renewal only. */ @@ -1992,13 +1893,15 @@ cmd_renew_proxy(void *args) int i, jobStatus, retcod, count; char *error_string = NULL; char *proxyFileNameNew = NULL; - int use_glexec, use_mapping; + int use_glexec = 0; + int use_mapping = 0; exec_cmd_t exe_command = EXEC_CMD_DEFAULT; char *limited_proxy_name = NULL; - use_mapping = (argv[CMD_RENEW_PROXY_ARGS + 1] != NULL); - use_glexec = ( use_mapping && - (atoi(argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGTYPE ]) == MEXEC_GLEXEC) ); + if (argv[CMD_RENEW_PROXY_ARGS + 1] != NULL) { + use_mapping = atoi(argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGTYPE ]); + use_glexec = use_mapping == MEXEC_GLEXEC; + } if (blah_children_count>0) check_on_children(blah_children, blah_children_count);