From 1a48735de3ad690259a722a016b87351a2c4ff9c Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Fri, 12 Oct 2012 10:42:58 +0200 Subject: [PATCH 001/169] state of branch 1.18 as of 20121012 --- configure.ac | 8 +- project/debfiles/control | 2 +- project/debfiles/glite-ce-blahp.install | 1 + project/debfiles/glite-ce-blahp.postinst | 34 + project/debfiles/glite-ce-blahp.prerm | 32 + project/glite-ce-blahp_sl_any.spec | 11 +- project/properties.xml | 2 +- project/version.properties | 2 +- src/BNotifier.c | 57 +- src/BUpdaterCondor.c | 28 +- src/BUpdaterLSF.c | 84 +- src/BUpdaterPBS.c | 32 +- src/BUpdaterSLURM.c | 855 -------------------- src/BUpdaterSLURM.h | 72 -- src/Bfunctions.c | 2 - src/Makefile.am | 9 +- src/blah_job_registry_scan_by_subject.c | 178 ++-- src/job_registry.c | 18 +- src/job_registry_updater.c | 110 +-- src/resbuffer.c | 1 + src/scripts/Makefile.am | 7 +- src/scripts/blah_common_submit_functions.sh | 1 + src/scripts/blah_load_config.sh | 1 + src/scripts/slurm_cancel.sh | 54 -- src/scripts/slurm_status.sh | 40 - src/scripts/slurm_submit.sh | 100 --- 26 files changed, 268 insertions(+), 1473 deletions(-) create mode 100755 project/debfiles/glite-ce-blahp.postinst create mode 100755 project/debfiles/glite-ce-blahp.prerm delete mode 100644 src/BUpdaterSLURM.c delete mode 100644 src/BUpdaterSLURM.h delete mode 100755 src/scripts/slurm_cancel.sh delete mode 100755 src/scripts/slurm_status.sh delete mode 100755 src/scripts/slurm_submit.sh diff --git a/configure.ac b/configure.ac index 62655be3..929b50e2 100755 --- a/configure.ac +++ b/configure.ac @@ -12,12 +12,12 @@ # Council for the Central Laboratory of the Research Councils (CCLRC), United Kingdom # # Authors: Francesco Prelz -# Version info: $Id: configure.ac,v 1.54 2012/03/01 13:26:54 pandreet Exp $ -# Release: $Name: $ +# Version info: $Id: configure.ac,v 1.53.2.1 2012/06/08 13:09:09 pandreet Exp $ +# Release: $Name: glite-ce-blahp_B_1_18 $ # # Revision history: # $Log: configure.ac,v $ -# Revision 1.54 2012/03/01 13:26:54 pandreet +# Revision 1.53.2.1 2012/06/08 13:09:09 pandreet # Changed version number # # Revision 1.53 2012/02/15 13:55:15 pandreet @@ -163,7 +163,7 @@ # AC_PREREQ(2.57) -AC_INIT([GLite CE blahp], [1.19.0]) +AC_INIT([GLite CE blahp], [1.18.1]) AC_CONFIG_AUX_DIR([./project]) AM_INIT_AUTOMAKE([1.6.3 subdir-objects]) AC_CONFIG_SRCDIR([src/main.c]) diff --git a/project/debfiles/control b/project/debfiles/control index 9f6be901..e193fd86 100644 --- a/project/debfiles/control +++ b/project/debfiles/control @@ -2,7 +2,7 @@ Source: glite-ce-blahp Section: net Priority: optional Maintainer: CREAM group -Build-Depends: debhelper (>= 8.0.0~) +Build-Depends: debhelper (>= 8.0.0~), libtool, libclassad-dev, docbook-xsl, xsltproc Standards-Version: 3.9.1 Homepage: http://glite.cern.ch/ diff --git a/project/debfiles/glite-ce-blahp.install b/project/debfiles/glite-ce-blahp.install index 16079fd9..cfdf34c9 100644 --- a/project/debfiles/glite-ce-blahp.install +++ b/project/debfiles/glite-ce-blahp.install @@ -1,3 +1,4 @@ usr/bin/* +usr/sbin/* etc/*.template usr/share/man/man1/*.1.gz diff --git a/project/debfiles/glite-ce-blahp.postinst b/project/debfiles/glite-ce-blahp.postinst new file mode 100755 index 00000000..dd3e2580 --- /dev/null +++ b/project/debfiles/glite-ce-blahp.postinst @@ -0,0 +1,34 @@ +#!/bin/sh + +set -e + +case "$1" in + configure) + + if test -z "$2"; then + update-rc.d glite-ce-blah-parser start 94 3 4 5 . stop 15 3 4 5 . >/dev/null + + if [ ! 
"x`grep tomcat6 /etc/passwd`" == "x" ] ; then + mkdir -p /var/log/cream/accounting + chown root.tomcat6 /var/log/cream/accounting + chmod 0730 /var/log/cream/accounting + + mkdir -p /var/blah + chown tomcat6.tomcat6 /var/blah + chmod 771 /var/blah + + fi + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +exit 0 + diff --git a/project/debfiles/glite-ce-blahp.prerm b/project/debfiles/glite-ce-blahp.prerm new file mode 100755 index 00000000..87783767 --- /dev/null +++ b/project/debfiles/glite-ce-blahp.prerm @@ -0,0 +1,32 @@ +#!/bin/sh + +set -e + +case "$1" in + remove) + invoke-rc.d glite-ce-blah-parser stop >/dev/null 2>&1 + update-rc.d glite-ce-blah-parser remove >/dev/null 2>&1 + + if [ -d /var/log/cream/accounting ] ; then + rm -rf /var/log/cream/accounting + fi + + if [ -d /var/blah ] ; then + rm -rf /var/blah + fi + ;; + + upgrade) + ;; + + deconfigure|failed-upgrade) + ;; + + *) + echo "prerm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +exit 0 + diff --git a/project/glite-ce-blahp_sl_any.spec b/project/glite-ce-blahp_sl_any.spec index a062bd3a..2ff4c1d5 100644 --- a/project/glite-ce-blahp_sl_any.spec +++ b/project/glite-ce-blahp_sl_any.spec @@ -42,6 +42,15 @@ if test "x%{extbuilddir}" == "x--" ; then else cp -R %{extbuilddir}/* %{buildroot} fi +strip -s %{buildroot}/usr/sbin/blah_job_registry_* +strip -s %{buildroot}/usr/sbin/blahpd_daemon +strip -s %{buildroot}/usr/sbin/blah_check_config +strip -s %{buildroot}/usr/libexec/blparser_master +strip -s %{buildroot}/usr/libexec/BLClient +strip -s %{buildroot}/usr/libexec/BUpdater* +strip -s %{buildroot}/usr/libexec/BNotifier +strip -s %{buildroot}/usr/libexec/BLParser* +strip -s %{buildroot}/usr/bin/blahpd %clean @@ -92,6 +101,6 @@ fi %doc /usr/share/man/man1/*.1.gz %changelog -* %(date +"%%a %%b %%d %%Y") CREAM group - %{version}-%{release} +* %{extcdate} CREAM group - %{extversion}-%{extage}.%{extdist} - %{extclog} diff --git a/project/properties.xml b/project/properties.xml index cfebbaa4..d978f1fa 100755 --- a/project/properties.xml +++ b/project/properties.xml @@ -21,7 +21,7 @@ Authors: Joachim Flammer Version info: $Id: properties.xml,v 1.8 2010/03/17 10:48:17 mezzadri Exp $ - Release: $Name: $ + Release: $Name: glite-ce-blahp_B_1_18 $ Revision history: $Log: properties.xml,v $ diff --git a/project/version.properties b/project/version.properties index 1ca4fb7d..eb61542c 100755 --- a/project/version.properties +++ b/project/version.properties @@ -1,3 +1,3 @@ #Mon Apr 11 15:13:49 CEST 2005 -module.version=1.19.0 +module.version=1.17.0 module.age=0 diff --git a/src/BNotifier.c b/src/BNotifier.c index 8ffc498c..7e483de2 100644 --- a/src/BNotifier.c +++ b/src/BNotifier.c @@ -42,8 +42,6 @@ char *debuglogname; int c_sock; -config_entry *remupd_conf; - /* moved to per-thread structure int startnotify=FALSE; int startnotifyjob=FALSE; @@ -228,11 +226,6 @@ main(int argc, char *argv[]) } } - remupd_conf = config_get("job_registry_add_remote",cha); - if (remupd_conf == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0); - } - /* create listening socket for Cream */ if ( !async_notif_port ) { @@ -324,6 +317,7 @@ PollDB() job_registry_handle *rha; job_registry_handle *rhc; char *buffer=NULL; + char *finalbuffer=NULL; char *cdate=NULL; time_t now; int maxtok,i,maxtokl,j; @@ -336,8 +330,7 @@ PollDB() char *cp=NULL; int to_sleep=FALSE; int skip_reg_open=FALSE; - int ret; - + 
rha=job_registry_init(registry_file, BY_BATCH_ID); if (rha == NULL){ do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); @@ -372,37 +365,19 @@ PollDB() if ((en=job_registry_get(rhc, tbuf[j])) != NULL){ buffer=ComposeClassad(en); }else{ - if(remupd_conf == NULL){ - cdate=iepoch2str(now); - maxtokl=strtoken(tbuf[j],'_',&lbuf); - if(lbuf[1]){ - if ((cp = strrchr (lbuf[1], '\n')) != NULL){ - *cp = '\0'; - } - if ((cp = strrchr (lbuf[1], '\r')) != NULL){ - *cp = '\0'; - } - buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate); + cdate=iepoch2str(now); + maxtokl=strtoken(tbuf[j],'_',&lbuf); + if(lbuf[1]){ + if ((cp = strrchr (lbuf[1], '\n')) != NULL){ + *cp = '\0'; } - freetoken(&lbuf,maxtokl); - free(cdate); - }else{ - maxtokl=strtoken(tbuf[j],':',&lbuf); - JOB_REGISTRY_ASSIGN_ENTRY(en->batch_id,lbuf[0]); - JOB_REGISTRY_ASSIGN_ENTRY(en->blah_id,lbuf[1]); - freetoken(&lbuf,maxtokl); - en->status = 0; - if ((ret=job_registry_append(rhc, en))<0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - }else{ - if(ret==JOB_REGISTRY_SUCCESS){ - do_log(debuglogfile, debug, 2, "%s: registry append in PollDB for: jobid=%s blahjobid=%s\n",argv0,en->batch_id,en->blah_id); - } - } - } + if ((cp = strrchr (lbuf[1], '\r')) != NULL){ + *cp = '\0'; + } + buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate); + } + freetoken(&lbuf,maxtokl); + free(cdate); } free(en); len=strlen(buffer); @@ -643,10 +618,6 @@ STARTNOTIFYJOBEND GetJobList(buffer, &(connection->joblist_string)); connection->startnotifyjob = TRUE; connection->startnotify = FALSE; - } else if (strstr(buffer,"STARTNETWORKSYNC/") != NULL) { - GetJobList(buffer, &(connection->joblist_string)); - connection->startnotifyjob = TRUE; - connection->startnotify = FALSE; } else if (strstr(buffer,"STARTNOTIFYJOBEND/") != NULL) { connection->firstnotify=TRUE; connection->lastnotiftime = time(NULL); diff --git a/src/BUpdaterCondor.c b/src/BUpdaterCondor.c index ec458c8b..1ecbd0c3 100644 --- a/src/BUpdaterCondor.c +++ b/src/BUpdaterCondor.c @@ -38,6 +38,9 @@ int main(int argc, char *argv[]){ char *pidfile=NULL; char *first_duplicate=NULL; + struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int qlen=0; int first=TRUE; @@ -329,18 +332,20 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. 
The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } IntStateQuery(); @@ -447,24 +452,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); + while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -598,7 +600,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -701,7 +703,7 @@ FinalStateQuery(char *query) job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -744,7 +746,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git a/src/BUpdaterLSF.c b/src/BUpdaterLSF.c index ba6166dc..4c3ba5b3 100644 --- a/src/BUpdaterLSF.c +++ b/src/BUpdaterLSF.c @@ -24,8 +24,6 @@ #include "BUpdaterLSF.h" -time_t last_network_update; - int main(int argc, char *argv[]){ FILE *fd; @@ -36,6 +34,9 @@ int main(int argc, char *argv[]){ char *pidfile=NULL; char *first_duplicate=NULL; + struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int first=TRUE; int tmptim; @@ -44,6 +45,7 @@ int main(int argc, char *argv[]){ int rc; int c; + int status; pthread_t RecUpdNetThd; @@ -306,16 +308,6 @@ int main(int argc, char *argv[]){ free(s); } - ret = config_get("bupdater_use_bhist_for_idle",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_use_bhist_for_idle not found - using the default:%s\n",argv0,use_bhist_for_idle); - } else { - 
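
As context for the job_registry_send_update conditions rewritten throughout this patch: in C the `<=` operator binds tighter than `=`, so dropping the inner parentheses changes what `ret` holds afterwards (the branch still tests the same condition, since the assigned truth value is what gets evaluated). A minimal self-contained illustration; the stub merely stands in for job_registry_send_update, whose byte-count return value is an assumption here:

    #include <stdio.h>

    /* Stand-in for job_registry_send_update; a positive byte count is assumed. */
    static int send_update_stub(void) { return 42; }

    int main(void)
    {
        int ret;
        if ((ret = send_update_stub()) <= 0) { /* parenthesized form */ }
        printf("with parens:    ret=%d\n", ret);  /* 42: the return code        */
        if (ret = send_update_stub() <= 0) { /* unparenthesized; gcc warns here */ }
        printf("without parens: ret=%d\n", ret);  /* 0: result of '<=' only     */
        return 0;
    }
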
use_bhist_for_idle=strdup(ret->value); - if(use_bhist_for_idle == NULL){ - sysfatal("strdup failed for use_bhist_for_idle in main: %r"); - } - } - ret = config_get("bupdater_use_bhist_for_killed",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key bupdater_use_bhist_for_killed not found - using the default:%s\n",argv0,use_bhist_for_killed); @@ -326,6 +318,16 @@ int main(int argc, char *argv[]){ } } + ret = config_get("bupdater_use_bhist_for_idle",cha); + if (ret == NULL){ + do_log(debuglogfile, debug, 1, "%s: key bupdater_use_bhist_for_idle not found - using the default:%s\n",argv0,use_bhist_for_idle); + } else { + use_bhist_for_idle=strdup(ret->value); + if(use_bhist_for_idle == NULL){ + sysfatal("strdup failed for use_bhist_for_idle in main: %r"); + } + } + ret = config_get("lsf_batch_caching_enabled",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key lsf_batch_caching_enabled not found using default\n",argv0,lsf_batch_caching_enabled); @@ -419,26 +421,22 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } - - if (now - last_network_update < loop_interval) { - do_log(debuglogfile, debug, 2, "%s: skipping iteration as registry was updated %d seconds ago via network\n", argv0, now - last_network_update); - sleep(loop_interval); - continue; - } - + if(use_btools && strcmp(use_btools,"yes")==0){ IntStateQueryCustom(); @@ -447,7 +445,7 @@ int main(int argc, char *argv[]){ }else{ IntStateQueryShort(); } - + fd = job_registry_open(rha, "r"); if (fd == NULL){ do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file); @@ -533,24 +531,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds, timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); - + while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ + JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -587,17 +582,14 @@ ReceiveUpdateFromNetwork() } if(job_registry_need_update(ren,nen,JOB_REGISTRY_UPDATE_ALL)){ if ((ret=job_registry_update(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_update returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_update returns %d: ",argv0,ret); perror(""); - } else { - last_network_update = time(0); } } } free(nen); } - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread exiting\n", argv0); + return 0; } @@ -616,6 +608,8 @@ IntStateQueryCustom() int maxtok_l=0; job_registry_entry en; int ret; + char *timestamp; + time_t tmstampepoch; char *tmp=NULL; char *cp=NULL; char *command_string=NULL; @@ -679,7 +673,7 @@ IntStateQueryCustom() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryCustom for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryCustom\n",argv0); } } @@ -783,7 +777,7 @@ IntStateQueryCustom() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryCustom for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryCustom\n",argv0); } } @@ -882,7 +876,7 @@ IntStateQueryShort() do_log(debuglogfile, debug, 2, "%s: registry 
update in IntStateQueryShort for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryShort\n",argv0); } } @@ -968,7 +962,7 @@ IntStateQueryShort() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryShort for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryShort\n",argv0); } } @@ -1066,7 +1060,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -1177,7 +1171,7 @@ IntStateQuery() free(ex_str); freetoken(&token,maxtok_t); - if(wexitcode==255 || wexitcode==130 || wexitcode==143){ + if(wexitcode==255 || wexitcode==130){ en.status=REMOVED; en.exitcode=-999; }else{ @@ -1250,7 +1244,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -1372,7 +1366,7 @@ exitcode (=0 if Done successfully) or (from Exited with exit code 2) } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -1477,7 +1471,7 @@ exitcode (=0 if Done successfully) or (from Exited with exit code 2) job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -1652,7 +1646,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git a/src/BUpdaterPBS.c b/src/BUpdaterPBS.c index 3f2b81f8..f6f7ce38 100644 --- a/src/BUpdaterPBS.c +++ b/src/BUpdaterPBS.c @@ -39,6 +39,9 @@ int main(int argc, char *argv[]){ char *first_duplicate=NULL; 
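
The pollset declarations added below move the remote-update state into main(). For reference, a minimal sketch of how that state is populated, following the setup sequence visible in the BUpdaterSLURM.c main() deleted later in this patch; the job_registry_updater calls are used exactly as they appear there, and error handling is abbreviated:

    /* Sketch only: wiring up the remote-update receiver, assuming the
     * job_registry_updater API behaves as used in the deleted BUpdaterSLURM.c. */
    struct pollfd *remupd_pollset = NULL;
    int remupd_nfds = 0;

    remupd_conf = config_get("job_registry_add_remote", cha);
    if (remupd_conf != NULL) {
        /* Bind the receiving endpoints named in the configuration. */
        if (job_registry_updater_setup_receiver(remupd_conf->values,
                                                remupd_conf->n_values,
                                                &remupd_head) < 0)
            fprintf(stderr, "%s: Cannot set network receiver(s) up\n", argv0);

        /* Build the poll set that ReceiveUpdateFromNetwork() waits on. */
        remupd_nfds = job_registry_updater_get_pollfd(remupd_head, &remupd_pollset);
        if (remupd_pollset == NULL || remupd_nfds <= 0)
            fprintf(stderr, "%s: No poll set available for receiving data\n", argv0);
    }
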
+ struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int first=TRUE; int tmptim; @@ -293,7 +296,7 @@ int main(int argc, char *argv[]){ if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key tracejob_max_output not found using default\n",argv0,tracejob_max_output); } else { - tracejob_max_output=atoi(ret->value); + tracejob_max_output==atoi(ret->value); } remupd_conf = config_get("job_registry_add_remote",cha); @@ -360,18 +363,20 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n.Jobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } IntStateQuery(); @@ -460,24 +465,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); + while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -619,7 +621,7 @@ Job Id: 11.cream-12.pd.infn.it } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -734,7 +736,7 @@ Job Id: 11.cream-12.pd.infn.it } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -912,7 +914,7 @@ Job: 13.cream-12.pd.infn.it job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating 
endpoint in FinalStateQuery\n",argv0); } } @@ -959,7 +961,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git a/src/BUpdaterSLURM.c b/src/BUpdaterSLURM.c deleted file mode 100644 index 1eaf2e79..00000000 --- a/src/BUpdaterSLURM.c +++ /dev/null @@ -1,855 +0,0 @@ -/* -# File: BUpdaterSLURM.c -# -# Author: Massimo Mezzadri -# e-mail: Massimo.Mezzadri@mi.infn.it -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -*/ - -#include "BUpdaterSLURM.h" - -int main(int argc, char *argv[]){ - - FILE *fd; - job_registry_entry *en; - time_t now; - time_t purge_time=0; - time_t last_consistency_check=0; - char *pidfile=NULL; - char *first_duplicate=NULL; - - struct pollfd *remupd_pollset = NULL; - int remupd_nfds; - - int version=0; - int first=TRUE; - int tmptim; - time_t finalquery_start_date; - int loop_interval=DEFAULT_LOOP_INTERVAL; - - int rc; - int c; - - pthread_t RecUpdNetThd; - - int confirm_time=0; - - static int help; - static int short_help; - - bact.njobs = 0; - bact.jobs = NULL; - - while (1) { - static struct option long_options[] = - { - {"help", no_argument, &help, 1}, - {"usage", no_argument, &short_help, 1}, - {"nodaemon", no_argument, 0, 'o'}, - {"version", no_argument, 0, 'v'}, - {"prefix", required_argument, 0, 'p'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - - c = getopt_long (argc, argv, "vop:",long_options, &option_index); - - if (c == -1){ - break; - } - - switch (c) - { - - case 0: - if (long_options[option_index].flag != 0){ - break; - } - - case 'v': - version=1; - break; - - case 'o': - nodmn=1; - break; - - case 'p': - break; - - case '?': - break; - - default: - abort (); - } - } - - if(help){ - usage(); - } - - if(short_help){ - short_usage(); - } - - argv0 = argv[0]; - - signal(SIGHUP,sighup); - - if(version) { - printf("%s Version: %s\n",progname,VERSION); - exit(EXIT_SUCCESS); - } - - /* Checking configuration */ - check_config_file("UPDATER"); - - cha = config_read(NULL); - if (cha == NULL) - { - fprintf(stderr,"Error reading config: "); - perror(""); - return -1; - } - - ret = config_get("bupdater_child_poll_timeout",cha); - if (ret != NULL){ - tmptim=atoi(ret->value); - if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000; - } - - ret = config_get("bupdater_debug_level",cha); - if (ret != NULL){ - debug=atoi(ret->value); - } - - ret = config_get("bupdater_debug_logfile",cha); - if (ret != NULL){ - debuglogname=strdup(ret->value); - if(debuglogname == NULL){ - 
sysfatal("strdup failed for debuglogname in main: %r"); - } - } - if(debug <=0){ - debug=0; - } - - if(debuglogname){ - if((debuglogfile = fopen(debuglogname, "a+"))==0){ - debug = 0; - } - } else { - debug = 0; - } - - ret = config_get("slurm_binpath",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key slurm_binpath not found\n",argv0); - } else { - slurm_binpath=strdup(ret->value); - if(slurm_binpath == NULL){ - sysfatal("strdup failed for slurm_binpath in main: %r"); - } - } - - ret = config_get("job_registry",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0); - sysfatal("job_registry not defined. Exiting"); - } else { - registry_file=strdup(ret->value); - if(registry_file == NULL){ - sysfatal("strdup failed for registry_file in main: %r"); - } - } - - ret = config_get("purge_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval); - } else { - purge_interval=atoi(ret->value); - } - - ret = config_get("finalstate_query_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval); - } else { - finalstate_query_interval=atoi(ret->value); - } - - ret = config_get("alldone_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval); - } else { - alldone_interval=atoi(ret->value); - } - - ret = config_get("bupdater_consistency_check_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_consistency_check_interval not found using the default:%d\n",argv0,bupdater_consistency_check_interval); - } else { - bupdater_consistency_check_interval=atoi(ret->value); - } - - ret = config_get("bupdater_pidfile",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_pidfile not found\n",argv0); - } else { - pidfile=strdup(ret->value); - if(pidfile == NULL){ - sysfatal("strdup failed for pidfile in main: %r"); - } - } - - ret = config_get("bupdater_loop_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found - using the default:%d\n",argv0,loop_interval); - } else { - loop_interval=atoi(ret->value); - } - - ret = config_get("job_registry_use_mmap",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. 
Default is NO\n",argv0); - } else { - do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value); - } - - remupd_conf = config_get("job_registry_add_remote",cha); - if (remupd_conf == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0); - }else{ - if (job_registry_updater_setup_receiver(remupd_conf->values,remupd_conf->n_values,&remupd_head) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot set network receiver(s) up for remote update\n",argv0); - fprintf(stderr,"%s: Cannot set network receiver(s) up for remote update \n",argv0); - } - - if (remupd_head == NULL){ - do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0); - fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0); - } - - if ((remupd_nfds = job_registry_updater_get_pollfd(remupd_head, &remupd_pollset)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot setup poll set for receiving data.\n",argv0); - fprintf(stderr,"%s: Cannot setup poll set for receiving data.\n", argv0); - } - if (remupd_pollset == NULL || remupd_nfds == 0){ - do_log(debuglogfile, debug, 1, "%s: No poll set available for receiving data.\n",argv0); - fprintf(stderr,"%s: No poll set available for receiving data.\n",argv0); - } - - } - - if( !nodmn ) daemonize(); - - - if( pidfile ){ - writepid(pidfile); - free(pidfile); - } - - rha=job_registry_init(registry_file, BY_BATCH_ID); - if (rha == NULL){ - do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file); - perror(""); - } - - if (remupd_conf != NULL){ - pthread_create(&RecUpdNetThd, NULL, (void *(*)(void *))ReceiveUpdateFromNetwork, (void *)NULL); - - if (job_registry_updater_setup_sender(remupd_conf->values,remupd_conf->n_values,0,&remupd_head_send) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot set network sender(s) up for remote update\n",argv0); - fprintf(stderr,"%s: Cannot set network sender(s) up for remote update \n",argv0); - } - if (remupd_head_send == NULL){ - do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0); - fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0); - } - } - - config_free(cha); - - for(;;){ - /* Purge old entries from registry */ - now=time(0); - if(now - purge_time > 86400){ - if((rc=job_registry_purge(registry_file, now-purge_interval,0))<0){ - do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s:%d\n",argv0,registry_file,rc); - fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); - perror(""); - - } - purge_time=time(0); - } - - now=time(0); - if(now - last_consistency_check > bupdater_consistency_check_interval){ - if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n.Jobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s.\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); - - } - last_consistency_check=time(0); - } - - IntStateQuery(); - - fd = job_registry_open(rha, "r"); - if (fd == NULL){ - do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file); - perror(""); - sleep(loop_interval); - continue; - } - if (job_registry_rdlock(rha, fd) < 0){ - do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error read locking job registry %s :",argv0,registry_file); - perror(""); - sleep(loop_interval); - continue; - } - job_registry_firstrec(rha,fd); - fseek(fd,0L,SEEK_SET); - - first=TRUE; - finalquery_start_date = time(0); - - while ((en = job_registry_get_next(rha, fd)) != NULL){ - - if((bupdater_lookup_active_jobs(&bact,en->batch_id) != BUPDATER_ACTIVE_JOBS_SUCCESS) && en->status!=REMOVED && en->status!=COMPLETED){ - - confirm_time=atoi(en->updater_info); - if(confirm_time==0){ - confirm_time=en->mdate; - } - - /* Assign Status=4 and ExitStatus=999 to all entries that after alldone_interval are still not in a final state(3 or 4)*/ - if(now-confirm_time>alldone_interval){ - AssignFinalState(en->batch_id); - free(en); - continue; - } - - if(en->status==IDLE && strlen(en->updater_info)>0){ - if (en->mdate < finalquery_start_date){ - finalquery_start_date=en->mdate; - } - do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status); - runfinal=TRUE; - }else if((now-confirm_time>finalstate_query_interval) && (now > next_finalstatequery)){ - if (en->mdate < finalquery_start_date){ - finalquery_start_date=en->mdate; - } - do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status); - runfinal=TRUE; - } - - - } - free(en); - } - - if(runfinal_oldlogs){ - FinalStateQuery(0,1); - runfinal_oldlogs=FALSE; - runfinal=FALSE; - }else if(runfinal){ - FinalStateQuery(finalquery_start_date,1); - runfinal=FALSE; - } - fclose(fd); - sleep(loop_interval); - } - - job_registry_destroy(rha); - - return 0; - -} - -int -ReceiveUpdateFromNetwork() -{ - char *proxy_path, *proxy_subject; - int timeout_ms = 0; - int ret, prret, rhret; - job_registry_entry *nen; - job_registry_entry *ren; - - proxy_path = NULL; - proxy_subject = NULL; - - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - - JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); - - if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ - if ((ret=job_registry_append(rha, nen)) < 0){ - fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); - perror(""); - } - }else{ - - if(ren->subject_hash!=NULL && strlen(ren->subject_hash) && ren->proxy_link!=NULL && strlen(ren->proxy_link)){ - JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,ren->subject_hash); - JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,ren->proxy_link); - }else{ - if (proxy_path != NULL && strlen(proxy_path) > 0){ - prret = job_registry_set_proxy(rha, nen, proxy_path); - if (prret < 0){ - do_log(debuglogfile, debug, 1, "%s: warning: setting proxy to %s\n",argv0,proxy_path); - fprintf(stderr,"%s: warning: setting proxy to %s: ",argv0,proxy_path); - perror(""); - /* Make sure we don't renew non-existing proxies */ - nen->renew_proxy = 0; - } - 
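
Stepping back from the per-field proxy handling: every ReceiveUpdateFromNetwork() variant in this patch follows the same receive loop, which is easy to lose in the details. A condensed sketch of that shared loop, using the registry calls exactly as they appear in these functions, with error paths trimmed:

    /* Condensed sketch of the common receive loop (error handling omitted). */
    job_registry_entry *nen, *ren;
    char *proxy_subject = NULL, *proxy_path = NULL;
    int timeout_ms = 0;

    while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,
                                              timeout_ms,
                                              &proxy_subject, &proxy_path))) {
        if ((ren = job_registry_get(rha, nen->batch_id)) == NULL) {
            /* Unknown job: append the received entry as-is. */
            job_registry_append(rha, nen);
        } else {
            /* Known job: update only when the incoming entry is newer. */
            if (job_registry_need_update(ren, nen, JOB_REGISTRY_UPDATE_ALL))
                job_registry_update(rha, nen);
            free(ren);
        }
        free(nen);
    }
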
free(proxy_path); - - nen->subject_hash[0] = '\000'; - if (proxy_subject != NULL && strlen(proxy_subject) > 0){ - job_registry_compute_subject_hash(nen, proxy_subject); - rhret = job_registry_record_subject_hash(rha, nen->subject_hash, proxy_subject, TRUE); - if (rhret < 0){ - do_log(debuglogfile, debug, 1, "%s: warning: recording proxy subject %s (hash %s)\n",argv0, proxy_subject, nen->subject_hash); - fprintf(stderr,"%s: warning: recording proxy subject %s (hash %s): ",argv0, proxy_subject, nen->subject_hash); - perror(""); - } - } - free(proxy_subject); - - } - } - if(job_registry_need_update(ren,nen,JOB_REGISTRY_UPDATE_ALL)){ - if ((ret=job_registry_update(rha, nen)) < 0){ - fprintf(stderr,"%s: Warning: job_registry_update returns %d: ",argv0,ret); - perror(""); - } - } - } - free(nen); - } - - return 0; -} - -int -IntStateQuery() -{ - - FILE *fp; - char *line=NULL; - char **token; - char **token_l; - char **token_e; - int maxtok_t=0; - int maxtok_l=0; - int maxtok_e=0; - job_registry_entry en; - int ret; - time_t tmstampepoch; - char *cp=NULL; - char *batch_str=NULL; - char *command_string=NULL; - job_registry_entry *ren=NULL; - int isresumed=FALSE; - int first=TRUE; - time_t now; - char *string_now=NULL; - - command_string=make_message("%s/scontrol -a show jobid",slurm_binpath); - fp = popen(command_string,"r"); - - en.status=UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); - en.exitcode=-1; - bupdater_free_active_jobs(&bact); - - if(fp!=NULL){ - while(!feof(fp) && (line=get_line(fp))){ - if(line && strlen(line)==0){ - free(line); - continue; - } - if ((cp = strrchr (line, '\n')) != NULL){ - *cp = '\0'; - } - do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line); - now=time(0); - string_now=make_message("%d",now); - maxtok_t = strtoken(line, ' ', &token); - if(line && strstr(line,"JobId=")){ - isresumed=FALSE; - if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, - JOB_REGISTRY_UPDATE_WN_ADDR| - JOB_REGISTRY_UPDATE_STATUS| - JOB_REGISTRY_UPDATE_UDATE| - JOB_REGISTRY_UPDATE_UPDATER_INFO| - JOB_REGISTRY_UPDATE_EXITCODE| - JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - } else { - if(ret==JOB_REGISTRY_SUCCESS){ - if (en.status == REMOVED || en.status == COMPLETED) { - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); - job_registry_unlink_proxy(rha, &en); - }else{ - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); - } - } - } - } - en.status = UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - en.exitcode=-1; - } - en.status = UNDEFINED; - maxtok_l = strtoken(token[0], '=', &token_l); - batch_str=strdup(token_l[1]); - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str); - 
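
The tokenizing pattern used throughout this IntStateQuery (and its LSF and PBS counterparts) is worth spelling out once: each "Key=Value" field is isolated with strtoken() and must be released with freetoken(). A minimal sketch, assuming strtoken() splits on the given character, returns the token count, and allocates the vector it stores through its third argument, as its uses in this file imply:

    /* Sketch: pulling the value out of one "JobState=RUNNING" style field. */
    char **token = NULL, **token_l = NULL;
    int maxtok_t, maxtok_l;

    maxtok_t = strtoken(line, ' ', &token);           /* split line into fields */
    if (maxtok_t > 0 && strstr(token[0], "JobState=")) {
        maxtok_l = strtoken(token[0], '=', &token_l); /* split "Key=Value"      */
        if (maxtok_l > 1 && strstr(token_l[1], "RUNNING"))
            en.status = RUNNING;
        freetoken(&token_l, maxtok_l);                /* free inner vector      */
    }
    freetoken(&token, maxtok_t);                      /* free outer vector      */
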
JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - en.exitcode=-1; - bupdater_push_active_job(&bact, en.batch_id); - free(batch_str); - freetoken(&token_l,maxtok_l); - if(!first) free(ren); - if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ - fprintf(stderr,"Get of record returns error "); - perror(""); - } - if(ren){ - if(strlen(ren->updater_info)>0){ - en.udate=ren->udate; - }else{ - en.udate=time(0); - } - } - first=FALSE; - - }else if(line && strstr(line," JobState=")){ - if(token[0] && strstr(line,"JobState=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - if(token_l[1] && strstr(token_l[1],"PENDING")){ - en.status=IDLE; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"RUNNING")){ - en.status=RUNNING; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"COMPLETED")){ - en.status=COMPLETED; - en.exitcode=0; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"CANCELLED")){ - en.status=REMOVED; - en.exitcode=-999; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"FAILED")){ - en.status=COMPLETED; - en.exitcode=0; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"SUSPENDED")){ - en.status=HELD; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"COMPLETING")){ - bupdater_remove_active_job(&bact, en.batch_id); - } - freetoken(&token_l,maxtok_l); - } - }else if(line && strstr(line," BatchHost=")){ - if(token[0] && strstr(line,"BatchHost=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - if(en.status!=IDLE){ - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,token_l[1]); - } - freetoken(&token_l,maxtok_l); - } - }else if(line && strstr(line," ExitCode=")){ - if(token[3] && strstr(line,"ExitCode=")){ - maxtok_l = strtoken(token[3], '=', &token_l); - maxtok_e = strtoken(token_l[1], ':', &token_e); - if(en.status==COMPLETED){ - en.exitcode=atoi(token_e[0]); - } - freetoken(&token_l,maxtok_l); - freetoken(&token_e,maxtok_e); - } - }else if(line && strstr(line," SubmitTime=")){ - if(en.status==IDLE){ - if(token[0] && strstr(line,"SubmitTime=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - }else if(line && strstr(line," StartTime=")){ - if(en.status==RUNNING){ - if(token[0] && strstr(line,"StartTime=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - if(en.status==COMPLETED || en.status==REMOVED){ - if(token[1] && strstr(line,"EndTime=")){ - maxtok_l = strtoken(token[1], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - }else if(line && strstr(line," SuspendTime=")){ - if(en.status==HELD){ - if(token[1] && strstr(line,"SuspendTime=")){ - maxtok_l = strtoken(token[1], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - } - - free(line); - free(string_now); - freetoken(&token,maxtok_t); - } - pclose(fp); - } - - if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, - 
JOB_REGISTRY_UPDATE_WN_ADDR| - JOB_REGISTRY_UPDATE_STATUS| - JOB_REGISTRY_UPDATE_UDATE| - JOB_REGISTRY_UPDATE_UPDATER_INFO| - JOB_REGISTRY_UPDATE_EXITCODE| - JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - } else { - if(ret==JOB_REGISTRY_SUCCESS){ - if (en.status == REMOVED || en.status == COMPLETED) { - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); - job_registry_unlink_proxy(rha, &en); - }else{ - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); - } - } - } - } - } - - free(ren); - free(command_string); - return 0; -} - -int -FinalStateQuery(time_t start_date, int logs_to_read) -{ - - FILE *fp; - char *line=NULL; - char **token; - char **token_l; - int maxtok_t=0; - int maxtok_l=0; - job_registry_entry en; - int ret; - time_t tmstampepoch; - char *cp=NULL; - char *command_string=NULL; - time_t now; - char *string_now=NULL; - job_registry_entry *ren=NULL; - - - command_string=make_message("%s/sacct -nap -o JobID,JobName,State,ExitCode,submit,start,end 2>/dev/null",slurm_binpath); - - fp = popen(command_string,"r"); - - do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); - - en.status=UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - - if(fp!=NULL){ - while(!feof(fp) && (line=get_line(fp))){ - if(line && strlen(line)==0){ - free(line); - continue; - } - if ((cp = strrchr (line, '\n')) != NULL){ - *cp = '\0'; - } - do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); - now=time(0); - string_now=make_message("%d",now); - maxtok_t = strtoken(line, '|', &token); - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,token[0]); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - if(token[2] && strstr(token[2],"COMPLETED")){ - en.status=COMPLETED; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token[2] && strstr(token[2],"CANCELLED")){ - en.status=REMOVED; - en.exitcode=-999; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token[2] && strstr(token[2],"FAILED")){ - en.status=COMPLETED; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - } - - tmstampepoch=str2epoch(token[6],"N"); - en.udate=tmstampepoch; - if(en.status==COMPLETED){ - maxtok_l = strtoken(token[3], ':', &token_l); - en.exitcode=atoi(token_l[0]); - freetoken(&token_l,maxtok_l); - } - - if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ - fprintf(stderr,"Get of record returns error "); - perror(""); - } - if(en.status!=UNDEFINED && en.status!=IDLE && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_select(rha, &en, - JOB_REGISTRY_UPDATE_UDATE | - JOB_REGISTRY_UPDATE_STATUS | - JOB_REGISTRY_UPDATE_UPDATER_INFO | - JOB_REGISTRY_UPDATE_EXITCODE | - JOB_REGISTRY_UPDATE_EXITREASON )) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - } else { - do_log(debuglogfile, debug, 2, "%s: f registry update in FinalStateQuery for: jobid=%s creamjobid=%s 
status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); - if (en.status == REMOVED || en.status == COMPLETED){ - job_registry_unlink_proxy(rha, &en); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); - } - } - } - } - free(string_now); - free(line); - freetoken(&token,maxtok_t); - free(ren); - } - pclose(fp); - } - - free(command_string); - return 0; -} - -int AssignFinalState(char *batchid){ - - job_registry_entry en; - int ret,i; - time_t now; - - now=time(0); - - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batchid); - en.status=COMPLETED; - en.exitcode=999; - en.udate=now; - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - - if ((ret=job_registry_update(rha, &en)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record %d returns %d: ",i,ret); - perror(""); - } - } else { - do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); - job_registry_unlink_proxy(rha, &en); - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); - } - } - } - - - return 0; -} - -void sighup() -{ - if(debug){ - fclose(debuglogfile); - if((debuglogfile = fopen(debuglogname, "a+"))==0){ - debug = 0; - } - } -} - -int -usage() -{ - printf("Usage: BUpdaterSLURM [OPTION...]\n"); - printf(" -o, --nodaemon do not run as daemon\n"); - printf(" -v, --version print version and exit\n"); - printf("\n"); - printf("Help options:\n"); - printf(" -?, --help Show this help message\n"); - printf(" --usage Display brief usage message\n"); - exit(EXIT_SUCCESS); -} - -int -short_usage() -{ - printf("Usage: BUpdaterSLURM [-ov?] [-o|--nodaemon] [-v|--version] [-?|--help] [--usage]\n"); - exit(EXIT_SUCCESS); -} - diff --git a/src/BUpdaterSLURM.h b/src/BUpdaterSLURM.h deleted file mode 100644 index d04203ce..00000000 --- a/src/BUpdaterSLURM.h +++ /dev/null @@ -1,72 +0,0 @@ -/* -# File: BUpdaterSLURM.h -# -# Author: Massimo Mezzadri -# e-mail: Massimo.Mezzadri@mi.infn.it -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -*/ - -#include "acconfig.h" - -#include "job_registry.h" -#include "job_registry_updater.h" -#include "Bfunctions.h" -#include "config.h" - -#define DEFAULT_LOOP_INTERVAL 5 - -#ifndef VERSION -#define VERSION "1.8.0" -#endif - -int ReceiveUpdateFromNetwork(); -int IntStateQuery(); -int FinalStateQuery(time_t start_date, int logs_to_read); -int AssignFinalState(char *batchid); -void sighup(); -int usage(); -int short_usage(); - -int runfinal=FALSE; -int runfinal_oldlogs=FALSE; -char *slurm_binpath; -char *registry_file; -int purge_interval=864000; -int finalstate_query_interval=30; -int alldone_interval=36000; -int next_finalstatequery=0; -int bupdater_consistency_check_interval=3600; -int debug=FALSE; -int nodmn=FALSE; - -bupdater_active_jobs bact; - -FILE *debuglogfile; -char *debuglogname=NULL; - -job_registry_handle *rha; -config_handle *cha; -config_entry *ret; -char *progname="BUpdaterSLURM"; - -struct pollfd *remupd_pollset = NULL; -int remupd_nfds; -job_registry_updater_endpoint *remupd_head = NULL; -job_registry_updater_endpoint *remupd_head_send = NULL; -config_entry *remupd_conf; diff --git a/src/Bfunctions.c b/src/Bfunctions.c index 63324e1b..c4bf06bd 100644 --- a/src/Bfunctions.c +++ b/src/Bfunctions.c @@ -316,8 +316,6 @@ str2epoch(char *str, char * f) strptime(str,"%a %b %d %T %Y",&tm); }else if(strcmp(f,"A")==0){ strptime(str,"%m/%d/%Y %T",&tm); - }else if(strcmp(f,"N")==0){ - strptime(str,"%Y-%m-%dT%T",&tm); }else if(strcmp(f,"W")==0){ /* If do not have the year in the date we compare day and month and set the year */ diff --git a/src/Makefile.am b/src/Makefile.am index 6283bde3..3370300a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,7 +2,7 @@ # * BLAHP daemon * # **************** # -# $Id: Makefile.am,v 1.60 2012/05/16 09:41:23 mezzadri Exp $ +# $Id: Makefile.am,v 1.57.2.3 2012/03/20 13:38:43 mezzadri Exp $ # # File: Makefile.am # @@ -48,7 +48,7 @@ endif sbin_PROGRAMS = blahpd_daemon blah_job_registry_add blah_job_registry_lkup blah_job_registry_scan_by_subject blah_check_config blah_job_registry_dump blah_job_registry_purge bin_PROGRAMS = blahpd -libexec_PROGRAMS = BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier BUpdaterLSF BUpdaterPBS BUpdaterSGE BUpdaterSLURM $(GLOBUS_EXECS) blparser_master +libexec_PROGRAMS = BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier BUpdaterLSF BUpdaterPBS BUpdaterSGE $(GLOBUS_EXECS) blparser_master noinst_PROGRAMS = test_job_registry_create test_job_registry_purge test_job_registry_update test_job_registry_access test_job_registry_update_from_network test_cmdbuffer common_sources = console.c job_status.c resbuffer.c server.c commands.c classad_binary_op_unwind.C classad_c_helper.C proxy_hashcontainer.c config.c job_registry.c blah_utils.c env_helper.c mapped_exec.c md5.c cmdbuffer.c @@ -136,9 +136,6 @@ BUpdaterPBS_LDADD = -lpthread -lm BUpdaterSGE_SOURCES = BUpdaterSGE.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c BUpdaterSGE_LDADD = -lpthread -BUpdaterSLURM_SOURCES = BUpdaterSLURM.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c job_registry_updater.c -BUpdaterSLURM_LDADD = -lpthread -lm - blparser_master_SOURCES = blparser_master.c config.c blah_utils.c blparser_master_LDADD = @@ -151,5 +148,5 @@ blah_job_registry_dump_CFLAGS = $(AM_CFLAGS) test_cmdbuffer_SOURCES = cmdbuffer.c test_cmdbuffer_CFLAGS = $(AM_CFLAGS) -DCMDBUF_DEBUG -noinst_HEADERS = blahpd.h classad_binary_op_unwind.h classad_c_helper.h commands.h job_status.h resbuffer.h server.h console.h BPRcomm.h tokens.h BLParserPBS.h 
BLParserLSF.h proxy_hashcontainer.h job_registry.h md5.h config.h BUpdaterCondor.h Bfunctions.h BNotifier.h BUpdaterLSF.h BUpdaterPBS.h BUpdaterSGE.h BUpdaterSLURM.h blah_utils.h env_helper.h mapped_exec.h blah_check_config.h BLfunctions.h cmdbuffer.h job_registry_updater.h +noinst_HEADERS = blahpd.h classad_binary_op_unwind.h classad_c_helper.h commands.h job_status.h resbuffer.h server.h console.h BPRcomm.h tokens.h BLParserPBS.h BLParserLSF.h proxy_hashcontainer.h job_registry.h md5.h config.h BUpdaterCondor.h Bfunctions.h BNotifier.h BUpdaterLSF.h BUpdaterPBS.h BUpdaterSGE.h blah_utils.h env_helper.h mapped_exec.h blah_check_config.h BLfunctions.h cmdbuffer.h job_registry_updater.h diff --git a/src/blah_job_registry_scan_by_subject.c b/src/blah_job_registry_scan_by_subject.c index daf11d00..f12a8d60 100644 --- a/src/blah_job_registry_scan_by_subject.c +++ b/src/blah_job_registry_scan_by_subject.c @@ -1,13 +1,11 @@ /* * File : blah_job_registry_scan_by_subject.c * - * Author : Francesco Prelz ($Author: fprelz $) + * Author : Francesco Prelz ($Author: mezzadri $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : * 5-May-2009 Original release - * 16-Jul-2012 Added statistics count of jobs. Allow empty hash. - * 16-Jul-2012 Added job user prefix filter. * * Description: * Executable to look up for entries in the BLAH job registry @@ -290,7 +288,7 @@ get_format_type(char *fmt, int which, int *totfmts) return result; } -#define USAGE_STRING "ERROR Usage: %s [-total] [<-s (proxy subject)>|<-h (proxy subject hash>] [-p (user prefix)] [-j job_status[\\|job_status]] \"Optional arg1 format\" arg1 \"Optional arg2 format\" arg2, etc.\n" +#define USAGE_STRING "ERROR Usage: %s (<-s (proxy subject)>|<-h (proxy subject hash>) [-j job_status[\\|job_status]] \"Optional arg1 format\" arg1 \"Optional arg2 format\" arg2, etc.\n" static void print_usage(char *name) @@ -325,86 +323,85 @@ main(int argc, char *argv[]) char *lookup_subject = NULL; char *lookup_hash = NULL; int cur_arg; - int format_args = -1; + int format_args; int select_by_job_status = 0; int ifr; int njobs = 0; format_type first_fmt; int nfmts; - int total_only = 0; - char *test_user_prefix = NULL; - int test_user_prefix_len = 0; - if (argc > 1) + if (argc < 2) { - cur_arg = 1; - - while (argv[cur_arg][0] == '-') + print_usage(argv[0]); + return 1; + } + + cur_arg = 1; + + while (argv[cur_arg][0] == '-') + { + format_args = -1; + /* Look up for command line switches */ + if (strlen(argv[cur_arg]) > 2) { - format_args = -1; - /* Look up for command line switches */ - if (strlen(argv[cur_arg]) > 2) + arg = argv[cur_arg] + 2; + if (argc > (cur_arg+1)) { - arg = argv[cur_arg] + 2; - if (argc > (cur_arg+1)) + format_args = cur_arg+1; + } + } + else if (argc > (cur_arg+1)) + { + arg = argv[cur_arg+1]; + if (argc > (cur_arg+2)) + { + format_args = cur_arg+2; + } + } + + if (strlen(arg) <= 0) + { + print_usage(argv[0]); + return 1; + } + + switch (argv[cur_arg][1]) + { + case 'h': + if (lookup_hash != NULL) { - format_args = cur_arg+1; + print_usage(argv[0]); + return 1; } - } - else if (argc > (cur_arg+1)) - { - arg = argv[cur_arg+1]; - if (argc > (cur_arg+2)) - { - format_args = cur_arg+2; - } - } - - if (strlen(arg) <= 0) - { - print_usage(argv[0]); - return 1; - } - - switch (argv[cur_arg][1]) - { - case 'h': - if (lookup_hash != NULL) - { - print_usage(argv[0]); - return 1; - } - lookup_hash = arg; - break; - case 's': - if (lookup_hash != NULL) - { - print_usage(argv[0]); - return 1; - } - job_registry_compute_subject_hash(&hen, 
arg); - lookup_subject = arg; - lookup_hash = hen.subject_hash; - break; - case 'j': - select_by_job_status = parse_job_state_condition(arg); - break; - case 't': - total_only = 1; - break; - case 'p': - test_user_prefix = arg; - test_user_prefix_len = strlen(arg); - break; - default: + lookup_hash = arg; + break; + case 's': + if (lookup_hash != NULL) + { print_usage(argv[0]); return 1; - } - if ((format_args > 0) && (format_args < argc)) cur_arg = format_args; - else break; + } + job_registry_compute_subject_hash(&hen, arg); + lookup_subject = arg; + lookup_hash = hen.subject_hash; + break; + case 'j': + select_by_job_status = parse_job_state_condition(arg); + break; + default: + print_usage(argv[0]); + return 1; } + if ((format_args > 0) && (format_args < argc)) cur_arg = format_args; + else break; } + if (lookup_hash == NULL) + { + print_usage(argv[0]); + return 1; + } + cha = config_read(NULL); /* Read config from default locations. */ if (cha != NULL) { @@ -449,23 +446,20 @@ main(int argc, char *argv[]) if (cha != NULL) config_free(cha); if (need_to_free_registry_file) free(registry_file); - if (lookup_hash != NULL) + looked_up_subject = job_registry_lookup_subject_hash(rha, lookup_hash); + if (looked_up_subject == NULL) { - looked_up_subject = job_registry_lookup_subject_hash(rha, lookup_hash); - if (looked_up_subject == NULL) + fprintf(stderr,"%s: Hash %s is not found in registry %s.\n",argv[0], + lookup_hash, rha->path); + job_registry_destroy(rha); + return 5; + } else { + if ((lookup_subject != NULL) && + (strcmp(looked_up_subject, lookup_subject) != 0)) { - fprintf(stderr,"%s: Hash %s is not found in registry %s.\n",argv[0], - lookup_hash, rha->path); - job_registry_destroy(rha); - return 5; - } else { - if ((lookup_subject != NULL) && - (strcmp(looked_up_subject, lookup_subject) != 0)) - { - fprintf(stderr, "%s: Warning: cached subject (%s) differs from the requested subject (%s)\n", argv[0], looked_up_subject, lookup_subject); - } - free(looked_up_subject); + fprintf(stderr, "%s: Warning: cached subject (%s) differs from the requested subject (%s)\n", argv[0], looked_up_subject, lookup_subject); } + free(looked_up_subject); } fd = job_registry_open(rha, "r"); @@ -491,31 +485,14 @@ main(int argc, char *argv[]) for (ifr = format_args; ifr < argc; ifr+=2) undo_escapes(argv[ifr]); } - if (lookup_hash == NULL) lookup_hash = ""; - while ((ren = job_registry_get_next_hash_match(rha, fd, lookup_hash)) != NULL) { /* Is the current entry in the requested job status ? 
*/ if ((select_by_job_status != 0) && (!check_job_state_condition(select_by_job_status, ren->status))) - { - free(ren); - continue; - } - - if ((test_user_prefix != NULL) && - (strncmp(ren->user_prefix, test_user_prefix, test_user_prefix_len) != 0)) - { - free(ren); continue; - } njobs++; - if (total_only != 0) - { - free(ren); - continue; - } cad = job_registry_entry_as_classad(rha, ren); if (cad != NULL) @@ -583,8 +560,5 @@ main(int argc, char *argv[]) fclose(fd); job_registry_destroy(rha); - if (total_only != 0) printf("%s: Matched entries: %d\n", argv[0], njobs); - - if (total_only && (njobs>0)) return ((njobs%127)+1); return 0; } diff --git a/src/job_registry.c b/src/job_registry.c index 27d137af..45515974 100644 --- a/src/job_registry.c +++ b/src/job_registry.c @@ -2,7 +2,7 @@ * File : job_registry.c * * - * Author : Francesco Prelz ($Author: mezzadri $) + * Author : Francesco Prelz ($Author: fprelz $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : @@ -359,7 +359,7 @@ job_registry_probe_next_record(FILE *fd, job_registry_entry *en) { size_t rsize = 0; size_t act_size; - long start_pos, end_pos; + long start_pos, end_pos, offset; int sret, eret, cret; int ic; size_t allowed_size_incs[N_JOB_REGISTRY_ALLOWED_ENTRY_SIZE_INCS] = @@ -400,7 +400,7 @@ job_registry_probe_next_record(FILE *fd, job_registry_entry *en) } else { - if (feof(fd)) en->magic_start = JOB_REGISTRY_MAGIC_START; + if (feof(fd)) en->magic_start == JOB_REGISTRY_MAGIC_START; break; } } @@ -452,8 +452,10 @@ job_registry_update_reg(const job_registry_handle *rha, { FILE *of, *nf; job_registry_entry en; + unsigned char *enendp; int encount; int rret, wret; + int i; of = fopen(old_path,"r"); if (of == NULL) return -1; @@ -522,6 +524,7 @@ job_registry_init(const char *path, char real_file_name[FILENAME_MAX]; int rlnk_status; mode_t old_umask; + const char *npu_tail="/npu"; int cfd; FILE *fd; char *old_lockfile, *old_npudir, *old_path=NULL; @@ -1494,7 +1497,7 @@ int job_registry_append_op(job_registry_handle *rha, job_registry_entry *entry, FILE *fd, time_t now) { - job_registry_recnum_t found; + job_registry_recnum_t found,curr_recn; job_registry_entry last; long curr_pos; int need_to_fclose = FALSE; @@ -1598,7 +1601,9 @@ job_registry_get_new_npufd(job_registry_handle *rha) FILE *rfd = NULL; int lfd; char *tp; + struct stat fst; const char *npu_tail="/npu_XXXXXX"; + int i; /* Append a filename to rha->npudir, so it can be passed back to */ /* jobregistry_construct_path */ @@ -1690,6 +1695,7 @@ int job_registry_merge_pending_nonpriv_updates(job_registry_handle *rha, FILE *fd) { + int i; int nadd = 0; int rapp; int frret; @@ -2364,7 +2370,7 @@ job_registry_open(job_registry_handle *rha, const char *mode) int job_registry_unlock(FILE *sfd) { - int fd; + int fd, lfd; struct flock ulock; int ret; @@ -2647,6 +2653,7 @@ job_registry_entry_as_classad(const job_registry_handle *rha, "CreateTime=%u; ModifiedTime=%u; UserTime=%u; " "SubmitterUid=%d; %s]"; char *result, *fmt_extra, *extra_attrs=NULL, *new_extra_attrs; + char *extra_attrs_append; int extra_attrs_size = 0; int need_to_free_extra_attrs = FALSE; int esiz,fsiz; @@ -3113,6 +3120,7 @@ job_registry_lookup_subject_hash(const job_registry_handle *rha, { FILE *fd; char subline[JOB_REGISTRY_MAX_SUBJECTLIST_LINE]; + int retcod; char *en; if (rha == NULL || hash == NULL) return NULL; diff --git a/src/job_registry_updater.c b/src/job_registry_updater.c index 57d8ecdf..9e66e8fe 100644 --- a/src/job_registry_updater.c +++ b/src/job_registry_updater.c @@ -2,13 +2,12 @@ * File 
: job_registry_updater.c * * - * Author : Francesco Prelz ($Author: drebatto $) + * Author : Francesco Prelz ($Author: fprelz $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : * 13-Jul-2011 Original release * 19-Jul-2011 Added transfer of full proxy subject and path. - * 19-Jul-2012 Exclude local addresses from possible destinations. * * Description: * Protocol to distribute network updates to the BLAH registry. @@ -42,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -191,12 +189,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, int n_added = 0; int is_multicast; struct ip_mreqn mreq4; - struct ifconf ifc; /* holds IOCTL return value for SIOCGIFCONF */ - int iofd = -1; - int ifconf_ret, numreqs = 30, n; - struct ifreq *ifr; /* points to one interface returned from ioctl */ - struct sockaddr_in *sin, *lin; - struct sockaddr_in6 *sin6, *lin6; if (endpoints == NULL) { @@ -211,34 +203,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, last = cur; } - /* Get local interface data via SIOCGIFCONF ioctl */ - iofd = socket (PF_INET, SOCK_DGRAM, 0); - if (iofd >= 0) - { - memset (&ifc, 0, sizeof(ifc)); - - ifc.ifc_buf = NULL; - - for (;;) - { - ifc.ifc_len = sizeof(struct ifreq) * numreqs; - ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len); - - if ((ifconf_ret = ioctl(iofd, SIOCGIFCONF, &ifc)) < 0) - { - break; - } - if (ifc.ifc_len == sizeof(struct ifreq) * numreqs) - { - /* assume it overflowed and try again */ - numreqs += 10; - continue; - } - break; - } - } - - for (i=0; i= 0) @@ -246,54 +210,12 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, /* Look for a workable address */ for (cur_ans = ai_ans; cur_ans != NULL; cur_ans = cur_ans->ai_next) { - /* Exclude local addresses */ - if ((iofd >= 0) && (ifconf_ret >= 0)) - { - /* loop through interfaces returned from SIOCGIFCONF */ - ifr = ifc.ifc_req; - for (n=0; n < ifc.ifc_len; n+=sizeof(struct ifreq)) - { - /* Get the interface address */ - if (ioctl(iofd,SIOCGIFADDR, ifr) == 0 ) - { - if (ifr->ifr_ifru.ifru_addr.sa_family == cur_ans->ai_family) - { - switch(cur_ans->ai_family) - { - case AF_INET: - /* IPV4 case */ - lin = (struct sockaddr_in *)&ifr->ifr_ifru.ifru_addr; - sin = (struct sockaddr_in *)(cur_ans->ai_addr); - if ((lin->sin_addr.s_addr) == (sin->sin_addr.s_addr)) - continue; - break; - case AF_INET6: - /* IPV6 case */ - lin6 = (struct sockaddr_in6 *)&ifr->ifr_ifru.ifru_addr; - sin6 = (struct sockaddr_in6 *)(cur_ans->ai_addr); - if (memcmp(lin6->sin6_addr.s6_addr, sin6->sin6_addr.s6_addr, 16) == 0) - continue; - break; - default: - /* Unknown family */ - break; - } - } - } - } - } - tfd = socket(cur_ans->ai_family, cur_ans->ai_socktype, cur_ans->ai_protocol); if (tfd < 0) { if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_SOCKET_FAIL; } else continue; @@ -319,11 +241,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MCAST_FAIL; } else continue; @@ -339,11 +256,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MCAST_FAIL; } else continue; @@ -362,11 +274,6 @@ job_registry_updater_setup_sender(char **destinations, 
int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MALLOC_FAIL; } else continue; @@ -382,11 +289,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_TTL_FAIL; } else continue; @@ -398,11 +300,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_CONNECT_FAIL; } else continue; @@ -424,11 +321,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, return pretcod; } } - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return n_added; } diff --git a/src/resbuffer.c b/src/resbuffer.c index 1a1782ed..cb9b6849 100644 --- a/src/resbuffer.c +++ b/src/resbuffer.c @@ -149,6 +149,7 @@ get_lines(void) * */ { char *res_lines = NULL; + char *reallocated; /* Acquire lock */ pthread_mutex_lock(&resbuffer_lock); diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 34f241b5..f1718096 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -2,11 +2,11 @@ # * BLAHP scripts * # ***************** # -# $Id: Makefile.am,v 1.11 2012/07/04 11:03:06 drebatto Exp $ +# $Id: Makefile.am,v 1.10 2012/01/13 11:23:44 mezzadri Exp $ # # File: Makefile.am # -# Author(s): Francesco Prelz ($Author: drebatto $) +# Author(s): Francesco Prelz ($Author: mezzadri $) # e-mail: "Francesco.Prelz@mi.infn.it" # # Revision history: @@ -36,7 +36,6 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ lsf_cancel.sh lsf_status.sh lsf_submit.sh lsf_hold.sh lsf_resume.sh \ condor_cancel.sh condor_status.sh condor_submit.sh condor_hold.sh condor_resume.sh \ sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \ - sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ - slurm_submit.sh slurm_status.sh slurm_cancel.sh + sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh EXTRA_DIST = $(bin_SCRIPTS) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index f48166e9..e2b2d52c 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -1,3 +1,4 @@ +#!/bin/bash # File: blah_common_submit_functions.sh # # Author: Francesco Prelz diff --git a/src/scripts/blah_load_config.sh b/src/scripts/blah_load_config.sh index c2b6cc67..1e340144 100755 --- a/src/scripts/blah_load_config.sh +++ b/src/scripts/blah_load_config.sh @@ -1,3 +1,4 @@ +#!/bin/bash # File: blah_load_config.sh # # Author: Francesco Prelz diff --git a/src/scripts/slurm_cancel.sh b/src/scripts/slurm_cancel.sh deleted file mode 100755 index ea67876a..00000000 --- a/src/scripts/slurm_cancel.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# File: slurm_cancel.sh -# -# Author: David Rebatto -# e-mail: David.Rebatto@mi.infn.it -# -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -. `dirname $0`/blah_load_config.sh - -jnr=0 -jc=0 -for job in $@ ; do - jnr=$(($jnr+1)) -done -for job in $@ ; do - requested=`echo $job | sed 's/^.*\///'` - cmdout=`${slurm_binpath}/scancel $requested 2>&1` - retcode=$? - if [ "$retcode" == "0" ] ; then - if [ "$jnr" == "1" ]; then - echo " 0 No\\ error" - else - echo .$jc" 0 No\\ error" - fi - else - escaped_cmdout=`echo $cmdout|sed "s/ /\\\\\ /g"` - if [ "$jnr" == "1" ]; then - echo " $retcode $escaped_cmdout" - else - echo .$jc" $retcode $escaped_cmdout" - fi - fi - jc=$(($jc+1)) -done - diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh deleted file mode 100755 index 56b862e3..00000000 --- a/src/scripts/slurm_status.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# File: slurm_status.sh -# -# Author: David Rebatto -# e-mail: David.Rebatto@mi.infn.it -# -# -# Revision history: -# 18-Jun-2012: Original release -# -# Description: -# Return a classad describing the status of a SLURM job -# -# Copyright (c) Members of the EGEE Collaboration. 2012. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -. `dirname $0`/blah_load_config.sh - -if [ "x$job_registry" != "x" ] ; then - ${blah_sbin_directory}/blah_job_registry_lkup $@ - exit 0 -else - echo "job registry not enabled (required for SLURM support)" >&2 - exit 1 -fi diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh deleted file mode 100755 index bafe6c27..00000000 --- a/src/scripts/slurm_submit.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -# -# File: slurm_submit.sh -# Author: David Rebatto (david.rebatto@mi.infn.it) -# -# Revision history: -# 14-Mar-2012: Original release -# -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -. 
`dirname $0`/blah_common_submit_functions.sh - -bls_parse_submit_options "$@" -bls_setup_all_files - -# Default values for configuration variables -slurm_std_storage=${slurm_std_storage:-/dev/null} -slurm_opt_prefix=${slurm_opt_prefix:-SBATCH} - -# Write wrapper preamble -cat >$bls_tmp_file << end_of_preamble -#!/bin/bash -# SLURM job wrapper generated by `basename $0` -# on `/bin/date` -# -# stgcmd = $bls_opt_stgcmd -# proxy_string = $bls_opt_proxy_string -# proxy_local_file = $bls_proxy_local_file -# -# SLURM directives: -#$slurm_opt_prefix -o $slurm_std_storage -#$slurm_opt_prefix -e $slurm_std_storage -end_of_preamble - -# Add site specific directives -bls_local_submit_attributes_file=${blah_libexec_directory}/slurm_local_submit_attributes.sh -bls_set_up_local_and_extra_args - -# Write SLURM directives according to command line options -# handle queue overriding -[ -z "$bls_opt_queue" ] || grep -q "^#$slurm_opt_prefix -p" $bls_tmp_file || - echo "#$slurm_opt_prefix -p $bls_opt_queue" >> $bls_tmp_file - -# Input sandbox setup -bls_fl_subst_and_dump inputsand "scp `hostname -f`:@@F_LOCAL @@F_REMOTE" >> $bls_tmp_file - -# The wrapper's body... -bls_add_job_wrapper - -# Output sandbox setup -echo "# Copy the output file back..." >> $bls_tmp_file -bls_fl_subst_and_dump outputsand "scp @@F_REMOTE `hostname -f`:@@F_LOCAL" >> $bls_tmp_file - -if [ "x$bls_opt_debug" = "xyes" ]; then - echo "Submit file written to $bls_tmp_file" - exit -fi - -############################################################### -# Submit the script -############################################################### - -datenow=`date +%Y%m%d` -jobID=`sbatch $bls_tmp_file | sed 's/Submitted batch job //'` -retcode=$? -if [ "$retcode" != "0" ] ; then - rm -f $bls_tmp_file - exit 1 -fi - -# Compose the blahp jobID ("slurm" + metadata + slurm jobid) -blahp_jobID="slurm/${datenow}/${jobID}" - -if [ "x$job_registry" != "x" ]; then - now=$((`date +%s` - 1)) - ${blah_sbin_directory}/blah_job_registry_add "$blahp_jobID" "$jobID" 1 $now "$bls_opt_creamjobid" "$bls_proxy_local_file" "$bls_opt_proxyrenew_numeric" "$bls_opt_proxy_subject" -fi - -echo "BLAHP_JOBID_PREFIX$blahp_jobID" - -bls_wrap_up_submit - -exit $retcode - From e4aa6ab3e00340123397f56de3551df839534942 Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Mon, 15 Oct 2012 16:40:57 +0200 Subject: [PATCH 002/169] Modified scripts for autotools-based build so that both the old --with options and the default (/usr) locations work. 
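
Both configure styles below should now work. This is an illustrative sketch only: the option names come from the macros in this patch, while the prefixes shown are placeholders rather than project defaults.

  # Default locations: probe /usr, falling back to pkg-config for Globus
  ./configure

  # Old-style explicit prefixes, as accepted before this change
  ./configure --with-classads-prefix=/opt/classads \
              --with-globus-prefix=/opt/globus \
              --with-manpage-stylesheet=/usr/share/sgml/docbook/xsl-stylesheets/manpages/docbook.xsl
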
--- Makefile.am | 2 +- configure.ac | 46 ++++++--- project/classads.m4 | 83 ++++++++++++++++ project/glite.m4 | 37 +++++++ project/globus.m4 | 233 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 386 insertions(+), 15 deletions(-) create mode 100644 project/classads.m4 create mode 100644 project/glite.m4 create mode 100644 project/globus.m4 diff --git a/Makefile.am b/Makefile.am index 7e097586..db07c2bb 100755 --- a/Makefile.am +++ b/Makefile.am @@ -25,7 +25,7 @@ doc_DATA = LICENSE SUBDIRS = src config doc ## Default flags to run aclocal -ACLOCAL_AMFLAGS = -I project -I ../org.glite/project +ACLOCAL_AMFLAGS = -I project stage: @set fnord $(MAKEFLAGS); amf=$$2; \ diff --git a/configure.ac b/configure.ac index 929b50e2..e8e53cdf 100755 --- a/configure.ac +++ b/configure.ac @@ -12,12 +12,12 @@ # Council for the Central Laboratory of the Research Councils (CCLRC), United Kingdom # # Authors: Francesco Prelz -# Version info: $Id: configure.ac,v 1.53.2.1 2012/06/08 13:09:09 pandreet Exp $ -# Release: $Name: glite-ce-blahp_B_1_18 $ +# Version info: $Id: configure.ac,v 1.54 2012/03/01 13:26:54 pandreet Exp $ +# Release: $Name: $ # # Revision history: # $Log: configure.ac,v $ -# Revision 1.53.2.1 2012/06/08 13:09:09 pandreet +# Revision 1.54 2012/03/01 13:26:54 pandreet # Changed version number # # Revision 1.53 2012/02/15 13:55:15 pandreet @@ -163,7 +163,7 @@ # AC_PREREQ(2.57) -AC_INIT([GLite CE blahp], [1.18.1]) +AC_INIT([GLite CE blahp], [1.19.0]) AC_CONFIG_AUX_DIR([./project]) AM_INIT_AUTOMAKE([1.6.3 subdir-objects]) AC_CONFIG_SRCDIR([src/main.c]) @@ -197,9 +197,23 @@ AC_HEADER_TIME dnl Checks for library functions. AC_CHECK_FUNCS(select socket strdup strerror bsearch vsnprintf mmap munmap) -GLITE_CHECK_LIBDIR -GLITE_CHECK_INITDIR -AC_GLITE_DOCBOOK +dnl GLITE_CHECK_LIBDIR +dnl GLITE_CHECK_INITDIR +dnl AC_GLITE_DOCBOOK + +GLITE_DB_MANPAGES_STYLESHEET="/usr/share/sgml/docbook/xsl-stylesheets/manpages/docbook.xsl" + +AC_ARG_WITH(manpage_stylesheet, + [ --with-manpage-stylesheet=PATH (path to the docbook stylesheet for man pages ($GLITE_DB_MANPAGES_STYLESHEET)], + [], + with_manpage_stylesheet=$GLITE_DB_MANPAGES_STYLESHEET) + +if ! test -r "$with_manpage_stylesheet" ; then + AC_MSG_ERROR("$with_manpage_stylesheet not found. Try setting --with-manpage-stylesheet.") +fi + +GLITE_DB_MANPAGES_STYLESHEET=$with_manpage_stylesheet +AC_SUBST(GLITE_DB_MANPAGES_STYLESHEET) AC_ARG_WITH(dist_location, [ --with-dist-location=PFX prefix where DIST location is. 
(pwd)],
@@ -221,13 +235,17 @@ AC_SUBST(DISTTAR)
AC_CLASSADS([], AC_MSG_RESULT(["CLASSADS ok"]), AC_MSG_ERROR(["CLASSADS not found"]))
-have_globus=yes
-PKG_CHECK_MODULES(GLOBUS_GSI_CRED, globus-gsi-credential, , have_globus=no)
-PKG_CHECK_MODULES(GLOBUS_GSI_PROXY, globus-gsi-proxy-core, , have_globus=no)
-PKG_CHECK_MODULES(GLOBUS_GSI_UTILS, globus-gsi-cert-utils, , have_globus=no)
-PKG_CHECK_MODULES(GLOBUS_GSS_ASSIST, globus-gss-assist, , have_globus=no)
-PKG_CHECK_MODULES(GLOBUS_GSI_SYSCFG, globus-gsi-sysconfig, , have_globus=no)
-AC_MSG_RESULT(["GLOBUS found $have_globus"])
+AC_GLOBUS([], have_globus=yes, have_globus=no)
+
+if test $have_globus = no; then
+  have_globus=yes
+  PKG_CHECK_MODULES(GLOBUS_GSI_CRED, globus-gsi-credential, , have_globus=no)
+  PKG_CHECK_MODULES(GLOBUS_GSI_PROXY, globus-gsi-proxy-core, , have_globus=no)
+  PKG_CHECK_MODULES(GLOBUS_GSI_UTILS, globus-gsi-cert-utils, , have_globus=no)
+  PKG_CHECK_MODULES(GLOBUS_GSS_ASSIST, globus-gss-assist, , have_globus=no)
+  PKG_CHECK_MODULES(GLOBUS_GSI_SYSCFG, globus-gsi-sysconfig, , have_globus=no)
+fi
+AC_MSG_RESULT(["GLOBUS found: $have_globus"])
AM_CONDITIONAL([HAVE_GLOBUS], [test "x$bprserver" == "xyes" -a "x$have_globus" == "xyes"])
dnl Temporarily built with no optimisation
diff --git a/project/classads.m4 b/project/classads.m4
new file mode 100644
index 00000000..82fa3795
--- /dev/null
+++ b/project/classads.m4
@@ -0,0 +1,83 @@
+dnl Usage:
+dnl AC_CLASSAD(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]])
+dnl - CLASSAD_CFLAGS (compiler flags)
+dnl - CLASSAD_LIBS (linker flags, stripping and path)
+dnl - CLASSAD_DL_LIBS
+dnl - CLASSAD_INSTALL_PATH
+dnl - CLASSAD_PATH
+
+AC_DEFUN([AC_CLASSADS],
+[
+ AC_ARG_WITH(classads_prefix,
+ [ --with-classads-prefix=PFX prefix where the Classad libraries are installed (/usr)],
+ [],
+ with_classads_prefix="/usr")
+
+ AC_MSG_CHECKING([for CLASSAD installation])
+
+ CLASSAD_CFLAGS=""
+ if test -n "$with_classads_prefix" ; then
+ AC_MSG_RESULT([prefix: $with_classads_prefix])
+
+ ac_classads_prefix=$with_classads_prefix
+ if test "$with_classads_prefix" != "/usr"; then
+
+ CLASSAD_CFLAGS="-I$with_classads_prefix/include -I$with_classads_prefix/include/classad"
+ CLASSAD_LIBS="-L$with_classads_prefix/lib -lclassad"
+ CLASSAD_DL_LIBS="-L$with_classads_prefix/lib -lclassad_dl"
+ else
+ CLASSAD_CFLAGS="-I$with_classads_prefix/include/classad"
+ CLASSAD_LIBS="-lclassad"
+ CLASSAD_DL_LIBS="-lclassad_dl"
+ fi
+ fi
+
+ AC_LANG_SAVE
+ AC_LANG_CPLUSPLUS
+ ac_save_cppflags=$CPPFLAGS
+ ac_save_libs=$LIBS
+ CPPFLAGS="$CLASSAD_CFLAGS $CPPFLAGS"
+ BASE_LIBS="$LIBS"
+ LIBS="$CLASSAD_LIBS $LIBS"
+ AC_MSG_CHECKING([if a small classads program compiles])
+ AC_TRY_LINK([ #include <classad_distribution.h> ],
+ [ classad::ClassAd ad; classad::ClassAdParser parser; ],
+ [ ac_have_classads=yes ], [ ac_have_classads=no ])
+ if test x$ac_have_classads = xno ; then
+ CLASSAD_CFLAGS="$CLASSAD_CFLAGS -DWANT_CLASSAD_NAMESPACE -DWANT_NAMESPACES"
+ CLASSAD_LIBS="-L$with_classads_prefix/lib -lclassad_ns"
+ LIBS="$CLASSAD_LIBS $BASE_LIBS"
+ CPPFLAGS="$CLASSAD_CFLAGS $ac_save_cppflags"
+ AC_TRY_LINK([ #include <classad_distribution.h> ],
+ [ classad::ClassAd ad; classad::ClassAdParser parser; ],
+ [ ac_have_classads=yes ], [ ac_have_classads=no ])
+ fi
+ AC_MSG_RESULT([$ac_have_classads])
+
+ CPPFLAGS=$ac_save_cppflags
+ LIBS=$ac_save_libs
+ AC_LANG_RESTORE
+
+ CLASSAD_PATH=$with_classads_prefix
+
+ if test x$ac_have_classads = xyes ; then
+ CLASSAD_INSTALL_PATH=$ac_classads_prefix
+ ifelse([$2], , :, [$2])
+ else
+ AC_MSG_WARN([
+ *** Cannot compile a small
classads program: check whether the + *** Condor ClassADs library is installed]) + CLASSAD_CFLAGS="" + CLASSAD_LIBS="" + CLASSAD_DL_LIBS="" + CLASSAD_PATH="" + ifelse([$3], , :, [$3]) + fi + + AC_SUBST(CLASSAD_INSTALL_PATH) + AC_SUBST(CLASSAD_CFLAGS) + AC_SUBST(CLASSAD_LIBS) + AC_SUBST(CLASSAD_DL_LIBS) + AC_SUBST(CLASSAD_PATH) +]) + diff --git a/project/glite.m4 b/project/glite.m4 new file mode 100644 index 00000000..55fe945f --- /dev/null +++ b/project/glite.m4 @@ -0,0 +1,37 @@ +dnl Usage: +dnl AC_GLITE +dnl - GLITE_LOCATION +dnl - GLITE_CFLAGS +dnl - DISTTAR + +AC_DEFUN([AC_GLITE], +[ + AC_ARG_WITH(glite_location, + [ --with-glite-location=PFX prefix where GLITE is installed. (/opt/glite)], + [], + with_glite_location=/opt/glite) + + if test -n "with_glite_location" ; then + GLITE_LOCATION="$with_glite_location" + GLITE_CFLAGS="-I$GLITE_LOCATION/include" + else + GLITE_LOCATION="" + GLITE_CFLAGS="" + fi + + AC_MSG_RESULT([GLITE_LOCATION set to $GLITE_LOCATION]) + + AC_SUBST(GLITE_LOCATION) + AC_SUBST(GLITE_CFLAGS) + + AC_ARG_WITH(dist_location, + [ --with-dist-location=PFX prefix where DIST location is. (pwd)], + [], + with_dist_location=$WORKDIR/../dist) + + DISTTAR=$with_dist_location + + AC_SUBST(DISTTAR) + +]) + diff --git a/project/globus.m4 b/project/globus.m4 new file mode 100644 index 00000000..f6e519d6 --- /dev/null +++ b/project/globus.m4 @@ -0,0 +1,233 @@ +dnl Usage: +dnl AC_GLOBUS(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]]) +dnl - GLOBUS_LOCATION +dnl - GLOBUS_NOTHR_FLAVOR +dnl - GLOBUS_THR_FLAVOR +dnl - GLOBUS_NOTHR_CFLAGS +dnl - GLOBUS_THR_CFLAGS +dnl - GLOBUS_NOTHR_LIBS +dnl - GLOBUS_THR_LIBS +dnl - GLOBUS_COMMON_NOTHR_LIBS +dnl - GLOBUS_COMMON_THR_LIBS +dnl - GLOBUS_STATIC_COMMON_NOTHR_LIBS +dnl - GLOBUS_STATIC_COMMON_THR_LIBS +dnl - GLOBUS_FTP_CLIENT_NOTHR_LIBS +dnl - GLOBUS_FTP_CLIENT_THR_LIBS +dnl - GLOBUS_SSL_NOTHR_LIBS +dnl - GLOBUS_SSL_THR_LIBS +dnl - GLOBUS_STATIC_SSL_NOTHR_LIBS +dnl - GLOBUS_STATIC_SSL_THR_LIBS +dnl - GLOBUS_GSS_NOTHR_LIBS +dnl - GLOBUS_GSS_THR_LIBS +dnl - GLOBUS_LDAP_THR_LIBS + +AC_DEFUN([AC_GLOBUS], +[ + AC_ARG_WITH(globus_prefix, + [ --with-globus-prefix=PFX prefix where GLOBUS is installed. 
($GLOBUS_LOCATION or /usr or pkg-config)], + [], + with_globus_prefix=${GLOBUS_LOCATION:-/usr}) + + AC_ARG_WITH(globus_nothr_flavor, + [ --with-globus-nothr-flavor=flavor [default=gcc32dbg]], + [], + with_globus_nothr_flavor=${GLOBUS_FLAVOR:-gcc32dbg}) + + AC_MSG_RESULT(["GLOBUS nothread flavor is $with_globus_nothr_flavor"]) + + AC_ARG_WITH(globus_thr_flavor, + [ --with-globus-thr-flavor=flavor [default=gcc32dbgpthr]], + [], + with_globus_thr_flavor=${GLOBUS_FLAVOR:-gcc32dbgpthr}) + + AC_MSG_RESULT(["GLOBUS thread flavor is $with_globus_thr_flavor"]) + + ac_cv_globus_nothr_valid=no + ac_cv_globus_thr_valid1=no + ac_cv_globus_thr_valid2=no + + GLOBUS_NOTHR_CFLAGS="$with_globus_prefix/include/$with_globus_nothr_flavor" + GLOBUS_THR_CFLAGS="$with_globus_prefix/include/$with_globus_thr_flavor" + + ac_globus_ldlib="-L$with_globus_prefix/lib" + + GLOBUS_COMMON_NOTHR_LIBS="$ac_globus_ldlib -lglobus_common_$with_globus_nothr_flavor" + GLOBUS_COMMON_THR_LIBS="$ac_globus_ldlib -lglobus_common_$with_globus_thr_flavor" + + GLOBUS_STATIC_COMMON_NOTHR_LIBS="$with_globus_prefix/lib/libglobus_common_$with_globus_nothr_flavor.a" + GLOBUS_STATIC_COMMON_THR_LIBS="$with_globus_prefix/lib/libglobus_common_$with_globus_thr_flavor.a" + + GLOBUS_FTP_CLIENT_NOTHR_LIBS="$ac_globus_ldlib -lglobus_ftp_client_$with_globus_nothr_flavor" + GLOBUS_FTP_CLIENT_THR_LIBS="$ac_globus_ldlib -lglobus_ftp_client_$with_globus_thr_flavor" + + GLOBUS_GSS_NOTHR_LIBS="$ac_globus_ldlib -lglobus_gssapi_gsi_$with_globus_nothr_flavor -lglobus_gss_assist_$with_globus_nothr_flavor" + GLOBUS_GSS_THR_LIBS="$ac_globus_ldlib -lglobus_gssapi_gsi_$with_globus_thr_flavor -lglobus_gss_assist_$with_globus_thr_flavor" + + GLOBUS_SSL_NOTHR_LIBS="$ac_globus_ldlib -lssl_$with_globus_nothr_flavor -lcrypto_$with_globus_nothr_flavor" + GLOBUS_SSL_THR_LIBS="$ac_globus_ldlib -lssl_$with_globus_thr_flavor -lcrypto_$with_globus_thr_flavor" + + GLOBUS_STATIC_SSL_NOTHR_LIBS="$with_globus_prefix/lib/libssl_$with_globus_nothr_flavor.a $with_globus_prefix/lib/libcrypto_$with_globus_nothr_flavor.a" + GLOBUS_STATIC_SSL_THR_LIBS="$with_globus_prefix/lib/libssl_$with_globus_thr_flavor.a $with_globus_prefix/lib/libcrypto_$with_globus_thr_flavor.a" + + GLOBUS_LDAP_THR_LIBS="$ac_globus_ldlib -lldap_$with_globus_thr_flavor -llber_$with_globus_thr_flavor" + + dnl Needed by LCAS/LCMAPS voms plugins + GLOBUS_GSI_NOTHR_LIBS="$ac_globus_ldlib -lglobus_gsi_credential_$with_globus_nothr_flavor" + GLOBUS_GSI_THR_LIBS="$ac_globus_ldlib -lglobus_gsi_credential_$with_globus_thr_flavor" + + dnl + dnl check nothr openssl header + dnl + ac_globus_nothr_ssl="$with_globus_prefix/include/$with_globus_nothr_flavor/openssl" + + AC_MSG_CHECKING([for $ac_globus_nothr_ssl/ssl.h]) + + if test ! 
-f "$ac_globus_nothr_ssl/ssl.h" ; then + ac_globus_nothr_ssl="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + AC_MSG_CHECKING([for openssl nothr]) + + if test -n "$ac_globus_nothr_ssl" ; then + GLOBUS_NOTHR_CFLAGS="-I$ac_globus_nothr_ssl -I$GLOBUS_NOTHR_CFLAGS" + fi + + if test -n "$ac_globus_nothr_ssl" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_NOTHR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "ssl.h" + #include "globus_gss_assist.h" + ], + [globus_gss_assist_ex aex], + [ac_cv_globus_nothr_valid=yes], + [ac_cv_globus_nothr_valid=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_nothr_valid]) + fi + + dnl + dnl check thr openssl header + dnl + ac_globus_thr_ssl="$with_globus_prefix/include/$with_globus_thr_flavor/openssl" + + AC_MSG_CHECKING([for $ac_globus_thr_ssl/ssl.h]) + + if test ! -f "$ac_globus_thr_ssl/ssl.h" ; then + ac_globus_thr_ssl="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + if test -n "$ac_globus_thr_ssl" ; then + GLOBUS_THR_CFLAGS="-I$ac_globus_thr_ssl -I$GLOBUS_THR_CFLAGS" + fi + + AC_MSG_CHECKING([checking openssl thr]) + + if test -n "$ac_globus_thr_ssl" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_THR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "openssl/ssl.h" + #include "globus_gss_assist.h" + ], + [globus_gss_assist_ex aex], + [ac_cv_globus_thr_valid1=yes], + [ac_cv_globus_thr_valid1=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_thr_valid1]) + fi + + dnl + dnl check thr ldap header + dnl + ac_globus_thr_ldap="$with_globus_prefix/include/$with_globus_thr_flavor" + + AC_MSG_CHECKING([for $ac_globus_thr_ldap/lber.h]) + + if test ! 
-f "$ac_globus_thr_ldap/lber.h" ; then + ac_globus_thr_ldap="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + AC_MSG_CHECKING([for ldap thr]) + + if test -n "$ac_globus_thr_ldap" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_THR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "ldap.h" + #include "lber.h" + ], + [ + LDAPMessage *ldresult; + BerElement *ber; + ], + [ac_cv_globus_thr_valid2=yes], + [ac_cv_globus_thr_valid2=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_thr_valid2]) + fi + + if test x$ac_cv_globus_nothr_valid = xyes -a x$ac_cv_globus_thr_valid1 = xyes -a x$ac_cv_globus_thr_valid2 = xyes ; then + GLOBUS_LOCATION=$with_globus_prefix + GLOBUS_NOTHR_FLAVOR=$with_globus_nothr_flavor + GLOBUS_THR_FLAVOR=$with_globus_thr_flavor + ifelse([$2], , :, [$2]) + else + GLOBUS_NOTHR_CFLAGS="" + GLOBUS_THR_CFLAGS="" + GLOBUS_NOTHR_LIBS="" + GLOBUS_THR_LIBS="" + GLOBUS_COMMON_NOTHR_LIBS="" + GLOBUS_COMMON_THR_LIBS="" + GLOBUS_STATIC_COMMON_NOTHR_LIBS="" + GLOBUS_STATIC_COMMON_THR_LIBS="" + GLOBUS_FTP_CLIENT_NOTHR_LIBS="" + GLOBUS_FTP_CLIENT_THR_LIBS="" + GLOBUS_SSL_NOTHR_LIBS="" + GLOBUS_SSL_THR_LIBS="" + GLOBUS_STATIC_SSL_NOTHR_LIBS="" + GLOBUS_STATIC_SSL_THR_LIBS="" + GLOBUS_LDAP_THR_LIBS="" + ifelse([$3], , :, [$3]) + fi + + AC_SUBST(GLOBUS_LOCATION) + AC_SUBST(GLOBUS_NOTHR_FLAVOR) + AC_SUBST(GLOBUS_THR_FLAVOR) + AC_SUBST(GLOBUS_NOTHR_CFLAGS) + AC_SUBST(GLOBUS_THR_CFLAGS) + AC_SUBST(GLOBUS_NOTHR_LIBS) + AC_SUBST(GLOBUS_THR_LIBS) + AC_SUBST(GLOBUS_COMMON_NOTHR_LIBS) + AC_SUBST(GLOBUS_COMMON_THR_LIBS) + AC_SUBST(GLOBUS_STATIC_COMMON_NOTHR_LIBS) + AC_SUBST(GLOBUS_STATIC_COMMON_THR_LIBS) + AC_SUBST(GLOBUS_FTP_CLIENT_NOTHR_LIBS) + AC_SUBST(GLOBUS_FTP_CLIENT_THR_LIBS) + AC_SUBST(GLOBUS_SSL_NOTHR_LIBS) + AC_SUBST(GLOBUS_SSL_THR_LIBS) + AC_SUBST(GLOBUS_STATIC_SSL_NOTHR_LIBS) + AC_SUBST(GLOBUS_STATIC_SSL_THR_LIBS) + AC_SUBST(GLOBUS_GSS_NOTHR_LIBS) + AC_SUBST(GLOBUS_GSS_THR_LIBS) + AC_SUBST(GLOBUS_LDAP_THR_LIBS) +]) + From fbb06f635fc25f8f5c1dc966d2b7af31217f0b35 Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Mon, 15 Oct 2012 16:45:33 +0200 Subject: [PATCH 003/169] Restored default location of Globus to /opt/globus. --- project/globus.m4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project/globus.m4 b/project/globus.m4 index f6e519d6..9e4edfd7 100644 --- a/project/globus.m4 +++ b/project/globus.m4 @@ -24,9 +24,9 @@ dnl - GLOBUS_LDAP_THR_LIBS AC_DEFUN([AC_GLOBUS], [ AC_ARG_WITH(globus_prefix, - [ --with-globus-prefix=PFX prefix where GLOBUS is installed. ($GLOBUS_LOCATION or /usr or pkg-config)], + [ --with-globus-prefix=PFX prefix where GLOBUS is installed. ($GLOBUS_LOCATION or /opt/globus or pkg-config)], [], - with_globus_prefix=${GLOBUS_LOCATION:-/usr}) + with_globus_prefix=${GLOBUS_LOCATION:-/opt/globus}) AC_ARG_WITH(globus_nothr_flavor, [ --with-globus-nothr-flavor=flavor [default=gcc32dbg]], From 81b5d550e3abd27121849a89c72b6651b9ba36db Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Tue, 6 Nov 2012 16:11:32 +0100 Subject: [PATCH 004/169] Ported change from CVS. 
---
 src/scripts/sge_submit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scripts/sge_submit.sh b/src/scripts/sge_submit.sh
index 5e05ed48..2d1b0fd1 100755
--- a/src/scripts/sge_submit.sh
+++ b/src/scripts/sge_submit.sh
@@ -83,7 +83,7 @@ fi
# Write SGE directives according to command line options
# handle queue overriding
[ -z "$bls_opt_queue" ] || grep -q "^#\$ -q" $bls_tmp_file || echo "#\$ -q $bls_opt_queue" >> $bls_tmp_file
-[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q"^#\$ -pe *\\*" $bls_tmp_file || echo "#\$ -pe * $bls_opt_mpinodes" >>$bls_tmp_file
+[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q "^#\$ -pe *\\*" $bls_tmp_file || echo "#\$ -pe * $bls_opt_mpinodes" >>$bls_tmp_file

# Input and output sandbox setup.
bls_fl_subst_and_accumulate inputsand "@@F_REMOTE@`hostname -f`:@@F_LOCAL" "@@@"

From fb395fb24d6c4072563fe8f991efc3c5e1eefbb2 Mon Sep 17 00:00:00 2001
From: Massimo Mezzadri
Date: Wed, 7 Nov 2012 10:27:19 +0100
Subject: [PATCH 005/169] added CMakeLists.txt files to compile with cmake

---
 CMakeLists.txt | 52 +++++++++++++++
 config/CMakeLists.txt | 24 +++++++
 doc/CMakeLists.txt | 47 ++++++++++++++
 src/CMakeLists.txt | 144 ++++++++++++++++++++++++++++++++++++++++++
 src/FindClassAd.cmake | 112 ++++++++++++++++++++++++++++
 src/acconfig.h | 0
 6 files changed, 379 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 config/CMakeLists.txt
 create mode 100644 doc/CMakeLists.txt
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/FindClassAd.cmake
 create mode 100644 src/acconfig.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..eb9e0045
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,52 @@
+# ****************
+# * BLAHP daemon *
+# ****************
+#
+# $Id: $
+#
+# File: CMakeLists.txt
+#
+# Author(s): Francesco Prelz ($Author: $)
+# e-mail: 
"Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 5-Nov-2012 Created + +cmake_minimum_required(VERSION 2.6) + +install(FILES + blah.config.template + blparser.conf.template + glite-ce-blah-parser + glite-ce-check-blparser + DESTINATION etc) + diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt new file mode 100644 index 00000000..d091e419 --- /dev/null +++ b/doc/CMakeLists.txt @@ -0,0 +1,47 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 5-Nov-2012 Created + +cmake_minimum_required(VERSION 2.6) + +find_program(XSLTPROC_EXECUTABLE xsltproc) + +find_file(XSLTPROC_MANPAGE_STYLESHEET + NAMES docbook.xsl + PATHS /usr/share/sgml/docbook/xsl-stylesheets/manpages) + +set(MAN1PAGES_TO_CREATE + blah_job_registry_add.1 + blah_job_registry_dump.1 + blah_job_registry_lkup.1 + blah_job_registry_scan_by_subject.1 + blah_check_config.1 + blahpd.1 +) + +foreach (manpage ${MAN1PAGES_TO_CREATE}) + string(REGEX REPLACE ".[1-9]$" ".xml" manpage_src ${manpage}) + add_custom_command(OUTPUT ${manpage} + COMMAND ${XSLTPROC_EXECUTABLE} + ${XSLTPROC_MANPAGE_STYLESHEET} + ${manpage_src} + DEPENDS ${manpage_src}) + set_source_files_properties(${manpage} PROPERTIES GENERATED TRUE) +endforeach() + +add_custom_target(all_manpages ALL + DEPENDS ${MAN1PAGES_TO_CREATE} ) + +install(FILES ${MAN1PAGES_TO_CREATE} DESTINATION man/man1) + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..b8d82f1e --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,144 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 26-Oct-2012 Created + +cmake_minimum_required(VERSION 2.6) + +include(FindClassAd.cmake) +include_directories(${ClassAd_INCLUDE_DIR}) +include(FindPkgConfig) + +pkg_check_modules(GLOBUS_COMMON globus-common) +include_directories(${GLOBUS_COMMON_INCLUDE_DIRS}) + +pkg_check_modules(GLOBUS_IO globus-io) +pkg_check_modules(GLOBUS_GSSAPI_GSI globus-gssapi-gsi) +pkg_check_modules(GLOBUS_GSS_ASSIST globus-gss-assist) +pkg_check_modules(GLOBUS_GSI_CREDENTIAL globus-gsi-credential) +pkg_check_modules(GLOBUS_GSI_PROXY_CORE globus-gsi-proxy-core) + +include_directories(.) 
+ +set (main_common_sources + console.c job_status.c resbuffer.c server.c commands.c + classad_binary_op_unwind.C classad_c_helper.C proxy_hashcontainer.c + config.c job_registry.c blah_utils.c env_helper.c mapped_exec.c md5.c + cmdbuffer.c) + +set (bupdater_common_sources + Bfunctions.c job_registry.c md5.c config.c blah_utils.c + job_registry_updater.c) + +# programs for 'sbin' +add_executable(blahpd_daemon main_daemon.c ${main_common_sources}) +set_target_properties(blahpd_daemon PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blahpd_daemon -lpthread ${ClassAd_LIBRARY}) +add_executable(blah_job_registry_add + blah_job_registry_add.c job_registry.c + job_registry_updater.c md5.c config.c) +add_executable(blah_job_registry_lkup + blah_job_registry_lkup.c job_registry.c md5.c config.c) +add_executable(blah_job_registry_scan_by_subject + blah_job_registry_scan_by_subject.c classad_c_helper.C + classad_binary_op_unwind.C job_registry.c md5.c config.c) +set_target_properties(blah_job_registry_scan_by_subject PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blah_job_registry_scan_by_subject ${ClassAd_LIBRARY}) +add_executable(blah_check_config + blah_check_config.c Bfunctions.c config.c blah_utils.c) +add_executable(blah_job_registry_dump + blah_job_registry_dump.c job_registry.c md5.c config.c) +add_executable(blah_job_registry_purge + blah_job_registry_purge.c job_registry.c md5.c) + +# programs for 'bin' +add_executable(blahpd main.c ${main_common_sources}) +set_target_properties(blahpd PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blahpd -lpthread ${ClassAd_LIBRARY}) + +# programs for 'libexec' +add_executable(BLClient BLClient.c blah_utils.c BLfunctions.c) +add_executable(BLParserLSF BLParserLSF.c blah_utils.c BLfunctions.c) +target_link_libraries(BLParserLSF -lpthread) +add_executable(BLParserPBS BLParserPBS.c blah_utils.c BLfunctions.c) +target_link_libraries(BLParserPBS -lpthread) +add_executable(BUpdaterCondor BUpdaterCondor.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterCondor -lpthread) +add_executable(BNotifier + BNotifier.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c) +target_link_libraries(BNotifier -lpthread) +add_executable(BUpdaterLSF BUpdaterLSF.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterLSF -lpthread -lm) +add_executable(BUpdaterPBS BUpdaterPBS.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterPBS -lpthread -lm) +add_executable(BUpdaterSGE + BUpdaterSGE.c Bfunctions.c job_registry.c md5.c config.c + blah_utils.c) +add_executable(blparser_master blparser_master.c config.c blah_utils.c) + +if (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) +add_executable(BPRclient BPRclient.c BPRcomm.c tokens.c) +target_link_libraries(BPRclient + ${GLOBUS_GSI_PROXY_CORE_LDFLAGS} + ${GLOBUS_GSI_CREDENTIALS_LDFLAGS} + ${GLOBUS_GSS_ASSIST_LDFLAGS}) +add_executable(BPRserver BPRserver.c BPRcomm.c tokens.c) +target_link_libraries(BPRserver + ${GLOBUS_GSI_PROXY_CORE_LDFLAGS} + ${GLOBUS_GSI_CREDENTIALS_LDFLAGS} + ${GLOBUS_GSS_ASSIST_LDFLAGS}) +set_target_properties(BPRserver PROPERTIES COMPILE_FLAGS "-static") +endif (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) + +# test programs +add_executable(test_job_registry_create test_job_registry_create.c job_registry.c md5.c) +add_executable(test_job_registry_purge test_job_registry_purge.c job_registry.c md5.c) +add_executable(test_job_registry_update test_job_registry_update.c job_registry.c md5.c) 
+add_executable(test_job_registry_access test_job_registry_access.c job_registry.c md5.c) +add_executable(test_job_registry_update_from_network + test_job_registry_update_from_network.c job_registry.c + job_registry_updater.c md5.c config.c) +add_executable(test_cmdbuffer cmdbuffer.c) +set_target_properties(test_cmdbuffer PROPERTIES COMPILE_FLAGS "-DCMDBUF_DEBUG") + +# CPack info + +install(TARGETS blahpd RUNTIME DESTINATION bin) +install(TARGETS + blahpd_daemon blah_job_registry_add blah_job_registry_lkup + blah_job_registry_scan_by_subject blah_check_config + blah_job_registry_dump blah_job_registry_purge + RUNTIME DESTINATION sbin) +install(TARGETS + BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier + BUpdaterLSF BUpdaterPBS BUpdaterSGE + blparser_master + RUNTIME DESTINATION libexec) +install(FILES + scripts/blah_load_config.sh scripts/blah_common_submit_functions.sh + scripts/pbs_cancel.sh scripts/pbs_status.sh scripts/pbs_submit.sh + scripts/pbs_hold.sh scripts/pbs_resume.sh scripts/lsf_cancel.sh + scripts/lsf_status.sh scripts/lsf_submit.sh scripts/lsf_hold.sh + scripts/lsf_resume.sh scripts/condor_cancel.sh scripts/condor_status.sh + scripts/condor_submit.sh scripts/condor_hold.sh scripts/condor_resume.sh + scripts/sge_cancel.sh scripts/sge_helper scripts/sge_resume.sh + scripts/sge_submit.sh scripts/sge_filestaging scripts/sge_hold.sh + scripts/sge_status.sh scripts/runcmd.pl.template + scripts/sge_local_submit_attributes.sh + DESTINATION libexec) + +if (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) +install(TARGETS BPRclient BPRserver RUNTIME DESTINATION libexec) +endif (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) + +include(CPack) diff --git a/src/FindClassAd.cmake b/src/FindClassAd.cmake new file mode 100644 index 00000000..a5a3e4b5 --- /dev/null +++ b/src/FindClassAd.cmake @@ -0,0 +1,112 @@ +# - Finds Condor Classified Ad (Classad) binary distribution. +# The following variables are set: +# ClassAd_CXX_FLAGS - flags to add to the CXX compiler for Classad support +# CLASSAD_FOUND - true if the Classad distribution is detected +# +# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/ + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) +include(FindPackageHandleStandardArgs) + +set(ClassAd_INCLUDE_PATH_DESCRIPTION "top-level directory containing the Condor ClassAd include directories. E.g /opt/classad/include") +set(ClassAd_INCLUDE_DIR_MESSAGE "Set the ClassAd_INCLUDE_DIR cmake cache entry to the ${ClassAd_INCLUDE_PATH_DESCRIPTION}") +set(ClassAd_LIBRARY_PATH_DESCRIPTION "top-level directory containing the Condor ClassAd libraries.") +set(ClassAd_LIBRARY_DIR_MESSAGE "Set the ClassAd_LIBRARY_DIR cmake cache entry to the ${ClassAd_LIBRARY_PATH_DESCRIPTION}") + +find_path(ClassAd_INCLUDE_DIR + NAMES classad_distribution.h + PATHS + # Look in other places. + ${ClassAd_ROOT_DIRECTORIES} + PATH_SUFFIXES + classad + include + # Help the user find it if we cannot. 
+ DOC "The ${ClassAd_INCLUDE_DIR_MESSAGE}" +) + +message(STATUS "ClassAd_INCLUDE_DIR == " ${ClassAd_INCLUDE_DIR}) + +# The ClassAd library (should have namespaces enabled). +set (ClassAd_LIBRARY_TO_FIND classad_ns) + +# Setting some more prefixes for the library +set (ClassAd_LIB_PREFIX "") +if ( WIN32 ) + set (ClassAd_LIB_PREFIX ${ClassAd_LIB_PREFIX} "lib") + set ( ClassAd_LIBRARY_TO_FIND ${ClassAd_LIB_PREFIX}${ClassAd_LIBRARY_TO_FIND}) +endif() + +find_library( ClassAd_LIBRARY + NAMES ${ClassAd_LIBRARY_TO_FIND} + PATHS + ${ClassAd_LIBRARY_DIR} + PATH_SUFFIXES + lib +) + +get_filename_component(ClassAd_LIBRARY_DIR ${ClassAd_LIBRARY} PATH) +message(STATUS "ClassAd_LIBRARY == " ${ClassAd_LIBRARY}) + +# sample Classad source code to test +set(ClassAd_CXX_TEST_SOURCE +" +#include +classad::ClassAd ad; +classad::ClassAdParser parser; + +int +main(int argc, char *argv[]) +{ +} +") + +set(ClassAd_CXX_FLAG_CANDIDATES + "-DWANT_NAMESPACES" + "-DWANT_NAMESPACES -DWANT_CLASSAD_NAMESPACE" +) + +# check cxx compiler +foreach(FLAG ${ClassAd_CXX_FLAG_CANDIDATES}) + set(SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") + set(CMAKE_REQUIRED_FLAGS "${FLAG}") + set(CMAKE_REQUIRED_LIBRARIES "${ClassAd_LIBRARY}") + set(CMAKE_REQUIRED_INCLUDES "${ClassAd_INCLUDE_DIR}") + unset(ClassAd_FLAG_DETECTED CACHE) + message(STATUS "Try Classad CXX flag = [${FLAG}] (library = [${ClassAd_LIBRARY}])") + check_cxx_source_compiles("${ClassAd_CXX_TEST_SOURCE}" ClassAd_FLAG_DETECTED) + set(CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") + set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}") + if(ClassAd_FLAG_DETECTED) + set(ClassAd_CXX_FLAGS_INTERNAL "${FLAG}") + break() + endif(ClassAd_FLAG_DETECTED) +endforeach(FLAG ${ClassAd_CXX_FLAG_CANDIDATES}) + +set(ClassAd_CXX_FLAGS "${ClassAd_CXX_FLAGS_INTERNAL}" + CACHE STRING "C++ compiler flags for use of the Condor Classad library") +message(STATUS "ClassAd_CXX_FLAGS == " ${ClassAd_CXX_FLAGS}) +# handle the standard arguments for find_package +find_package_handle_standard_args(ClassAd DEFAULT_MSG + ClassAd_LIBRARY ClassAd_INCLUDE_DIR) + +mark_as_advanced( + ClassAd_CXX_FLAGS + ClassAd_LIBRARY +) diff --git a/src/acconfig.h b/src/acconfig.h new file mode 100644 index 00000000..e69de29b From 47b88f5d88eb31944428a86c08dd1415fda81862 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Wed, 7 Nov 2012 10:31:48 +0100 Subject: [PATCH 006/169] fix for bug #97491 --- src/BUpdaterLSF.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/BUpdaterLSF.c b/src/BUpdaterLSF.c index 4c3ba5b3..6549cff1 100644 --- a/src/BUpdaterLSF.c +++ b/src/BUpdaterLSF.c @@ -484,12 +484,13 @@ int main(int argc, char *argv[]){ } /* Try to run FinalStateQuery reading older log files*/ - if(now-confirm_time>bhist_finalstate_interval){ - runfinal_oldlogs=TRUE; - free(en); - continue; - } - + if(now-confirm_time>bhist_finalstate_interval && use_bhist_for_idle && strcmp(use_bhist_for_idle,"yes")==0){ + do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d from old logs\n",argv0,en->batch_id,en->status); + runfinal_oldlogs=TRUE; + free(en); + continue; + } + if(en->status==IDLE && strlen(en->updater_info)>0 && use_bhist_for_idle && strcmp(use_bhist_for_idle,"yes")==0){ if (en->mdate < finalquery_start_date){ finalquery_start_date=en->mdate; 
From db6f50c743239d55894b3296cdeadb5623af8236 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Wed, 7 Nov 2012 10:53:26 +0100 Subject: [PATCH 007/169] changed 1.19 to 1.18 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb9e0045..f7be238d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ endif (UNIX AND NOT APPLE) set(CPACK_PACKAGE_VENDOR "EMI") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Batch Local ASCII Helper Protocol suite") set(CPACK_PACKAGE_VERSION_MAJOR "1") -set(CPACK_PACKAGE_VERSION_MINOR "19") +set(CPACK_PACKAGE_VERSION_MINOR "18") set(CPACK_PACKAGE_VERSION_PATCH "0") set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") From 9dea51ee2897be76cd2b43e5d894ea7bb9aead06 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Thu, 8 Nov 2012 11:45:54 +0100 Subject: [PATCH 008/169] changed path to some config files in rpm --- CMakeLists.txt | 12 ++++++++++-- config/CMakeLists.txt | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7be238d..a98c49fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,13 +19,17 @@ project(BLAH) # CPack info +set(CPACK_RPM_PACKAGE_RELEASE "1") + if (UNIX AND NOT APPLE) set(CPACK_GENERATOR "STGZ;DEB;RPM") execute_process(COMMAND uname -i OUTPUT_VARIABLE local_arch OUTPUT_STRIP_TRAILING_WHITESPACE) -set(CPACK_SYSTEM_NAME ${local_arch}) +set(CPACK_SYSTEM_NAME "${CPACK_RPM_PACKAGE_RELEASE}.${local_arch}") endif (UNIX AND NOT APPLE) +set(CMAKE_INSTALL_PREFIX "/usr") + set(CPACK_PACKAGE_VENDOR "EMI") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Batch Local ASCII Helper Protocol suite") set(CPACK_PACKAGE_VERSION_MAJOR "1") @@ -34,9 +38,13 @@ set(CPACK_PACKAGE_VERSION_PATCH "0") set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") +# For the following setting, see CMAKE manual, page 155 +# or http://public.kitware.com/Bug/view.php?id=7000 +set(CPACK_SET_DESTDIR ON) +set(CPACK_PACKAGE_RELOCATABLE "false") + set(CPACK_PACKAGE_CONTACT "blah@mi.infn.it") set(CPACK_RPM_PACKAGE_LICENSE "Apache Software License") -set(CPACK_RPM_PACKAGE_RELEASE "0") set(CPACK_RPM_PACKAGE_GROUP "Applications/Internet") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt index 54882f81..d16d9370 100644 --- a/config/CMakeLists.txt +++ b/config/CMakeLists.txt @@ -18,7 +18,10 @@ cmake_minimum_required(VERSION 2.6) install(FILES blah.config.template blparser.conf.template + DESTINATION /etc) + +install(FILES glite-ce-blah-parser glite-ce-check-blparser - DESTINATION etc) + DESTINATION /etc/rc.d/init.d) From b8829815b1226717e611c883d1df9d43ca41edf5 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Fri, 9 Nov 2012 11:17:03 +0100 Subject: [PATCH 009/169] added description to rpm --- CMakeLists.txt | 6 ++++-- blah_description.txt | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 blah_description.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index a98c49fc..bd700f71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,11 +15,12 @@ cmake_minimum_required(VERSION 2.6) +#project(glite-ce-blahp) project(BLAH) # CPack info -set(CPACK_RPM_PACKAGE_RELEASE "1") +set(CPACK_RPM_PACKAGE_RELEASE "0") if (UNIX AND NOT APPLE) set(CPACK_GENERATOR "STGZ;DEB;RPM") @@ -32,9 +33,10 @@ set(CMAKE_INSTALL_PREFIX "/usr") set(CPACK_PACKAGE_VENDOR "EMI") 
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Batch Local ASCII Helper Protocol suite") +set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/blah_description.txt") set(CPACK_PACKAGE_VERSION_MAJOR "1") set(CPACK_PACKAGE_VERSION_MINOR "18") -set(CPACK_PACKAGE_VERSION_PATCH "0") +set(CPACK_PACKAGE_VERSION_PATCH "1") set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") diff --git a/blah_description.txt b/blah_description.txt new file mode 100644 index 00000000..70d04f67 --- /dev/null +++ b/blah_description.txt @@ -0,0 +1,2 @@ +The BLAHP daemon is a light component accepting commands to manage jobs on different Local Resources Management Systems + From 9c5d4746e0bbf1632e4e26cc085e4c6a9df52133 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Fri, 9 Nov 2012 11:19:44 +0100 Subject: [PATCH 010/169] added executable bit to shell scripts and startup scripts --- config/CMakeLists.txt | 3 +++ src/CMakeLists.txt | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt index d16d9370..58de7947 100644 --- a/config/CMakeLists.txt +++ b/config/CMakeLists.txt @@ -23,5 +23,8 @@ install(FILES install(FILES glite-ce-blah-parser glite-ce-check-blparser + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE DESTINATION /etc/rc.d/init.d) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b8d82f1e..549b40c2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -124,7 +124,8 @@ install(TARGETS BUpdaterLSF BUpdaterPBS BUpdaterSGE blparser_master RUNTIME DESTINATION libexec) -install(FILES + +set(blah_scripts scripts/blah_load_config.sh scripts/blah_common_submit_functions.sh scripts/pbs_cancel.sh scripts/pbs_status.sh scripts/pbs_submit.sh scripts/pbs_hold.sh scripts/pbs_resume.sh scripts/lsf_cancel.sh @@ -135,6 +136,13 @@ install(FILES scripts/sge_submit.sh scripts/sge_filestaging scripts/sge_hold.sh scripts/sge_status.sh scripts/runcmd.pl.template scripts/sge_local_submit_attributes.sh + ) + +install(FILES + ${blah_scripts} + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE DESTINATION libexec) if (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) From 6e5de335ba116a2906ae68188db2c7db2c64da69 Mon Sep 17 00:00:00 2001 From: Massimo Mezzadri Date: Thu, 15 Nov 2012 11:27:47 +0100 Subject: [PATCH 011/169] fix for bug #98855 --- src/Bfunctions.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Bfunctions.c b/src/Bfunctions.c index c4bf06bd..2da92162 100644 --- a/src/Bfunctions.c +++ b/src/Bfunctions.c @@ -603,6 +603,7 @@ bupdater_remove_active_job(bupdater_active_jobs *bact, if (cmp == 0) { /* Job ID found. Remove it from list. 
 */
+				free(bact->jobs[cur]);
 				for (resize = cur+1; resize < bact->njobs; resize++)
 				{
 					bact->jobs[resize - 1] = bact->jobs[resize];

From 37efb5b7b84a171ec7381cf8173f08182981a27a Mon Sep 17 00:00:00 2001
From: Massimo Mezzadri
Date: Thu, 22 Nov 2012 10:18:17 +0100
Subject: [PATCH 012/169] added -a flag when bjobsinfo is called to have a
 longer persistency of job info in the output of bjobsinfo

---
 src/BUpdaterLSF.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/BUpdaterLSF.c b/src/BUpdaterLSF.c
index 6549cff1..5b99d631 100644
--- a/src/BUpdaterLSF.c
+++ b/src/BUpdaterLSF.c
@@ -621,7 +621,7 @@ IntStateQueryCustom()
 	int wexitcode=0;
 	int wexitinfo=0;

-	command_string=make_message("%s%s/bjobsinfo",batch_command,btools_path);
+	command_string=make_message("%s%s/bjobsinfo -a",batch_command,btools_path);
 	fp = popen(command_string,"r");

 	do_log(debuglogfile, debug, 3, "%s: command in IntStateQueryCustom is:%s\n",argv0,command_string);

From b29d6dd386c23824fe0b9d137d070a62f52375d2 Mon Sep 17 00:00:00 2001
From: Massimo Mezzadri
Date: Mon, 3 Dec 2012 14:48:48 +0100
Subject: [PATCH 013/169] added all possible config variables in the template

---
 config/blah.config.template | 291 +++++++++++++++++++++++++-----------
 1 file changed, 201 insertions(+), 90 deletions(-)

diff --git a/config/blah.config.template b/config/blah.config.template
index c4fcee11..369211fe 100644
--- a/config/blah.config.template
+++ b/config/blah.config.template
@@ -1,24 +1,85 @@
-#Supported batch systems
-supported_lrms=pbs,lsf
+##
+#####Common section
+##
+
+##Blah common variables
+
+#Supported batch systems (e.g. pbs,lsf)
+supported_lrms=

 #DGAS logfile
 BLAHPD_ACCOUNTING_INFO_LOG=

 #Set to yes if you wish to disable BLAH's machinery for transferring
-#or delegating proxies to the worker node where a job is running.
-blah_disable_wn_proxy_renewal=no
+#or delegating proxies to the worker node where a job is running. (default = no)
+blah_disable_wn_proxy_renewal=

 #Set to yes to enable delegation (instead of copy) of renewed proxies
 #to worker nodes. NOTE: limited *and* delegated proxies are not
 #accepted for default GSI authentication as of VDT 1.2, so this should
-#be enabled only if non-limited proxies are used for proxy renewal.
-blah_delegate_renewed_proxies=no
+#be enabled only if non-limited proxies are used for proxy renewal. (default = no)
+blah_delegate_renewed_proxies=
+
+#max number of concurrent threads to serve commands (default = 500)
+#blah_max_threaded_cmds=100
+
+#Colon-separated list of paths that are shared among batch system
+#head and worker nodes.
+#blah_shared_directories=/home:/users
+
+#By default the job temporary work directory is created as a subdirectory
+#of wherever the batch system is configured to land the job.
+#This variable changes the location where the work directory is created.
+#A shell variable escaped or in single quotes will be resolved on the
+#worker node in the job environment. Non-escaped variables will be resolved
+#on the submit node in the submit environment.
+#blah_wn_temporary_home_dir='$GLITE_LOCATION_TMP'
+
+#These two attributes allow changing the directory on the worker node where
+#the batch system is instructed to transfer input/output sandbox files to
+#and from.
+#These can be set in case the batch system default is not good enough
+#(e.g.: the batch system leaves output files behind)
+#These variables can be resolved on the submit node -only-.
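+#(Illustrative example, not part of the original template: a setting such
+# as blah_wn_inputsandbox=$TMPDIR/in would be expanded on the submit node
+# when the config file is parsed, never on the worker node.)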
+#blah_wn_inputsandbox=/tmp +#blah_wn_outputsandbox=/tmp + +#The following configuration attributes allow for additional +#submit command attributes to be set in the local shell callout +#for batch system customizations. +# +#Set this variable to pass all submit command attributes: +#blah_pass_all_submit_attributes=yes +# +#-Or- select individual attributes as follows: +#blah_pass_submit_attributes[0]="x509UserProxySubject" +#blah_pass_submit_attributes[1]="x509UserProxy" + +#timeout before blah kill a process (default 20) +blah_graceful_kill_timeout= + +#Enable condor/glexec commands (default no) +blah_enable_glexec_from_condor= + +#umask for accounting log +blah_accounting_log_umask= + +#interval between two child consecutive restart (default 150) +blah_children_restart_interval= + +#if blah requires proxy on submit (default no) +blah_require_proxy_on_submit= + +#disable proxy user copy (default no) +blah_disable_proxy_user_copy= + +##PBS common variables #Path where PBS executables are located -pbs_binpath=/usr/pbs/bin +pbs_binpath= #Path where the PBS logs are located ($pbs_spoolpath/server_logs) -pbs_spoolpath=/usr/spool/PBS +pbs_spoolpath= #If it is set to yes blah does not check the jobid in the logfiles pbs_nochecksubmission= @@ -31,14 +92,40 @@ pbs_nologaccess= #locally from the logs if BLParser is not present pbs_fallback=no + +##LSF common variables + +#Path where LSF executables are located +lsf_binpath= + +#Path where the LSF conf file is located ($lsf_confpath/lsf.conf) +lsf_confpath= + +#If it is set to yes blah does not check the jobid in the logfiles +lsf_nochecksubmission= + +#If it is set to yes blah does NOT use log files to get job status, +#but uses only standard LRMS query (bhist) +lsf_nologaccess= + +#If it is set to no blah scripts for LSF will not try to read +#locally from the logs if BLParser is not present +lsf_fallback=no + +## +#####BLParser section +## + +##PBS subsection + #Set to yes to use Blah Log Parser for PBS pbs_BLParser= #Host where Blah Log Parser for PBS is running -pbs_BLPserver=127.0.0.1 +pbs_BLPserver= #Port where Blah Log Parser for PBS is running -pbs_BLPport=33332 +pbs_BLPport= #Number of Blah Log Parser to try for PBS (if it is null pbs_BLPserver and pbs_BLPport are used) pbs_num_BLParser= @@ -61,33 +148,16 @@ pbs_BLPport2= # use it as failback solution if neither 'yes' nor 'no' works for you. 
blah_torque_multiple_staging_directive_bug=no -#### - -#Path where LSF executables are located -lsf_binpath=/usr/local/lsf/bin - -#Path where the LSF conf file is located ($lsf_confpath/lsf.conf) -lsf_confpath=/etc - -#If it is set to yes blah does not check the jobid in the logfiles -lsf_nochecksubmission= - -#If it is set to yes blah does NOT use log files to get job status, -#but uses only standard LRMS query (bhist) -lsf_nologaccess= - -#If it is set to no blah scripts for LSF will not try to read -#locally from the logs if BLParser is not present -lsf_fallback=no +##LSF subsection #Set to yes to use Blah Log Parser for LSF lsf_BLParser= #Host where Blah Log Parser for LSF is running -lsf_BLPserver=127.0.0.1 +lsf_BLPserver= #Port where Blah Log Parser for LSF is running -lsf_BLPport=33333 +lsf_BLPport= #Number of Blah Log Parser to try for LSF (if it is null lsf_BLPserver and lsf_BLPport are used) lsf_num_BLParser= @@ -98,32 +168,28 @@ lsf_BLPport1= lsf_BLPserver2= lsf_BLPport2= -# -#LSF Updater -# -#number of logs to be read by bhist (default:3) -bhist_logs_to_read= -# -# Condor -# +## +#####BUpdater/BNotifier section +## + +#seconds to sleep in the main loop +loop_interval= + +## +#####BUpdater subsection +## + +##Common BUpdater variables #Updater location bupdater_path= -#Notifier location -bnotifier_path= - #Updater pid file bupdater_pidfile=/var/tmp/cream_tomcat_bupdater.pid -#Notifier pid file -bnotifier_pidfile=/var/tmp/cream_tomcat_bnotifier.pid - -#condor bin location -condor_binpath=/opt/condor-c/bin - #Registry file location job_registry=/var/tmp/cream_tomcat_registry.db + #Set the following variable to 'yes' to have multiple BLAHPD instances #share the job registry -index- via mmap: job_registry_use_mmap=no @@ -140,68 +206,113 @@ bupdater_debug_level=1 #bupdater debug log file bupdater_debug_logfile=/var/tmp/bupdater.log -#bnotifier debug level -bnotifier_debug_level=1 - -#bnotifier debug log file -bnotifier_debug_logfile=/var/tmp/bnotifier.log - # purge interval purge_interval=7200 -#after that interval a bhist with -n bhist_logs_to_read is tried (default:120) -bhist_finalstate_interval=120 - #Minimum interval of time between the last update of a jobid entry and the first finalstate query try (default:30) finalstate_query_interval=30 #after that interval an unseen job is set as done (status == 4) and exitstatus == 999 (default:3600) alldone_interval=3600 -#path to condor_config -export CONDOR_CONFIG="/opt/condor-c/etc/condor_config" +#Command use to cache info abput the job in the batch system +batch_command_caching_filter= -#max number of concurrent threads to serve commands (default = 500) -#blah_max_threaded_cmds=100 +#poll timeout +bupdater_child_poll_timeout= -#seconds to sleep in the main loop -#loop_interval= +#set to yes to enable the blah clustering +job_registry_add_remote= + +#time interval between consistency check of blah registry (default 3600) +bupdater_consistency_check_interval= + +##LSF + +#number of logs to be read by bhist (default:3) +bhist_logs_to_read= + +#after that interval a bhist with -n bhist_logs_to_read is tried (default:120) +bhist_finalstate_interval=120 #use the long format for bjobs command (-l instead of -w) (yes|no) (default=yes) bupdater_bjobs_long_format=yes +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) +lsf_batch_caching_enabled= + #use bhist to calculate suspended jobs timestamp bupdater_use_bhist_for_susp=no -#Colon-separated list of paths that are shared 
among batch system -#head and worker nodes. -#blah_shared_directories=/home:/users +#if set to yes bhist uses a time constraint to reduce the output (default no) +bupdater_use_bhist_time_constraint= -#By default the job temporary work directory is created as a subdirectory -#of wherever the batch system is configured to land the job. -#This variable changes the location where the work directory is created. -#A shell variable escaped or in single quotes will be resolved on the -#worker node in the job environment. Non-escaped variables will be resolved -#on the submit node in the submit environment. -#blah_wn_temporary_home_dir='$GLITE_LOCATION_TMP' +#use btools (default no) +bupdater_use_btools= -#These two attributes allow to change the directory on the worker node where -#the batch system is instructed to transfer input/output sandbox files to -#and from. -#These can be set in case the batch system default is not good enough -#(e.g.: the batch systems leaves output files behind) -#These variables can be resolved on the submit node -only-. -#blah_wn_inputsandbox=/tmp -#blah_wn_outputsandbox=/tmp +#btools path (default /usr/local/bin) +bupdater_btools_path= -#The following configuration attributes allow for additional -#submit command attributes to be set in the local shell callout -#for batch system customizations. -# -#Set this variable to pass all submit command attributes: -#blah_pass_all_submit_attributes=yes -# -#-Or- select individual attributes as follows: -#blah_pass_submit_attributes[0]="x509UserProxySubject" -#blah_pass_submit_attributes[1]="x509UserProxy" +#use bhist for jobs in idle state (default yes) +bupdater_use_bhist_for_idle= + +#use bhist for killed jobs (default yes) +bupdater_use_bhist_for_killed= + +##PBS + +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) +pbs_batch_caching_enabled= + +#number of logs that tracejob read (default 2) +tracejob_logs_to_read= + +#max number of lines in tracejob output. This is done to get rid of +# a bug in pbs that causes tracejob to produce a large output (default 1000) +tracejob_max_output= +##Condor + +#condor bin location +condor_binpath=/opt/condor-c/bin + +#path to condor_config +export CONDOR_CONFIG="/opt/condor-c/etc/condor_config" + +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) +condor_batch_caching_enabled= + +#If condor_history should be used or not to the final state info about the jobs. +bupdater_use_condor_history= + +##SGE + +sge_binpath= + +sge_cellname= + +sge_rootpath= + +##SLURM + +#path to the slurm executables +slurm_binpath= + +## +#####BNotifier subsection +## + +#Notifier location +bnotifier_path= + +#Notifier pid file +bnotifier_pidfile=/var/tmp/cream_tomcat_bnotifier.pid + +#bnotifier debug level +bnotifier_debug_level=1 + +#bnotifier debug log file +bnotifier_debug_logfile=/var/tmp/bnotifier.log From 99b73b124674511b101437726380d9a353e32e81 Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Mon, 10 Dec 2012 11:22:57 +0100 Subject: [PATCH 014/169] Added ability to export to the local environment new environment variables created in the config file (see Savannah #99351). 
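The change below implements this as an environment diff: the config file is sourced in a child shell, printenv output is captured before and after, and every variable that only appears afterwards is propagated into the daemon's own environment with setenv(). A minimal shell sketch of the same idea (the config file path is an illustrative assumption):

    # Hedged sketch of the env-diff technique behind config_setenv();
    # /etc/blah.config is an assumed path, adjust to your installation.
    before=$(printenv | sort)
    after=$(. /etc/blah.config; printenv | sort)
    # lines only present in the second listing are the newly exported vars
    comm -13 <(printf '%s\n' "$before") <(printf '%s\n' "$after")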
--- src/config.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/config.h | 4 ++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index 2000d8f1..0244bb7f 100644 --- a/src/config.c +++ b/src/config.c @@ -9,6 +9,8 @@ * 23-Nov-2007 Original release * 24-Apr-2009 Added parsing of shell arrays. * 13-Jan-2012 Added sbin and libexec install dirs. + * 30-Nov-2012 Added ability to locally setenv the env variables + * that are exported in the config file. * * Description: * Small library for access to the BLAH configuration file. @@ -77,8 +79,47 @@ config_parse_array_values(config_entry *en) } } +int +config_setenv(const char *ipath) + { + const char *printenv_command_before = "printenv"; + const char *printenv_command_after = ". %s;printenv"; + config_handle *envs_before; + config_handle *envs_after; + config_entry *cur; + int n_added = 0; + + envs_before = config_read_cmd(ipath, printenv_command_before); + envs_after = config_read_cmd(ipath, printenv_command_after); + + + /* Set in the local environment all env variables that were exported in */ + /* the config file. */ + + for (cur = envs_after->list; cur != NULL; cur=cur->next) + { + if (config_get(cur->key, envs_before) == NULL) + { + setenv(cur->key, cur->value, 1); + n_added++; + } + } + + config_free(envs_before); + config_free(envs_after); + + return n_added; + } + config_handle * config_read(const char *ipath) + { + const char *set_command_format = ". %s; set"; + return config_read_cmd(ipath, set_command_format); + } + +config_handle * +config_read_cmd(const char *ipath, const char *set_command_format) { char *path; char *install_location=NULL; @@ -91,7 +132,6 @@ config_read(const char *ipath) config_entry *c_tail = NULL; config_entry *found,*new_entry=NULL; char *set_command=NULL; - const char *set_command_format = ". %s; set"; int set_command_size; int line_len = 0; int line_alloc = 0; @@ -434,6 +474,8 @@ int main(int argc, char *argv[]) { int tcf; + int n_env; + char *test_env; char *path; const char *test_config = "\n" @@ -451,6 +493,7 @@ main(int argc, char *argv[]) "b4=0\n" "b4=\" Junk\"\n" "b5=\" False\"\n" + "export e1=\" My Env Variable \"\n" "file=/tmp/test_`whoami`.bjr\n" "arr[0]=value_0\n" "arr[3]=value_3\n" @@ -488,13 +531,27 @@ main(int argc, char *argv[]) setenv("BLAHPD_CONFIG_LOCATION",path,1); cha = config_read(NULL); + n_env = config_setenv(NULL); unlink(path); if (cha == NULL) { fprintf(stderr,"%s: Error reading config from %s: ",argv[0],path); + perror(""); return 4; } + if (n_env <= 0) + { + fprintf(stderr,"%s: No new env variables found in %s.\n",argv[0],path); + r=30; + } + if ((test_env = getenv("e1")) == NULL) + { + fprintf(stderr,"%s: Env variable e1 not found in %s.\n",argv[0],path); + r=31; + } + else printf("e1 env == <%s>\n", test_env); + ret = config_get("a",cha); if (ret == NULL) fprintf(stderr,"%s: key a not found\n",argv[0]),r=5; else if (atoi(ret->value) != 123) fprintf(stderr,"%s: key a != 123\n",argv[0]),r=6; diff --git a/src/config.h b/src/config.h index c96b9d28..f13af836 100644 --- a/src/config.h +++ b/src/config.h @@ -8,6 +8,8 @@ * Revision history : * 23-Nov-2007 Original release * 13-Jan-2012 Added sbin and libexec install dirs. + * 30-Nov-2012 Added ability to locally setenv the env variables + * that are exported in the config file. 
 *
 * Description:
 *  Prototypes of functions defined in config.c
 *
@@ -55,6 +57,8 @@ typedef struct config_handle_s
 } config_handle;

 config_handle *config_read(const char *path);
+config_handle *config_read_cmd(const char *path, const char *cmd);
+int config_setenv(const char *ipath);
 config_entry *config_get(const char *key, config_handle *handle);
 int config_test_boolean(const config_entry *entry);
 void config_free(config_handle *handle);

From c03e573f5f607200ae51d580ed5e35a0bf3e77cd Mon Sep 17 00:00:00 2001
From: Brian Bockelman
Date: Thu, 13 Dec 2012 07:15:53 -0600
Subject: [PATCH 015/169] Add default LRMS. Corresponds to add-sge.patch and
 add-condor.patch from the Condor distribution.

---
 config/blah.config.template | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/blah.config.template b/config/blah.config.template
index 369211fe..8f5509f2 100644
--- a/config/blah.config.template
+++ b/config/blah.config.template
@@ -5,7 +5,7 @@
 ##Blah common variables

 #Supported batch systems (e.g. pbs,lsf)
-supported_lrms=
+supported_lrms=pbs,lsf,sge,slurm,condor

 #DGAS logfile
 BLAHPD_ACCOUNTING_INFO_LOG=
@@ -292,9 +292,9 @@ bupdater_use_condor_history=

 sge_binpath=

-sge_cellname=
+sge_cellname=$SGE_CELL

-sge_rootpath=
+sge_rootpath=$SGE_ROOT

 ##SLURM

From 69ac192ebd5ac9a938f8d670aeb12d0fd14f71b4 Mon Sep 17 00:00:00 2001
From: Brian Bockelman
Date: Thu, 13 Dec 2012 07:16:33 -0600
Subject: [PATCH 016/169] Add support for modern classad packaging.
 Corresponds to classad.patch from the Condor distribution.

---
 src/classad_binary_op_unwind.C | 6 +++---
 src/classad_binary_op_unwind.h | 6 +-----
 src/classad_c_helper.C | 2 +-
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/classad_binary_op_unwind.C b/src/classad_binary_op_unwind.C
index 4eb7f0d7..570466a0 100644
--- a/src/classad_binary_op_unwind.C
+++ b/src/classad_binary_op_unwind.C
@@ -39,10 +39,10 @@

 #include <strings.h> // strcasecmp

-#include "classad_distribution.h"
+#include "classad/classad_distribution.h"
 #include "classad_binary_op_unwind.h"

-#ifdef WANT_NAMESPACES
+#if 1
 namespace classad {
 #endif

@@ -221,6 +221,6 @@ UnparseAux( std::string &buffer, std::string &fnName, std::vector<ExprTree*>& ar
 	return;
 }

-#ifdef WANT_NAMESPACES
+#if 1
 } // end of classad namespace
 #endif
diff --git a/src/classad_binary_op_unwind.h b/src/classad_binary_op_unwind.h
index 227892f3..ec4a492b 100644
--- a/src/classad_binary_op_unwind.h
+++ b/src/classad_binary_op_unwind.h
@@ -33,16 +33,14 @@
 #
 */

-#include "classad_distribution.h"
+#include "classad/classad_distribution.h"

 #ifndef __CLASSAD_BINARY_OP_UNWIND_H__
 #define __CLASSAD_BINARY_OP_UNWIND_H__

-#ifdef WANT_NAMESPACES
 using namespace classad;
 namespace classad {
-#endif

 class BinaryOpUnwind : public ClassAdUnParser
 {
@@ -64,8 +62,6 @@ class BinaryOpUnwind : public ClassAdUnParser
   std::vector<std::string> m_unwind_output;
 };

-#ifdef WANT_NAMESPACES
 } // end of classad namespace
-#endif

 #endif // defined __CLASSAD_BINARY_OP_UNWIND_H__
diff --git a/src/classad_c_helper.C b/src/classad_c_helper.C
index 5e98d750..fea13af1 100644
--- a/src/classad_c_helper.C
+++ b/src/classad_c_helper.C
@@ -40,7 +40,7 @@
 */

 #include
-#include "classad_distribution.h"
+#include "classad/classad_distribution.h"
 #include "classad_binary_op_unwind.h"

 #ifdef WANT_NAMESPACES

From 7b2faf5153e9713094941ab0e402a7cdc426739e Mon Sep 17 00:00:00 2001
From: Brian Bockelman
Date: Thu, 13 Dec 2012 07:17:17 -0600
Subject: [PATCH 017/169] Workaround for broken condor_history -f in prior
 Condor.
Corresponds to condor-history.patch from the Condor distribution. --- src/scripts/condor_status.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/scripts/condor_status.sh b/src/scripts/condor_status.sh index e909c34a..e0dc6ff7 100755 --- a/src/scripts/condor_status.sh +++ b/src/scripts/condor_status.sh @@ -258,9 +258,12 @@ for job in $* ; do ### the history of an unexpected queue. # We can possibly get the location of the history file and check it. + # NOTE: In Condor 7.7.6-7.8.1, the -f option to condor_history was + # broken. To work around that, we set HISTORY via the environment + # instead of using -f. history_file=$($condor_binpath/condor_config_val $target -schedd history) if [ "$?" == "0" ]; then - line=$(echo $FORMAT | xargs $condor_binpath/condor_history -f $history_file -backwards $id) + line=$(echo $FORMAT | _condor_HISTORY="$history_file" xargs $condor_binpath/condor_history -f $history_file -backwards $id) if [ ! -z "$line" ] ; then echo "0$(make_ad $job "$line")" exit 0 From 01001ee7907e142360e9f48e49e3417ff57db1ad Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:20:35 -0600 Subject: [PATCH 018/169] Check for presence of dlopen. Corresponds to dl.patch from Condor distribution. --- configure.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/configure.ac b/configure.ac index e8e53cdf..5de2f191 100755 --- a/configure.ac +++ b/configure.ac @@ -196,6 +196,7 @@ AC_HEADER_TIME dnl Checks for library functions. AC_CHECK_FUNCS(select socket strdup strerror bsearch vsnprintf mmap munmap) +AC_CHECK_FUNCS(dlopen, ,AC_CHECK_LIB(dl, dlopen)) dnl GLITE_CHECK_LIBDIR dnl GLITE_CHECK_INITDIR From b9eb0cf195e837c018df7ba22a7aa33beb4f1001 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:21:15 -0600 Subject: [PATCH 019/169] Escape more meta-characters. Corresponds to escape-args.patch from Condor distribution. --- src/server.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/server.c b/src/server.c index afcf0f39..7293bdce 100644 --- a/src/server.c +++ b/src/server.c @@ -3457,6 +3457,11 @@ ConvertArgs(char* original, char separator) memcpy(result + j, CONVARG_DBLQUOTESC, CONVARG_DBLQUOTESC_LEN); j += CONVARG_DBLQUOTESC_LEN; } + else if ((original[i] == '(') || (original[i] == ')') || (original[i] == '&')) + { /* Must escape a few meta-characters for wordexp */ + result[j++] = '\\'; + result[j++] = original[i]; + } else { /* plain copy from the original */ result[j++] = original[i]; From 3ee38689d503777000a474d9de81678b282d66e3 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:30:12 -0600 Subject: [PATCH 020/169] Update to modern ClassAds API. Corresponds to iclassad.patch from Condor distribution. --- src/classad_c_helper.C | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/classad_c_helper.C b/src/classad_c_helper.C index fea13af1..560a2ab7 100644 --- a/src/classad_c_helper.C +++ b/src/classad_c_helper.C @@ -289,7 +289,7 @@ extern "C" ExprList *et_value; et_value = ExprList::MakeExprList(et_ads); - if (ad->Insert (name, et_value)) return C_CLASSAD_NO_ERROR; + if (ad->Insert (name, (ExprTree* &)et_value)) return C_CLASSAD_NO_ERROR; else return C_CLASSAD_INSERT_FAILED; } From 6f9137d9b95a36f41a28e5008f8ab3d34ad8adfa Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:31:00 -0600 Subject: [PATCH 021/169] Disable LD_LIBRARY_PATH changes. Simply not necessary. Corresponds to ld-library-path.patch from Condor distribution. 
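The reasoning: LD_LIBRARY_PATH is inherited by every process the blahp forks, so the old global setting also applied to qsub, bsub and friends, which could then load mismatched libraries. If scrubbing were ever needed, a per-invocation override is enough; a hedged sketch (the command and variables are illustrative, env -u is GNU coreutils):

    # Run one batch-system command with the dynamic-linker path cleared,
    # instead of exporting a modified LD_LIBRARY_PATH process-wide.
    env -u LD_LIBRARY_PATH "$pbs_binpath/qstat" -f "$jobid"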
--- src/server.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index 7293bdce..6d4c6109 100644 --- a/src/server.c +++ b/src/server.c @@ -372,6 +372,13 @@ serveConnection(int cli_socket, char* cli_ip_addr) tmp_dir = DEFAULT_TEMP_DIR; } +/* In the Condor build of the blahp, we can find all the libraries we need + * via the RUNPATH. Setting LD_LIBRARY_PATH can muck up the command line + * tools for the local batch system. + * + * Similarly, in OSG, all Globus libraries are in the expected location. + */ +#if 0 needed_libs = make_message("%s/lib:%s/externals/lib:%s/lib:/opt/lcg/lib", result, result, getenv("GLOBUS_LOCATION") ? getenv("GLOBUS_LOCATION") : "/opt/globus"); old_ld_lib=getenv("LD_LIBRARY_PATH"); if(old_ld_lib) @@ -387,7 +394,7 @@ serveConnection(int cli_socket, char* cli_ip_addr) } else setenv("LD_LIBRARY_PATH",needed_libs,1); - +#endif blah_script_location = strdup(blah_config_handle->libexec_path); blah_version = make_message(RCSID_VERSION, VERSION, "poly,new_esc_format"); require_proxy_on_submit = config_test_boolean(config_get("blah_require_proxy_on_submit",blah_config_handle)); From abb5acb244645e7b7b4fc23c8156fdd41a4ef5c5 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:34:10 -0600 Subject: [PATCH 022/169] Conditionally compile in mcheck.h. Corresponds to mtrace.patch in the Condor distribution. --- src/blparser_master.c | 2 ++ src/main.c | 3 ++- src/main_daemon.c | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/blparser_master.c b/src/blparser_master.c index 1c73baee..7ef6b3b3 100644 --- a/src/blparser_master.c +++ b/src/blparser_master.c @@ -34,7 +34,9 @@ #include #include #include +#ifdef MTRACE_ON #include +#endif #include #include #include diff --git a/src/main.c b/src/main.c index 0c7f842d..a5a7c485 100644 --- a/src/main.c +++ b/src/main.c @@ -43,8 +43,9 @@ #include #include #include +#ifdef MTRACE_ON #include - +#endif #include "blahpd.h" #include "server.h" #include "console.h" diff --git a/src/main_daemon.c b/src/main_daemon.c index dbaebaec..24fd5dce 100644 --- a/src/main_daemon.c +++ b/src/main_daemon.c @@ -43,8 +43,9 @@ #include #include #include +#ifdef MTRACE_ON #include - +#endif #include "blahpd.h" #include "server.h" #include "console.h" From f7541d73c944155442403d9ee57ef816b02a070c Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:37:21 -0600 Subject: [PATCH 023/169] Set some sane default paths for PBS, LSF, and Condor. Corresponds to config-paths.patch from Condor distribution. --- config/blah.config.template | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/config/blah.config.template b/config/blah.config.template index 8f5509f2..234bcfec 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -76,17 +76,18 @@ blah_disable_proxy_user_copy= ##PBS common variables #Path where PBS executables are located -pbs_binpath= +# NOTE: this path is computed many times; I worry about the overhead here. 
-BB, 12-13-2012 +pbs_binpath=`which qsub 2>/dev/null|sed 's|/[^/]*$||'` #Path where the PBS logs are located ($pbs_spoolpath/server_logs) -pbs_spoolpath= +#pbs_spoolpath= #If it is set to yes blah does not check the jobid in the logfiles pbs_nochecksubmission= #If it is set to yes blah does NOT use log files to get job status, #but uses only standard LRMS query (qstat) -pbs_nologaccess= +pbs_nologaccess=yes #If it is set to no blah scripts for PBS will not try to read #locally from the logs if BLParser is not present @@ -96,7 +97,7 @@ pbs_fallback=no ##LSF common variables #Path where LSF executables are located -lsf_binpath= +lsf_binpath=`which bsub 2>/dev/null|sed 's|/[^/]*$||'` #Path where the LSF conf file is located ($lsf_confpath/lsf.conf) lsf_confpath= @@ -276,10 +277,10 @@ tracejob_max_output= ##Condor #condor bin location -condor_binpath=/opt/condor-c/bin +condor_binpath=`which condor_submit 2>/dev/null|sed 's|/[^/]*$||'` #path to condor_config -export CONDOR_CONFIG="/opt/condor-c/etc/condor_config" +#export CONDOR_CONFIG="/etc/condor/condor_config" #Enable the use of the caching for the batch system commands #(the command is specified by batch_command_caching_filter) From c5b0b94d2c56ed2392bf66093a34cc2c3e8ec6b7 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:44:25 -0600 Subject: [PATCH 024/169] Assume a job that has left the queue and completed if qstat doesn't show it in the queue. Corresponds to pbs-completion.patch from the Condor distribution. --- src/scripts/pbs_status.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index 01629c97..df89941e 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -169,7 +169,9 @@ for reqfull in $pars ; do staterr=/tmp/${reqjob}_staterr -result=`${pbs_binpath}/qstat -f $reqjob 2>$staterr | awk -v jobId=$reqjob ' +result=`${pbs_binpath}/qstat -f $reqjob 2>$staterr` +qstat_exit_code=$? +result=`echo "$result" | awk -v jobId=$reqjob ' BEGIN { current_job = "" current_wn = "" @@ -223,6 +225,11 @@ END { if [ -z "$errout" ] ; then echo "0"$result retcode=0 + elif [ "$qstat_exit_code" -eq "153" ] ; then + # If the job has disappeared, assume it's completed + # (same as globus) + echo "0[BatchJobId=\"$reqjob\";JobStatus=4;ExitCode=0]" + retcode=0 else echo "1ERROR: Job not found" retcode=1 From c46b0e2a57be0752becfc59258979674ede4134a Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:49:25 -0600 Subject: [PATCH 025/169] Do not specify a proxy format - let grid-proxy-init deduce the correct one. Corresponds to proxy-init.patch from Condor distribution. 
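Without -old, grid-proxy-init creates its built-in default proxy format (an RFC 3820 proxy on recent Globus releases) instead of forcing the legacy format, so the limited proxy matches what the rest of the stack expects. A hedged sketch of the resulting call shape, mirroring the diff below (file names are illustrative):

    # Create a limited proxy whose lifetime tracks the source proxy;
    # the proxy format is left for grid-proxy-init to deduce.
    seconds_left=$(grid-proxy-info -file "$proxy" -timeleft)
    grid-proxy-init -limited -valid $((seconds_left/3600)):$(((seconds_left%3600)/60+1)) \
        -cert "$proxy" -key "$proxy" -out "$proxy.limited"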
--- src/server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server.c b/src/server.c index 6d4c6109..6339adaf 100644 --- a/src/server.c +++ b/src/server.c @@ -2631,14 +2631,14 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) if (seconds_left <= 0) { /* Something's wrong with the current proxy - use defaults */ - exe_command.command = make_message("%s/bin/grid-proxy-init -old -limited -cert %s -key %s -out %s", + exe_command.command = make_message("%s/bin/grid-proxy-init -limited -cert %s -key %s -out %s", globuslocation, proxy_name, proxy_name, limit_command_output); } else { hours_left = (int)(seconds_left/3600); minutes_left = (int)((seconds_left%3600)/60) + 1; - exe_command.command = make_message("%s/bin/grid-proxy-init -old -limited -valid %d:%d -cert %s -key %s -out %s", + exe_command.command = make_message("%s/bin/grid-proxy-init -limited -valid %d:%d -cert %s -key %s -out %s", globuslocation, hours_left, minutes_left, proxy_name, proxy_name, limit_command_output); } From 4cad4f61100a7559489fbef0fe2b5e72c385eff9 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:51:09 -0600 Subject: [PATCH 026/169] Turn off job registry by default. Corresponds to registry.patch from Condor distribution. --- config/blah.config.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 234bcfec..375cce84 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -189,7 +189,7 @@ bupdater_path= bupdater_pidfile=/var/tmp/cream_tomcat_bupdater.pid #Registry file location -job_registry=/var/tmp/cream_tomcat_registry.db +job_registry= #Set the following variable to 'yes' to have multiple BLAHPD instances #share the job registry -index- via mmap: From 5037e77afdf2f449ca41ee98cfb6fd9bb110320e Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:57:59 -0600 Subject: [PATCH 027/169] Enable the ability for a user to specify a job directory. Corresponds to run-dir.patch from the Condor distribution. --- src/scripts/blah_common_submit_functions.sh | 12 +++++++++--- src/scripts/condor_submit.sh | 3 ++- src/server.c | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index e2b2d52c..9cdcd330 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -268,7 +268,7 @@ function bls_parse_submit_options () ############################################################### # Parse parameters ############################################################### - while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:" arg + while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:D:" arg do case "$arg" in a) bls_opt_xtra_args="$OPTARG" ;; @@ -298,6 +298,7 @@ function bls_parse_submit_options () O) bls_opt_outputflstring="$OPTARG" ;; R) bls_opt_outputflstringremap="$OPTARG" ;; C) bls_opt_req_file="$OPTARG";; + D) bls_opt_run_dir="$OPTARG";; -) break ;; ?) 
echo $usage_string exit 1 ;; @@ -620,10 +621,15 @@ function bls_start_job_wrapper () echo "old_home=\`pwd\`" # Set the temporary home (including cd'ing into it) + if [ "x$bls_opt_run_dir" != "x" ] ; then + run_dir="$bls_opt_run_dir" + else + run_dir="home_$bls_tmp_name" + fi if [ -n "$blah_wn_temporary_home_dir" ] ; then - echo "new_home=${blah_wn_temporary_home_dir}/home_$bls_tmp_name" + echo "new_home=${blah_wn_temporary_home_dir}/$run_dir" else - echo "new_home=\${old_home}/home_$bls_tmp_name" + echo "new_home=\${old_home}/$run_dir" fi echo "mkdir \$new_home" diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 8337fbb2..ea118eb0 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -53,7 +53,7 @@ mpinodes=0 # Name of local requirements file: currently unused req_file="" -while getopts "a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:" arg +while getopts "a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:" arg do case "$arg" in a) xtra_args="$OPTARG" ;; @@ -77,6 +77,7 @@ do O) outputflstring="$OPTARG" ;; R) remaps="$OPTARG" ;; C) req_file="$OPTARG" ;; + D) run_dir="$OPTARG" ;; -) break ;; ?) echo $usage_string exit 1 ;; diff --git a/src/server.c b/src/server.c index 6339adaf..c7237cd5 100644 --- a/src/server.c +++ b/src/server.c @@ -1275,6 +1275,7 @@ cmd_submit_job(void *args) (set_cmd_int_option (&command, cad, "HostSMPSize", "-N", INT_NOQUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_bool_option (&command, cad, "StageCmd", "-s", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_string_option(&command, cad, "ClientJobId","-j", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || + (set_cmd_string_option(&command, cad, "JobDirectory","-D", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_string_option(&command, cad, "BatchExtraSubmitArgs", "-a", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) // (set_cmd_string_option(&command, cad, "Args", "--", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) { From 642ebf2c6f098b5ae02bc56d000825456d72ec9d Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 07:59:27 -0600 Subject: [PATCH 028/169] Fallback to using qstat if sge_helper cannot find the job. Corresponds to sge-status-fallback.patch from the Condor distribution. --- src/scripts/sge_status.sh | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/scripts/sge_status.sh b/src/scripts/sge_status.sh index 551069fb..9fbddaa7 100755 --- a/src/scripts/sge_status.sh +++ b/src/scripts/sge_status.sh @@ -69,5 +69,37 @@ jobid=${tmpid}.default blahp_status=`exec ${sge_helper_path:-/opt/glite/bin}/sge_helper --status $getwn $jobid` retcode=$? 
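+# (sge_helper exits non-zero when it cannot resolve the job, for
+# instance once qmaster has forgotten it; in that case a status ad is
+# re-derived from plain qstat output below.)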
+# Now see if we need to run qstat 'manually' +if [ $retcode -ne 0 ]; then + + qstat_out=`qstat` + + # First, find the column with the State information: + state_col=`echo "$qstat_out" | head -n 1 | awk '{ for (i = 1; i<=NF; i++) if ($i == "state") print i;}'` + job_state=`echo "$qstat_out" | awk -v "STATE_COL=$state_col" -v "JOBID=$tmpid" '{ if ($1 == JOBID) print $STATE_COL; }'` + + if [[ "$job_state" =~ q ]]; then + jobstatus=1 + elif [[ "$job_state" =~ [rt] ]]; then + jobstatus=2 + elif [[ "$job_state" =~ h ]]; then + jobstatus=5 + elif [[ "$job_state" =~ E ]]; then + jobstatus=4 + elif [[ "$job_state" =~ d ]]; then + jobstatus=3 + elif [ "x$job_state" == "x" ]; then + jobstatus=4 + fi + + if [ $jobstatus -eq 4 ]; then + blahp_status="[BatchJobId=\"$tmpid\";JobStatus=$jobstatus;ExitCode=0]" + else + blahp_status="[BatchJobId=\"$tmpid\";JobStatus=$jobstatus]" + fi + retcode=0 + +fi + echo ${retcode}${blahp_status} #exit $retcode From d99628b71fea7a62d67d01918a114cac800e4845 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:01:12 -0600 Subject: [PATCH 029/169] Use correct cellname. Corresponds to sge_status.patch from the Condor distribution. --- src/scripts/sge_status.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scripts/sge_status.sh b/src/scripts/sge_status.sh index 9fbddaa7..edeb05f5 100755 --- a/src/scripts/sge_status.sh +++ b/src/scripts/sge_status.sh @@ -23,6 +23,8 @@ #[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config . `dirname $0`/blah_load_config.sh +sge_helper_path=${GLITE_LOCATION:-/opt/glite}/bin + usage_string="Usage: $0 [-w] [-n]" #get worker node info @@ -63,7 +65,7 @@ fi tmpid=`echo "$@"|sed 's/.*\/.*\///g'` # ASG Keith way -jobid=${tmpid}.default +jobid=${tmpid}.${sge_cellname:-default} blahp_status=`exec ${sge_helper_path:-/opt/glite/bin}/sge_helper --status $getwn $jobid` From 3bd98d7bbed2c914052fffbd56a81003824dfc0e Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:04:08 -0600 Subject: [PATCH 030/169] Maximize use of shared directory by using absolute paths. Corresponds to shared-dir.patch from Condor distribution. --- src/scripts/blah_common_submit_functions.sh | 39 +++++++++------------ 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 9cdcd330..88d70841 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -495,6 +495,7 @@ function bls_setup_all_files () # Setup stdin, stdout & stderr if [ ! -z "$bls_opt_stdin" ] ; then + if [ "${bls_opt_stdin:0:1}" != "/" ] ; then bls_opt_stdin=${bls_opt_workdir}/${bls_opt_stdin} ; fi if [ -f "$bls_opt_stdin" ] ; then bls_test_shared_dir "$bls_opt_stdin" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then @@ -509,22 +510,22 @@ function bls_setup_all_files () fi fi if [ ! 
-z "$bls_opt_stdout" ] ; then + if [ "${bls_opt_stdout:0:1}" != "/" ] ; then bls_opt_stdout=${bls_opt_workdir}/${bls_opt_stdout} ; fi bls_test_shared_dir "$bls_opt_stdout" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_arguments="$bls_arguments > \"$bls_opt_stdout\"" else - if [ "${bls_opt_stdout:0:1}" != "/" ] ; then bls_opt_stdout=${bls_opt_workdir}/${bls_opt_stdout} ; fi bls_unique_stdout_name="${blah_wn_outputsandbox}out_${bls_tmp_name}_`basename $bls_opt_stdout`" bls_arguments="$bls_arguments > \"$bls_unique_stdout_name\"" bls_fl_add_value outputsand "$bls_opt_stdout" "$bls_unique_stdout_name" fi fi if [ ! -z "$bls_opt_stderr" ] ; then + if [ "${bls_opt_stderr:0:1}" != "/" ] ; then bls_opt_stderr=${bls_opt_workdir}/${bls_opt_stderr} ; fi bls_test_shared_dir "$bls_opt_stderr" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_arguments="$bls_arguments 2> \"$bls_opt_stderr\"" else - if [ "${bls_opt_stderr:0:1}" != "/" ] ; then bls_opt_stderr=${bls_opt_workdir}/${bls_opt_stderr} ; fi if [ "$bls_opt_stderr" == "$bls_opt_stdout" ]; then bls_arguments="$bls_arguments 2>&1" else @@ -544,6 +545,7 @@ function bls_setup_all_files () exec 4< "$bls_opt_inputflstring" while read xfile <&4 ; do if [ ! -z $xfile ] ; then + if [ "${xfile:0:1}" != "/" ] ; then xfile=${bls_opt_workdir}/${xfile} ; fi bls_test_shared_dir "$xfile" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_fl_add_value inputcopy "$xfile" "./`basename ${xfile}`" @@ -571,28 +573,19 @@ function bls_setup_all_files () read xfileremap <&6 fi - bls_test_shared_dir "$xfile" + if [ -z $xfileremap ] ; then + xfileremap="$xfile" + fi + if [ "${xfileremap:0:1}" != "/" ] ; then + xfileremap=${bls_opt_workdir}/${xfileremap} + fi + bls_test_shared_dir "$xfileremap" if [ "x$bls_is_in_shared_dir" != "xyes" ] ; then - if [ "${xfile:0:1}" != "/" ] ; then - xfile_base="`basename ${xfile}`" - xfile_transfer="${blah_wn_outputsandbox}${xfile_base}.$uni_ext" - else - xfile_transfer="$xfile" - fi - if [ ! -z $xfileremap ] ; then - if [ "${xfileremap:0:1}" != "/" ] ; then - bls_fl_add_value outputsand "${bls_opt_workdir}/${xfileremap}" "$xfile_transfer" "$xfile" - else - bls_test_shared_dir "$xfileremap" - if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then - bls_fl_add_value outputmove "${xfileremap}" "$xfile" - else - bls_fl_add_value outputsand "${xfileremap}" "$xfile_transfer" "$xfile" - fi - fi - else - bls_fl_add_value outputsand "${bls_opt_workdir}/${xfile}" "$xfile_transfer" "$xfile" - fi + xfile_base="`basename ${xfile}`" + xfile_transfer="${blah_wn_outputsandbox}${xfile_base}.$uni_ext" + bls_fl_add_value outputsand "$xfileremap" "$xfile_transfer" "$xfile" + else + bls_fl_add_value outputmove "$xfileremap" "$xfile" fi fi done From 9c342412daff261779fa66fc04d1275a06e8ea9f Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:12:37 -0600 Subject: [PATCH 031/169] Assume all directories are shared. Corresponds to shared-fs.patch in the Condor distribution. --- config/blah.config.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 375cce84..23edffb0 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -25,7 +25,7 @@ blah_delegate_renewed_proxies= #Colon-separated list of paths that are shared among batch system #head and worker nodes. 
-#blah_shared_directories=/home:/users +blah_shared_directories=/ #By default the job temporary work directory is created as a subdirectory #of wherever the batch system is configured to land the job. From aa3a9ba43cefbdf14d86e8b638c1006e73539c47 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:13:58 -0600 Subject: [PATCH 032/169] Do not source user environment. Corresponds to submit-l.patch in the Condor distribution. --- src/scripts/condor_submit.sh | 2 +- src/scripts/lsf_submit.sh | 2 +- src/scripts/pbs_submit.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index ea118eb0..d2302ea3 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: condor_submit.sh # Author: Giuseppe Fiorentino (giuseppe.fiorentino@mi.infn.it) diff --git a/src/scripts/lsf_submit.sh b/src/scripts/lsf_submit.sh index 3c4c3dc8..8f4f7126 100755 --- a/src/scripts/lsf_submit.sh +++ b/src/scripts/lsf_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: lsf_submit.sh # Author: David Rebatto (david.rebatto@mi.infn.it) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 40d2eb60..e97e41dc 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: pbs_submit.sh # Author: David Rebatto (david.rebatto@mi.infn.it) From f281e667f41c881718024ea2b1b1f3b933a44a99 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:18:20 -0600 Subject: [PATCH 033/169] Do not enable the blah parser by default. Corresponds to blahp_chkconfig.patch from the OSG distribution. --- config/glite-ce-blah-parser | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/glite-ce-blah-parser b/config/glite-ce-blah-parser index cbe081c9..d42f5632 100755 --- a/config/glite-ce-blah-parser +++ b/config/glite-ce-blah-parser @@ -19,7 +19,7 @@ # # description: gLite CE blah parser -# chkconfig: 345 94 15 +# chkconfig: - 94 15 # Source function library . /etc/rc.d/init.d/functions From 63ab7610b698c8a658d0052b250c4d4d3cce3b44 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:21:45 -0600 Subject: [PATCH 034/169] Comment out disable_proxy_user_copy by default. Corresponds to blahp_condor_config.patch from OSG distribution. --- config/blah.config.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 23edffb0..5fc570d1 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -71,7 +71,7 @@ blah_children_restart_interval= blah_require_proxy_on_submit= #disable proxy user copy (default no) -blah_disable_proxy_user_copy= +#blah_disable_proxy_user_copy=yes ##PBS common variables From 163b0edc38dfe321becd82780e9328fd2a2ca2eb Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:22:43 -0600 Subject: [PATCH 035/169] Change the default location of blahp scripts to /usr/libexec/blahp. Corresponds to blahp_init_script_paths.patch from OSG distribution. 
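Both init scripts resolve the parser binary directory through the same fallback chain, which the hunks below simply re-point at the packaged location. A condensed sketch of that chain (the real scripts use if/else tests; variable names follow the scripts):

    # Fallback order: value from the blah config, then $GLITE_LOCATION/bin,
    # then the packaged default directory.
    blparser_bin_directory=${blparser_bin_directory:-${GLITE_LOCATION:+$GLITE_LOCATION/bin}}
    blparser_bin_directory=${blparser_bin_directory:-/usr/libexec/blahp}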
--- config/glite-ce-blah-parser | 2 +- config/glite-ce-check-blparser | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/glite-ce-blah-parser b/config/glite-ce-blah-parser index d42f5632..2800790f 100755 --- a/config/glite-ce-blah-parser +++ b/config/glite-ce-blah-parser @@ -39,7 +39,7 @@ else blparser_bin_directory="${GLITE_LOCATION}/bin" else # Default value when everything else fails. - blparser_bin_directory="/usr/libexec" + blparser_bin_directory="/usr/libexec/blahp" fi fi diff --git a/config/glite-ce-check-blparser b/config/glite-ce-check-blparser index 84a62220..a078ee39 100755 --- a/config/glite-ce-check-blparser +++ b/config/glite-ce-check-blparser @@ -31,7 +31,7 @@ else blparser_bin_directory="${GLITE_LOCATION}/bin" else # Default when everything else fails. - blparser_bin_directory="/usr/libexec" + blparser_bin_directory="/usr/libexec/blahp" fi fi From dbbfd62942cd5d9be71345bbea9acb536b7062d2 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:24:11 -0600 Subject: [PATCH 036/169] Make blahp be permissive of proxies with relative paths. Also fixed upstream in the gridmanager. Corresponds to blahp-relative-proxypath.patch from the OSG distribution. --- src/server.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/server.c b/src/server.c index c7237cd5..59ae2859 100644 --- a/src/server.c +++ b/src/server.c @@ -1031,6 +1031,7 @@ cmd_submit_job(void *args) char *error_string; int res = 1; char *proxyname = NULL; + char *iwd = NULL; char *proxysubject = NULL; char *proxyfqan = NULL; char *proxynameNew = NULL; @@ -1091,6 +1092,30 @@ cmd_submit_job(void *args) proxyname = NULL; } } + /* If the proxy is a relative path, we must prepend the Iwd to make it absolute */ + if (proxyname && proxyname[0] != '/') { + if (classad_get_dstring_attribute(cad, "Iwd", &iwd) == C_CLASSAD_NO_ERROR) { + size_t iwdlen = strlen(iwd); + size_t proxylen = iwdlen + strlen(proxyname) + 1; + char *proxynameTmp; + proxynameTmp = malloc(proxylen + 1); + if (!proxynameTmp) { + resultLine = make_message("%s 1 Malloc\\ failure N/A", reqId); + goto cleanup_lrms; + } + memcpy(proxynameTmp, iwd, iwdlen); + proxynameTmp[iwdlen] = '/'; + strcpy(proxynameTmp+iwdlen+1, proxyname); + free(proxyname); + free(iwd); + iwd = NULL; + proxyname = proxynameTmp; + proxynameTmp = NULL; + } else { + resultLine = make_message("%s 1 Relative\\ x509UserProxy\\ specified\\ without\\ Iwd N/A", reqId); + goto cleanup_lrms; + } + } /* If there are additional arguments, we have to map on a different id */ if(argv[CMD_SUBMIT_JOB_ARGS + 1] != NULL) From 1725c9b19ff3c741ed5cc9b6e5188de30e59903b Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:25:45 -0600 Subject: [PATCH 037/169] Add basic files to gitignore for cmake. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..50d4e8be --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +CMakeCache.txt +CMakeFiles +CPackSourceConfig.cmake +CPackConfig.cmake From 6638e6189c164604d75a7a2674ad3007a8ac3bbc Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:31:52 -0600 Subject: [PATCH 038/169] Look for OSG-style m4 macro files, if present. 
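aclocal simply scans every -I directory for *.m4 files when regenerating aclocal.m4, so adding the glite-build-common-cpp install location lets an OSG build host supply the gLite macros without an org.glite source checkout. A hedged sketch of the regeneration sequence (the trailing autotools calls are assumed to match what bootstrap runs next):

    # Gather macros from the source tree and, on OSG hosts, from the
    # system-installed gLite build-common package, then regenerate.
    aclocal -I project -I /usr/share/glite-build-common-cpp/m4
    libtoolize --force
    automake --add-missing && autoconf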
--- bootstrap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap b/bootstrap index 4d8ecb1b..32993aea 100755 --- a/bootstrap +++ b/bootstrap @@ -24,7 +24,7 @@ set -x if [ -d ../org.glite/project ]; then aclocal -I project -I ../org.glite/project else - aclocal -I project + aclocal -I project -I /usr/share/glite-build-common-cpp/m4 fi libtoolize --force From 929b2b3c8cc13c742ff172850c7a2f65e5bd0516 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:32:30 -0600 Subject: [PATCH 039/169] Add OSG-developed caching PBS status script. --- src/scripts/pbs_status.py | 432 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100755 src/scripts/pbs_status.py diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py new file mode 100755 index 00000000..60f4a262 --- /dev/null +++ b/src/scripts/pbs_status.py @@ -0,0 +1,432 @@ +#!/usr/bin/python + +# File: pbs_status.py +# +# Author: Brian Bockelman +# e-mail: bbockelm@cse.unl.edu +# +# +# Copyright (c) University of Nebraska-Lincoln. 2012 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +import pwd +import sys +import time +import errno +import fcntl +import struct +import tempfile + +cache_timeout = 60 + +launchtime = time.time() + +# This function was written by me for an older project, the OSG-GIP +def pbsOutputFilter(fp): + """ + PBS can be a pain to work with because it automatically cuts + lines off at 80 chars and continues the line on the next line. For + example:: + + Server: red + server_state = Active + server_host = red.unl.edu + scheduling = True + total_jobs = 2996 + state_count = Transit:0 Queued:2568 Held:0 Waiting:0 Running:428 Exiting + :0 Begun:0 + acl_roots = t3 + managers = mfurukaw@red.unl.edu,root@t3 + + This function puts the line ":0 Begun:0" with the above line. It's meant + to filter the output, so you should "scrub" PBS output like this:: + + fp = runCommand() + for line in pbsOutputFilter(fp): + ... parse line ... + + This function uses iterators + """ + class PBSIter: + """ + An iterator for PBS output; this allows us to easily parse over + PBS-style line continuations. + """ + + def __init__(self, fp): + self.fp = fp + self.fp_iter = fp.__iter__() + self.prevline = None + self.done = False + + def next(self): + """ + Return the next full line of output for the iterator. + """ + if self.prevline == None: + line = self.fp_iter.next() + if line.startswith('\t'): + # Bad! The output shouldn't start with a + # partial line + raise ValueError("PBS output contained bad data.") + self.prevline = line + return self.next() + if self.done: + raise StopIteration() + try: + line = self.fp_iter.next() + if line.startswith('\t'): + self.prevline = self.prevline[:-1] + line[1:-1] + return self.next() + else: + old_line = self.prevline + self.prevline = line + return old_line + except StopIteration: + self.done = True + return self.prevline + + class PBSFilter: + """ + An iterable object based upon the PBSIter iterator. 
+ """ + + def __init__(self, myiter): + self.iter = myiter + + def __iter__(self): + return self.iter + + return PBSFilter(PBSIter(fp)) + +# Something else from a prior life - see gratia-probe-common's GratiaWrapper.py +def ExclusiveLock(fd, timeout=120): + """ + Grabs an exclusive lock on fd + + If the lock is owned by another process, and that process is older than the + timeout, then the other process will be signaled. If the timeout is + negative, then the other process is never signaled. + + If we are unable to hold the lock, this call will not block on the lock; + rather, it will throw an exception. + + By default, the timeout is 120 seconds. + """ + + # POSIX file locking is cruelly crude. There's nothing to do besides + # try / sleep to grab the lock, no equivalent of polling. + # Why hello, thundering herd. + + # An alternate would be to block on the lock, and use signals to interupt. + # This would mess up Gratia's flawed use of signals already, and not be + # able to report on who has the lock. I don't like indefinite waits! + max_tries = 5 + for tries in range(1, max_tries+1): + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + if check_lock(fd, timeout): + time.sleep(.2) # Fast case; however, we have *no clue* how + # long it takes to clean/release the old lock. + # Nor do we know if we'd get it if we did + # fcntl.lockf w/ blocking immediately. Blech. + # Check again immediately, especially if this was the last + # iteration in the for loop. + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + print >> sys.stderr, "Unable to acquire lock, try %i; will sleep for %i " \ + "seconds and try %i more times." % (tries, tries, max_tries-tries) + time.sleep(tries) + + raise Exception("Unable to acquire lock") + +def check_lock(fd, timeout): + """ + For internal use only. + + Given a fd that is locked, determine which process has the lock. + Kill said process if it is older than "timeout" seconds. + This will log the PID of the "other process". + """ + + pid = get_lock_pid(fd) + if pid == os.getpid(): + return True + + if timeout < 0: + print >> sys.stderr, "Another process, %d, holds the cache lock." % pid + return False + + try: + age = get_pid_age(pid) + except: + print >> sys.stderr, "Another process, %d, holds the cache lock." % pid + print >> sys.stderr, "Unable to get the other process's age; will not time " \ + "it out." + return False + + print >> sys.stderr, "Another process, %d (age %d seconds), holds the cache " \ + "lock." % (pid, age) + + if age > timeout: + os.kill(pid, signal.SIGKILL) + else: + return False + + return True + +linux_struct_flock = "hhxxxxqqixxxx" +try: + os.O_LARGEFILE +except AttributeError: + start_len = "hhlli" + +def get_lock_pid(fd): + # For reference, here's the definition of struct flock on Linux + # (/usr/include/bits/fcntl.h). + # + # struct flock + # { + # short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ + # short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ + # __off_t l_start; /* Offset where the lock begins. */ + # __off_t l_len; /* Size of the locked area; zero means until EOF. */ + # __pid_t l_pid; /* Process holding the lock. 
*/ + # }; + # + # Note that things are different on Darwin + # Assuming off_t is unsigned long long, pid_t is int + try: + if sys.platform == "darwin": + arg = struct.pack("QQihh", 0, 0, 0, fcntl.F_WRLCK, 0) + else: + arg = struct.pack(linux_struct_flock, fcntl.F_WRLCK, 0, 0, 0, 0) + result = fcntl.fcntl(fd, fcntl.F_GETLK, arg) + except IOError, ie: + if ie.errno != errno.EINVAL: + raise + print >> sys.stderr, "Unable to determine which PID has the lock due to a " \ + "python portability failure. Contact the developers with your" \ + " platform information for support." + return False + if sys.platform == "darwin": + _, _, pid, _, _ = struct.unpack("QQihh", result) + else: + _, _, _, _, pid = struct.unpack(linux_struct_flock, result) + return pid + +def get_pid_age(pid): + now = time.time() + st = os.stat("/proc/%d" % pid) + return now - st.st_ctime + +def qstat(jobid=""): + """ + Call qstat directly for a jobid. + If none is specified, query all jobid's. + + Returns a python dictionary with the job info. + """ + qstat = get_qstat_location() + child_stdout = os.popen("%s -f %s" % (qstat, jobid)) + result = parse_qstat_fd(child_stdout) + exit_status = child_stdout.close() + if exit_status: + exit_code = 0 + if os.WIFEXITED(exit_status): + exit_code = os.WEXITSTATUS(exit_status) + if exit_code == 153: # Completed + result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} + elif exit_code == 271: # Removed + result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} + else: + raise Exception("qstat failed with exit code %s" % str(exit_status)) + return result + +_qstat_location_cache = None +def get_qstat_location(): + """ + Locate the copy of qstat the blahp configuration wants to use. + """ + global _qstat_location_cache + if _qstat_location_cache != None: + return _qstat_location_cache + if os.path.exists("blah_load_config.sh") and os.access("blah_load_config.sh", os.R_OK): + cmd = 'source blah_load_config.sh && echo "$pbs_binpath/qstat"' + else: + cmd = 'which qstat' + child_stdout = os.popen(cmd) + output = child_stdout.read() + location = output.split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine qstat location: %s" % output) + _qstat_location_cache = location + return location + +job_id_re = re.compile("\s*Job Id: ([0-9]+[\.\w\-]+)") +exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") +status_re = re.compile("\s*job_state = ([QRECH])") +exit_status_re = re.compile("\s*exit_status = (-?[0-9]+)") +status_mapping = {"Q": 1, "R": 2, "E": 2, "C": 4, "H": 5} +def parse_qstat_fd(fd): + """ + Parse the stdout fd of "qstat -f" into a python dictionary containing + the information we need. 
+ """ + job_info = {} + cur_job_id = None + cur_job_info = {} + for line in pbsOutputFilter(fd): + line = line.strip() + m = job_id_re.match(line) + if m: + if cur_job_id: + job_info[cur_job_id] = cur_job_info + cur_job_id = m.group(1) + cur_job_info = {"BatchJobId": '"%s"' % cur_job_id.split(".")[0]} + continue + if cur_job_id == None: + continue + m = exec_host_re.match(line) + if m: + cur_job_info["WorkerNode"] = '"' + m.group(1).split("/")[0] + '"' + continue + m = status_re.match(line) + if m: + status = status_mapping.get(m.group(1), 0) + if status != 0: + cur_job_info["JobStatus"] = str(status) + continue + m = exit_status_re.match(line) + if m: + cur_job_info["ExitCode"] = ' %s' % m.group(1) + continue + if cur_job_id: + job_info[cur_job_id] = cur_job_info + return job_info + +def job_dict_to_string(info): + result = ["%s=%s;" % (i[0], i[1]) for i in info.items()] + return "[" + " ".join(result) + " ]" + +def fill_cache(cache_location): + results = qstat() + (fd, filename) = tempfile.mkstemp() + try: + for key, val in results.items(): + os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val))) + os.fsync(fd) + os.close(fd) + except: + os.unlink(filename) + raise + os.rename(filename, cache_location) + global launchtime + launchtime = time.time() + +cache_line_re = re.compile("([0-9]+[\.\w\-]+):\s+(.+)") +def cache_to_status(jobid, fd): + for line in fd.readlines(): + line = line.strip() + m = cache_line_re.match(line) + if m and m.group(1) == jobid: + return m.group(2) + +def check_cache(jobid, recurse=True): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/tmp", "qstat_cache_%s" % username) + if recurse: + try: + s = os.stat(cache_dir) + except OSError, oe: + if oe.errno != 2: + raise + os.mkdir(cache_dir) + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + cache_location = os.path.join(cache_dir, "results_cache") + try: + fd = open(cache_location, "a+") + except IOError, ie: + if ie.errno != 2: + raise + # Create an empty file so we can hold the file lock + fd = open(cache_location, "w+") + ExclusiveLock(fd) + # If someone grabbed the lock between when we opened and tried to + # acquire, they may have filled the cache + if os.stat(cache_location).st_size == 0: + fill_cache(cache_location) + fd.close() + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + ExclusiveLock(fd) + s = os.fstat(fd.fileno()) + if s.st_uid != uid: + raise Exception("Unable to check cache file because it is owned by UID %d" % s.st_uid) + if (s.st_size == 0) or (launchtime - s.st_mtime > cache_timeout): + # If someone filled the cache between when we opened the file and + # grabbed the lock, we may not need to fill the cache. + s2 = os.stat(cache_location) + if (s2.st_size == 0) or (launchtime - s2.st_mtime > cache_timeout): + fill_cache(cache_location) + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + return cache_to_status(jobid, fd) + +def main(): + # To debug, uncommenting these lines is useful. 
+ fd = open("/dev/null", "w") + old_stderr = os.dup(2) + os.dup2(fd.fileno(), 2) + if len(sys.argv) != 2: + print "1Usage: pbs_status.sh pbs//" + return 1 + jobid = sys.argv[1].split("/")[-1] + cache_contents = check_cache(jobid) + if not cache_contents: + results = qstat(jobid) + if not results: + print "1ERROR: Unable to find job %s" % jobid + print "0%s" % job_dict_to_string(results[jobid]) + else: + print "0%s" % cache_contents + return 0 + +if __name__ == "__main__": + try: + sys.exit(main()) + except SystemExit: + raise + except Exception, e: + print "1ERROR: %s" % str(e).replace("\n", "\\n") + sys.exit(0) + From 1d217fad9c6b54a5e543f7a9d050e77047be0bb1 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:39:09 -0600 Subject: [PATCH 040/169] SLURM support. Patches from http://jira.opensciencegrid.org/browse/CAMPUS-36. --- src/scripts/pbs_status.sh | 11 ++++++++++- src/scripts/pbs_submit.sh | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index df89941e..a58ec725 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -82,6 +82,9 @@ BEGIN { /Job Id:/ { current_job = substr($0, index($0, ":") + 2) + end = index(current_job, ".") + if ( end == 0 ) { end = length(current_job) + 1 } + current_job = substr(current_job, 1, end) } /exec_host =/ { current_wn = substr($0, index($0, "=")+2) @@ -176,11 +179,14 @@ BEGIN { current_job = "" current_wn = "" current_js = "" + exitcode = "-1" } /Job Id:/ { current_job = substr($0, index($0, ":") + 2) - current_job = substr(current_job, 1, index(current_job, ".")-1) + end = index(current_job, ".") + if ( end == 0 ) { end = length(current_job) + 1 } + current_job = substr(current_job, 1, end) print "[BatchJobId=\"" current_job "\";" } /exec_host =/ { @@ -209,6 +215,9 @@ END { } print "JobStatus=" jobstatus ";" if (jobstatus == 4) { + if (exitcode == "-1") { + exitcode = "0" + } print "ExitCode=" exitcode ";" } print "]" diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index e97e41dc..24c88a8c 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -197,6 +197,9 @@ if [ "$retcode" != "0" ] ; then exit 1 fi +# The job id is actually the first numbers in the string (slurm support) +jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` + if [ "x$pbs_nologaccess" != "xyes" -a "x$pbs_nochecksubmission" != "xyes" ]; then # Don't trust qsub retcode, it could have crashed From 5f2776689dc09c02682e20a0b857d1a217654284 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:47:17 -0600 Subject: [PATCH 041/169] Use pbs_status.py. 
--- src/CMakeLists.txt | 1 + src/job_status.c | 11 +++++++++-- src/scripts/Makefile.am | 3 ++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 549b40c2..f22041ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -136,6 +136,7 @@ set(blah_scripts scripts/sge_submit.sh scripts/sge_filestaging scripts/sge_hold.sh scripts/sge_status.sh scripts/runcmd.pl.template scripts/sge_local_submit_attributes.sh + scripts/pbs_status.py ) install(FILES diff --git a/src/job_status.c b/src/job_status.c index 411da3e2..ac26f22e 100644 --- a/src/job_status.c +++ b/src/job_status.c @@ -126,8 +126,15 @@ get_status(const char *jobDesc, classad_context *cad, char **deleg_parameters, c return(255); } - exec_command.command = make_message("%s/%s_status.sh %s %s", blah_script_location, - spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + if (strcmp(spid->lrms, "pbs") == 0) { + exec_command.command = make_message("%s/%s_status.py %s %s", blah_script_location, + spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + } + else + { + exec_command.command = make_message("%s/%s_status.sh %s %s", blah_script_location, + spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + } if (exec_command.command == NULL) { fprintf(stderr, "blahpd: out of memory"); diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index f1718096..987b55f6 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -36,6 +36,7 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ lsf_cancel.sh lsf_status.sh lsf_submit.sh lsf_hold.sh lsf_resume.sh \ condor_cancel.sh condor_status.sh condor_submit.sh condor_hold.sh condor_resume.sh \ sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \ - sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh + sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ + pbs_status.py EXTRA_DIST = $(bin_SCRIPTS) From c6ae9d0678d37660725ea2ee7ba36d31acdb604d Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 08:54:19 -0600 Subject: [PATCH 042/169] Fix bad port of classad.patch from Condor distribution. --- src/classad_binary_op_unwind.C | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/classad_binary_op_unwind.C b/src/classad_binary_op_unwind.C index 570466a0..bd352698 100644 --- a/src/classad_binary_op_unwind.C +++ b/src/classad_binary_op_unwind.C @@ -42,7 +42,7 @@ #include "classad/classad_distribution.h" #include "classad_binary_op_unwind.h" -#ifdef 1 +#if 1 namespace classad { #endif @@ -221,6 +221,6 @@ UnparseAux( std::string &buffer, std::string &fnName, std::vector& ar return; } -#ifdef 1 +#if 1 } // end of classad namespace #endif From 960253ee628bc661f924908ff079d7cc6fcff411 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 09:03:28 -0600 Subject: [PATCH 043/169] Specify initdir manually, as it does not seem to be defined on RHEL6. 
--- config/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/config/Makefile.am b/config/Makefile.am index ac2401a6..c48e81bf 100644 --- a/config/Makefile.am +++ b/config/Makefile.am @@ -26,6 +26,7 @@ EXTRA_DIST = blah.config.template \ glite-ce-blah-parser \ glite-ce-check-blparser +initdir = rc.d/init.d bldir = $(sysconfdir)/$(initdir) bl_SCRIPTS = glite-ce-blah-parser glite-ce-check-blparser From f624dcf23478980cc9ebb48ef9b8ff16337ae9c9 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 13:57:53 -0600 Subject: [PATCH 044/169] Cleanup 'git status' after an automake build. --- .gitignore | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/.gitignore b/.gitignore index 50d4e8be..e6108ade 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,60 @@ CMakeCache.txt CMakeFiles CPackSourceConfig.cmake CPackConfig.cmake +src/*.o +Makefile +Makefile.in +config/Makefile +config/Makefile.in +src/Makefile +src/Makefile.in +aclocal.m4 +autom4te.cache +config.log +config.status +configure +doc/blah_check_config.1 +doc/blah_job_registry_add.1 +/blah_job_registry_dump.1 +doc/blah_job_registry_lkup.1 +doc/blah_job_registry_scan_by_subject.1 +doc/blah_job_registry_dump.1 +doc/blahpd.1 +libtool +project/compile +project/config.guess +project/config.sub +project/depcomp +project/install-sh +project/libtool.m4 +project/ltmain.sh +project/ltoptions.m4 +project/ltsugar.m4 +project/ltversion.m4 +project/lt~obsolete.m4 +project/missing +src/.deps/ +src/BLClient +src/BLParserLSF +src/BLParserPBS +src/BNotifier +src/BUpdaterCondor +src/BUpdaterLSF +src/BUpdaterPBS +src/BUpdaterSGE +src/autogen/ +src/blah_check_config +src/blah_job_registry_add +src/blah_job_registry_dump +src/blah_job_registry_lkup +src/blah_job_registry_purge +src/blah_job_registry_scan_by_subject +src/blahpd +src/blahpd_daemon +src/blparser_master +src/test_cmdbuffer +src/test_job_registry_access +src/test_job_registry_create +src/test_job_registry_purge +src/test_job_registry_update +src/test_job_registry_update_from_network From 19bdb682c8902a8fd7ce3f28c9a769dc6d169c3b Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 17:33:45 -0600 Subject: [PATCH 045/169] Provide more descriptive error message when failing to limit proxy. 
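Because blahp result lines are space-delimited, any error text carried back to the caller must have its blanks escaped; the hunk below does this with the existing escape_spaces() helper before splicing the message into the result line. A sketch of the convention (Python for brevity, names hypothetical, and assuming escape_spaces() simply backslash-escapes blanks, which is what its use here implies):

    def escape_spaces(msg):
        return msg.replace(" ", "\\ ")

    def limit_proxy_failure_line(req_id, errmsg):
        # e.g. "42 1 Unable\ to\ limit\ the\ proxy\ (Permission\ denied) N/A"
        return "%s 1 Unable\\ to\\ limit\\ the\\ proxy\\ (%s) N/A" \
            % (req_id, escape_spaces(errmsg))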
--- src/server.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/src/server.c b/src/server.c index 59ae2859..551ec8b7 100644 --- a/src/server.c +++ b/src/server.c @@ -143,7 +143,7 @@ int set_cmd_list_option(char **command, classad_context cad, const char *attribu int set_cmd_string_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); int set_cmd_int_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); int set_cmd_bool_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); -char *limit_proxy(char* proxy_name, char *requested_name); +static char *limit_proxy(char* proxy_name, char *requested_name, char **error_message); int getProxyInfo(char* proxname, char** subject, char** fqan); int logAccInfo(char* jobId, char* server_lrms, classad_context cad, char* fqan, char* userDN, char** environment); int CEReq_parse(classad_context cad, char* filename, char *proxysubject, char *proxyfqan); @@ -961,7 +961,7 @@ cmd_set_glexec_dn(void *args) /* proxt4 must be limited for subsequent submission */ if(argv[3][0]=='0') { - if((proxynameNew = limit_proxy(proxt4, NULL)) == NULL) + if((proxynameNew = limit_proxy(proxt4, NULL, NULL)) == NULL) { free(mapping_parameter[MEXEC_PARAM_DELEGCRED]); mapping_parameter[MEXEC_PARAM_DELEGCRED] = NULL; @@ -1153,10 +1153,14 @@ cmd_submit_job(void *args) else if (proxyname != NULL) { /* not in glexec mode: need to limit the proxy */ - if((proxynameNew = limit_proxy(proxyname, NULL)) == NULL) + char *errmsg; + if((proxynameNew = limit_proxy(proxyname, NULL, &errmsg)) == NULL) { /* PUSH A FAILURE */ - resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy N/A", reqId); + char * escaped_errmsg = (errmsg) ? escape_spaces(errmsg) : NULL; + if (escaped_errmsg) resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy\\ (%s) N/A", reqId, escaped_errmsg); + else resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy N/A", reqId); + if (errmsg) free(errmsg); goto cleanup_proxyname; } free(proxyname); @@ -1988,7 +1992,7 @@ cmd_renew_proxy(void *args) case 1: /* job queued: copy the proxy locally */ if (!use_mapping) { - limit_proxy(proxyFileName, old_proxy); /*FIXME: should check if limited proxies are enabled? */ + limit_proxy(proxyFileName, old_proxy, NULL); /*FIXME: should check if limited proxies are enabled? 
*/ resultLine = make_message("%s 0 Proxy\\ renewed", reqId); } else @@ -1999,7 +2003,7 @@ cmd_renew_proxy(void *args) { exe_command.source_proxy = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_SRCPROXY]; } else { - limited_proxy_name = limit_proxy(proxyFileName, NULL); + limited_proxy_name = limit_proxy(proxyFileName, NULL, NULL); exe_command.source_proxy = limited_proxy_name; } exe_command.dest_proxy = old_proxy; @@ -2115,7 +2119,7 @@ cmd_send_proxy_to_worker_node(void *args) { if(!use_glexec) { - proxyFileNameNew = limit_proxy(proxyFileName, NULL); + proxyFileNameNew = limit_proxy(proxyFileName, NULL, NULL); } else proxyFileNameNew = strdup(argv[CMD_SEND_PROXY_TO_WORKER_NODE_ARGS + MEXEC_PARAM_SRCPROXY + 1]); @@ -2588,8 +2592,8 @@ set_cmd_list_option(char **command, classad_context cad, const char *attribute, return(result); } -char * -limit_proxy(char* proxy_name, char *limited_proxy_name) +static char * +limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) { int seconds_left, hours_left, minutes_left; char *limcommand; @@ -2611,6 +2615,31 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) limited_proxy_name = limited_proxy_made_up_name; } + /* Sanity check - make sure the destination is writable and the source exists */ + tmpfd = open(limited_proxy_name, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); + if (tmpfd == -1) + { + char * errmsg = make_message("Unable to create limited proxy file (%s):" + " errno=%d, %s", limited_proxy_name, errno, strerror(errno)); + if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (!errmsg) return(NULL); + if (error_message) *error_message = errmsg; else free(errmsg); + return NULL; + } + else + { + close(tmpfd); + } + if ((tmpfd = open(proxy_name, O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR)) == -1) + { + char * errmsg = make_message("Unable to read proxy file (%s):" + " errno=%d, %s", proxy_name, errno, strerror(errno)); + if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (!errmsg) return(NULL); + if (error_message) *error_message = errmsg; else if (errmsg) free(errmsg); + return NULL; + } + globuslocation = (getenv("GLOBUS_LOCATION") ? 
getenv("GLOBUS_LOCATION") : "/opt/globus"); exe_command.command = make_message("%s/bin/grid-proxy-info -timeleft -file %s", globuslocation, proxy_name); @@ -2625,7 +2654,10 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) if (res != 0) { perror("blahpd error invoking grid-proxy-info"); + char * errmsg = make_message("blahpd error invoking grid-proxy-info; " + "exit code %d from grid-proxy-info"); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message = errmsg; else if (errmsg) free(errmsg); return(NULL); } else @@ -2675,7 +2707,9 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) if (fpr == NULL) { fprintf(stderr, "blahpd limit_proxy: Cannot open %s in append mode to obtain file lock: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot open %s in append mode to obtain file lock: %s", limited_proxy_name, strerror(errno)); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } /* Acquire lock on limited proxy */ @@ -2687,7 +2721,9 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fclose(fpr); fprintf(stderr, "blahpd limit_proxy: Cannot obtain write file lock on %s: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot obtain write file lock on %s: %s", limited_proxy_name, strerror(errno)); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } } @@ -2714,15 +2750,19 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) /* the call may have been successful. We just check the temporary proxy */ if (exe_command.exit_code != 0) { + int orig_exit_code = exe_command.exit_code; cleanup_cmd(&exe_command); exe_command.command = make_message("%s/bin/grid-proxy-info -f %s", globuslocation, limit_command_output); res = execute_cmd(&exe_command); free(exe_command.command); if (res != 0 || exe_command.exit_code != 0) { + char * errmsg = make_message("Failed to create limited proxy %s (grid-proxy-init " + "exit_code = %d; grid-proxy-info exit code %d)", limit_command_output, orig_exit_code, res != 0 ? 
res : exe_command.exit_code); if (limit_command_output != limited_proxy_name) free(limit_command_output); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if(errmsg) free(errmsg); return(NULL); } } @@ -2738,8 +2778,10 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fprintf(stderr, "blahpd limit_proxy: Cannot open %s in append mode to obtain file lock: %s\n", limited_proxy_name, strerror(errno)); unlink(limit_command_output); + char * errmsg = make_message("Cannot open %s in append mode to obtain file lock: %s", limited_proxy_name, strerror(errno)); free(limit_command_output); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } /* Acquire lock on limited proxy */ @@ -2751,9 +2793,11 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fclose(fpr); fprintf(stderr, "blahpd limit_proxy: Cannot obtain write file lock on %s: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot obtain write file lock on %s: %s", limited_proxy_name, strerror(errno)); unlink(limit_command_output); free(limit_command_output); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else free(errmsg); return(NULL); } } From f68f1d8f635bb1368eb3b32b84dfa6568b8605e0 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 13 Dec 2012 17:35:39 -0600 Subject: [PATCH 046/169] Fix handling of PBS job ids - use only the ID prior to the cluster name. Also necessary for SLURM support. --- src/scripts/pbs_status.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 60f4a262..e001303d 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -336,6 +336,7 @@ def fill_cache(cache_location): (fd, filename) = tempfile.mkstemp() try: for key, val in results.items(): + key.split(".")[0] os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val))) os.fsync(fd) os.close(fd) @@ -410,13 +411,14 @@ def main(): if len(sys.argv) != 2: print "1Usage: pbs_status.sh pbs//" return 1 - jobid = sys.argv[1].split("/")[-1] + jobid = sys.argv[1].split("/")[-1].split(".")[0] cache_contents = check_cache(jobid) if not cache_contents: results = qstat(jobid) - if not results: + if not results or jobid not in results: print "1ERROR: Unable to find job %s" % jobid - print "0%s" % job_dict_to_string(results[jobid]) + else: + print "0%s" % job_dict_to_string(results[jobid]) else: print "0%s" % cache_contents return 0 From ac1ed439ffde34bd8e4a1941ab78b7c7ec03669a Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Thu, 17 Jan 2013 13:08:19 -0600 Subject: [PATCH 047/169] Removing the hostname to the job commit. pbs_submit.sh was changed for slurm support to only look at the job id numbers, not at the hostname (slurm output is weird). pbs_status.py would look for the entire jobid, with hostname, but the gahp would only give the number, therefore pbs_status.py could not find the job. 
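In other words, both sides must reduce a job id to the same cache key. A sketch of the shared normalization (hypothetical helper; the real code is the awk match() in pbs_submit.sh and the split(".")[0] calls in pbs_status.py):

    import re

    def normalize_jobid(raw):
        # "pbs/20130101/123.server.example" and "123.server.example"
        # both reduce to "123"
        m = re.match("([0-9]+)", raw.split("/")[-1])
        if m:
            return m.group(1)
        return raw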
--- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index e001303d..871dadde 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -285,7 +285,7 @@ def get_qstat_location(): _qstat_location_cache = location return location -job_id_re = re.compile("\s*Job Id: ([0-9]+[\.\w\-]+)") +job_id_re = re.compile("\s*Job Id: ([0-9]+)[\.\w\-]+") exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") status_re = re.compile("\s*job_state = ([QRECH])") exit_status_re = re.compile("\s*exit_status = (-?[0-9]+)") From 0ab7a44ead9ecda5b2dd0ce282c19877904418df Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 14 May 2013 19:28:12 -0500 Subject: [PATCH 048/169] Updates of locking algorithm. Implement debug logging. --- src/scripts/pbs_status.py | 186 ++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 100 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index e001303d..4806a77c 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -21,6 +21,13 @@ # limitations under the License. # +""" +Query PBS (or SLURM with the PBS emulation layer) for the status of a given job + +Internally, it creates a cache of the PBS qstat response and will reuse this +for subsequent queries. +""" + import os import re import pwd @@ -28,90 +35,50 @@ import time import errno import fcntl +import random import struct +import signal import tempfile cache_timeout = 60 launchtime = time.time() -# This function was written by me for an older project, the OSG-GIP -def pbsOutputFilter(fp): +def log(msg): """ - PBS can be a pain to work with because it automatically cuts - lines off at 80 chars and continues the line on the next line. For - example:: - - Server: red - server_state = Active - server_host = red.unl.edu - scheduling = True - total_jobs = 2996 - state_count = Transit:0 Queued:2568 Held:0 Waiting:0 Running:428 Exiting - :0 Begun:0 - acl_roots = t3 - managers = mfurukaw@red.unl.edu,root@t3 - - This function puts the line ":0 Begun:0" with the above line. It's meant - to filter the output, so you should "scrub" PBS output like this:: - - fp = runCommand() - for line in pbsOutputFilter(fp): - ... parse line ... - - This function uses iterators + A very lightweight log - not meant to be used in production, but helps + when debugging scale tests """ - class PBSIter: - """ - An iterator for PBS output; this allows us to easily parse over - PBS-style line continuations. - """ - - def __init__(self, fp): - self.fp = fp - self.fp_iter = fp.__iter__() - self.prevline = None - self.done = False - - def next(self): - """ - Return the next full line of output for the iterator. - """ - if self.prevline == None: - line = self.fp_iter.next() - if line.startswith('\t'): - # Bad! The output shouldn't start with a - # partial line - raise ValueError("PBS output contained bad data.") - self.prevline = line - return self.next() - if self.done: - raise StopIteration() - try: - line = self.fp_iter.next() - if line.startswith('\t'): - self.prevline = self.prevline[:-1] + line[1:-1] - return self.next() - else: - old_line = self.prevline - self.prevline = line - return old_line - except StopIteration: - self.done = True - return self.prevline - - class PBSFilter: - """ - An iterable object based upon the PBSIter iterator. 
- """ - - def __init__(self, myiter): - self.iter = myiter - - def __iter__(self): - return self.iter - - return PBSFilter(PBSIter(fp)) + print >> sys.stderr, time.strftime("%x %X"), os.getpid(), msg + +def createCacheDir(): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + + try: + os.mkdir(cache_dir, 0755) + except OSError, oe: + if oe.errno != errno.EEXIST: + raise + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + + return cache_dir + +def initLog(): + """ + Determine whether to create a logfile based on the presence of a file + in the user's qstat cache directory. If so, make the logfile there. + """ + cache_dir = createCacheDir() + if os.path.exists(os.path.join(cache_dir, "pbs_status.debug")): + filename = os.path.join(cache_dir, "pbs_status.log") + else: + filename = "/dev/null" + fd = open(filename, "a") + os.dup2(fd.fileno(), 2) # Something else from a prior life - see gratia-probe-common's GratiaWrapper.py def ExclusiveLock(fd, timeout=120): @@ -135,8 +102,10 @@ def ExclusiveLock(fd, timeout=120): # An alternate would be to block on the lock, and use signals to interupt. # This would mess up Gratia's flawed use of signals already, and not be # able to report on who has the lock. I don't like indefinite waits! - max_tries = 5 - for tries in range(1, max_tries+1): + max_time = 30 + starttime = time.time() + tries = 1 + while time.time() - starttime < max_time: try: fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) return @@ -156,10 +125,13 @@ def ExclusiveLock(fd, timeout=120): except IOError, ie: if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): raise - print >> sys.stderr, "Unable to acquire lock, try %i; will sleep for %i " \ - "seconds and try %i more times." % (tries, tries, max_tries-tries) - time.sleep(tries) + sleeptime = random.random() + log("Unable to acquire lock, try %i; will sleep for %.2f " \ + "seconds and try for %.2f more seconds." % (tries, sleeptime, max_time - time.time()-starttime)) + tries += 1 + time.sleep(sleeptime) + log("Fatal exception - Unable to acquire lock") raise Exception("Unable to acquire lock") def check_lock(fd, timeout): @@ -176,19 +148,17 @@ def check_lock(fd, timeout): return True if timeout < 0: - print >> sys.stderr, "Another process, %d, holds the cache lock." % pid + log("Another process, %d, holds the cache lock." % pid) return False try: age = get_pid_age(pid) except: - print >> sys.stderr, "Another process, %d, holds the cache lock." % pid - print >> sys.stderr, "Unable to get the other process's age; will not time " \ - "it out." + log("Another process, %d, holds the cache lock." % pid) + log("Unable to get the other process's age; will not time it out.") return False - print >> sys.stderr, "Another process, %d (age %d seconds), holds the cache " \ - "lock." % (pid, age) + log("Another process, %d (age %d seconds), holds the cache lock." % (pid, age)) if age > timeout: os.kill(pid, signal.SIGKILL) @@ -227,9 +197,9 @@ def get_lock_pid(fd): except IOError, ie: if ie.errno != errno.EINVAL: raise - print >> sys.stderr, "Unable to determine which PID has the lock due to a " \ + log("Unable to determine which PID has the lock due to a " \ "python portability failure. Contact the developers with your" \ - " platform information for support." 
+ " platform information for support.") return False if sys.platform == "darwin": _, _, pid, _, _ = struct.unpack("QQihh", result) @@ -250,9 +220,12 @@ def qstat(jobid=""): Returns a python dictionary with the job info. """ qstat = get_qstat_location() - child_stdout = os.popen("%s -f %s" % (qstat, jobid)) + starttime = time.time() + log("Starting qstat.") + child_stdout = os.popen("%s -f -1 %s" % (qstat, jobid)) result = parse_qstat_fd(child_stdout) exit_status = child_stdout.close() + log("Finished qstat (time=%f)." % (time.time()-starttime)) if exit_status: exit_code = 0 if os.WIFEXITED(exit_status): @@ -285,7 +258,7 @@ def get_qstat_location(): _qstat_location_cache = location return location -job_id_re = re.compile("\s*Job Id: ([0-9]+[\.\w\-]+)") +job_id_re = re.compile("\s*Job Id:\s([0-9]+)([\w\-\/.]*)") exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") status_re = re.compile("\s*job_state = ([QRECH])") exit_status_re = re.compile("\s*exit_status = (-?[0-9]+)") @@ -298,13 +271,14 @@ def parse_qstat_fd(fd): job_info = {} cur_job_id = None cur_job_info = {} - for line in pbsOutputFilter(fd): + for line in fd: line = line.strip() m = job_id_re.match(line) if m: if cur_job_id: job_info[cur_job_id] = cur_job_info cur_job_id = m.group(1) + #print cur_job_id, line cur_job_info = {"BatchJobId": '"%s"' % cur_job_id.split(".")[0]} continue if cur_job_id == None: @@ -332,11 +306,13 @@ def job_dict_to_string(info): return "[" + " ".join(result) + " ]" def fill_cache(cache_location): + log("Starting query to fill cache.") results = qstat() + log("Finished query to fill cache.") (fd, filename) = tempfile.mkstemp() try: for key, val in results.items(): - key.split(".")[0] + key = key.split(".")[0] os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val))) os.fsync(fd) os.close(fd) @@ -358,18 +334,18 @@ def cache_to_status(jobid, fd): def check_cache(jobid, recurse=True): uid = os.geteuid() username = pwd.getpwuid(uid).pw_name - cache_dir = os.path.join("/tmp", "qstat_cache_%s" % username) + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) if recurse: try: s = os.stat(cache_dir) except OSError, oe: if oe.errno != 2: raise - os.mkdir(cache_dir) + os.mkdir(cache_dir, 0755) s = os.stat(cache_dir) if s.st_uid != uid: raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) - cache_location = os.path.join(cache_dir, "results_cache") + cache_location = os.path.join(cache_dir, "blahp_results_cache") try: fd = open(cache_location, "a+") except IOError, ie: @@ -404,22 +380,32 @@ def check_cache(jobid, recurse=True): return cache_to_status(jobid, fd) def main(): - # To debug, uncommenting these lines is useful. - fd = open("/dev/null", "w") - old_stderr = os.dup(2) - os.dup2(fd.fileno(), 2) + initLog() + if len(sys.argv) != 2: print "1Usage: pbs_status.sh pbs//" return 1 jobid = sys.argv[1].split("/")[-1].split(".")[0] - cache_contents = check_cache(jobid) + log("Checking cache for jobid %s" % jobid) + try: + cache_contents = check_cache(jobid) + except Exception, e: + msg = "1ERROR: Internal exception, %s" % str(e) + log(msg) + print msg if not cache_contents: + log("Jobid %s not in cache; querying PBS" % jobid) results = qstat(jobid) + log("Finished querying PBS for jobid %s" % jobid) if not results or jobid not in results: + log("1ERROR: Unable to find job %s" % jobid) print "1ERROR: Unable to find job %s" % jobid else: + log("0%s" % job_dict_to_string(results[jobid])) print "0%s" % job_dict_to_string(results[jobid]) else: + log("Jobid %s in cache." 
% jobid) + log("0%s" % cache_contents) print "0%s" % cache_contents return 0 From e0983e8ff9c3d21ac2f2c6474987c5f17e683581 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 14 May 2013 20:46:21 -0500 Subject: [PATCH 049/169] Improve condor status caching. Port of condor-status.patch from HTCondor build. --- src/scripts/condor_status.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/scripts/condor_status.sh b/src/scripts/condor_status.sh index e0dc6ff7..94dbd779 100755 --- a/src/scripts/condor_status.sh +++ b/src/scripts/condor_status.sh @@ -252,6 +252,15 @@ for job in $* ; do fi fi + # Caching of condor_q output doesn't appear to work properly in + # HTCondor builds of the blahp. So do an explicit condor_q for + # this job before trying condor_history, which can take a long time. + line=$(echo $FORMAT | xargs $condor_binpath/condor_q $target $id) + if [ -n "$line" ] ; then + echo "0$(make_ad $job "$line")" + exit 0 + fi + ### WARNING: This is troubling because the remote history file ### might just happen to be in the same place as a local history ### file, in which case condor_history is going to be looking at @@ -263,7 +272,7 @@ for job in $* ; do # instead of using -f. history_file=$($condor_binpath/condor_config_val $target -schedd history) if [ "$?" == "0" ]; then - line=$(echo $FORMAT | _condor_HISTORY="$history_file" xargs $condor_binpath/condor_history -f $history_file -backwards $id) + line=$(echo $FORMAT | _condor_HISTORY="$history_file" xargs $condor_binpath/condor_history -f $history_file -backwards -match 1 $id) if [ ! -z "$line" ] ; then echo "0$(make_ad $job "$line")" exit 0 From eca11540fae63c7156f0a63109c25a74f912d94e Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 14 May 2013 20:47:15 -0500 Subject: [PATCH 050/169] Remove empty output lines. Port of lsf-jobid.patch from HTCondor build. --- src/scripts/lsf_submit.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/lsf_submit.sh b/src/scripts/lsf_submit.sh index 8f4f7126..09b34d54 100755 --- a/src/scripts/lsf_submit.sh +++ b/src/scripts/lsf_submit.sh @@ -178,6 +178,9 @@ fi jobID=`echo "$bsub_out" | awk -F" " '{ print $2 }' | sed "s/>//" |sed "s/ Date: Tue, 14 May 2013 21:22:56 -0500 Subject: [PATCH 051/169] Switch proxy creation to use Globus library calls. Equivalent to proxy-tools.patch from HTCondor. 
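Calling the Globus GSI libraries in-process replaces the fork/exec calls to grid-proxy-info and grid-proxy-init, along with the exit-code and output parsing built around them. The only subtle policy in the new grid_proxy_init() below is how the requested lifetime is clamped; sketched in Python for clarity (the C code is authoritative):

    def clamp_lifetime(requested, src_time_left):
        # 0 means "inherit the source proxy's remaining lifetime";
        # asking for more than the source has left is silently shortened
        if requested == 0 or requested > src_time_left:
            return src_time_left
        return requested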
--- configure.ac | 1 + src/Makefile.am | 2 +- src/server.c | 225 +++++++++++++++++++++++++++++++++++------------- 3 files changed, 168 insertions(+), 60 deletions(-) diff --git a/configure.ac b/configure.ac index 5de2f191..b3b0aab2 100755 --- a/configure.ac +++ b/configure.ac @@ -245,6 +245,7 @@ if test $have_globus = no; then PKG_CHECK_MODULES(GLOBUS_GSI_UTILS, globus-gsi-cert-utils, , have_globus=no) PKG_CHECK_MODULES(GLOBUS_GSS_ASSIST, globus-gss-assist, , have_globus=no) PKG_CHECK_MODULES(GLOBUS_GSI_SYSCFG, globus-gsi-sysconfig, , have_globus=no) + PKG_CHECK_MODULES(GLOBUS_GSSAPI_GSI, globus-gssapi-gsi, , have_globus=no) fi AC_MSG_RESULT(["GLOBUS found: $have_globus"]) AM_CONDITIONAL([HAVE_GLOBUS], [test "x$bprserver" == "xyes" -a "x$have_globus" == "xyes"]) diff --git a/src/Makefile.am b/src/Makefile.am index 3370300a..4b3cd62e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -57,7 +57,7 @@ blahpd_SOURCES = main.c $(common_sources) blahpd_daemon_SOURCES = main_daemon.c $(common_sources) -blahpd_LDADD = $(CLASSAD_LIBS) +blahpd_LDADD = $(CLASSAD_LIBS) $(GLOBUS_GSSSAPI_GSI_LIBS) $(GLOBUS_GSS_ASSIST_LIBS) blahpd_daemon_LDADD = $(blahpd_LDADD) diff --git a/src/server.c b/src/server.c index 551ec8b7..2c137a5a 100644 --- a/src/server.c +++ b/src/server.c @@ -84,6 +84,9 @@ #include #include +#include "globus_gsi_credential.h" +#include "globus_gsi_proxy.h" + #include "blahpd.h" #include "config.h" #include "job_registry.h" @@ -2591,6 +2594,160 @@ set_cmd_list_option(char **command, classad_context cad, const char *attribute, if (to_append) free (to_append); return(result); } + +const char *grid_proxy_errmsg = NULL; + +int activate_globus() +{ + static int active = 0; + + if (active) { + return 0; + } + + if ( globus_thread_set_model( "pthread" ) ) { + grid_proxy_errmsg = "failed to activate Globus"; + return -1; + } + + if ( globus_module_activate(GLOBUS_GSI_CREDENTIAL_MODULE) ) { + grid_proxy_errmsg = "failed to activate Globus"; + return -1; + } + + if ( globus_module_activate(GLOBUS_GSI_PROXY_MODULE) ) { + grid_proxy_errmsg = "failed to activate Globus"; + return -1; + } + + active = 1; + return 0; +} + +/* Returns lifetime left on proxy, in seconds. + * 0 means proxy is expired. + * -1 means an error occurred. + */ +int grid_proxy_info(const char *proxy_filename) +{ + globus_gsi_cred_handle_t handle = NULL; + time_t time_left = -1; + + if ( activate_globus() < 0 ) { + return -1; + } + + if (globus_gsi_cred_handle_init(&handle, NULL)) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // We should have a proxy file, now, try to read it + if (globus_gsi_cred_read_proxy(handle, proxy_filename)) { + grid_proxy_errmsg = "unable to read proxy file"; + goto cleanup; + } + + if (globus_gsi_cred_get_lifetime(handle, &time_left)) { + grid_proxy_errmsg = "unable to extract expiration time"; + goto cleanup; + } + + if ( time_left < 0 ) { + time_left = 0; + } + + cleanup: + if (handle) { + globus_gsi_cred_handle_destroy(handle); + } + + return time_left; +} + +/* Writes new proxy derived from existing one. Argument lifetime is the + * number of seconds until expiration for the new proxy. A 0 lifetime + * means the same expiration time as the source proxy. + * Returns 0 on success and -1 on error. 
+ */ +int grid_proxy_init(const char *src_filename, char *dst_filename, + int lifetime) +{ + globus_gsi_cred_handle_t src_handle = NULL; + globus_gsi_cred_handle_t dst_handle = NULL; + globus_gsi_proxy_handle_t dst_proxy_handle = NULL; + int rc = -1; + time_t src_time_left = -1; + globus_gsi_cert_utils_cert_type_t cert_type = GLOBUS_GSI_CERT_UTILS_TYPE_LIMITED_PROXY; + + if ( activate_globus() < 0 ) { + return -1; + } + + if (globus_gsi_cred_handle_init(&src_handle, NULL)) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // We should have a proxy file, now, try to read it + if (globus_gsi_cred_read_proxy(src_handle, src_filename)) { + grid_proxy_errmsg = "unable to read proxy file"; + goto cleanup; + } + + if (globus_gsi_cred_get_lifetime(src_handle, &src_time_left)) { + grid_proxy_errmsg = "unable to extract expiration time"; + goto cleanup; + } + if ( src_time_left < 0 ) { + src_time_left = 0; + } + + if (globus_gsi_proxy_handle_init( &dst_proxy_handle, NULL )) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // lifetime == desired dst lifetime + // src_time_left == time left on src + if ( lifetime == 0 || lifetime > src_time_left ) { + lifetime = src_time_left; + } + if (globus_gsi_proxy_handle_set_time_valid( dst_proxy_handle, lifetime/60 )) { + grid_proxy_errmsg = "unable to set proxy expiration time"; + goto cleanup; + } + + if (globus_gsi_proxy_handle_set_type( dst_proxy_handle, cert_type)) { + grid_proxy_errmsg = "unable to set proxy type"; + goto cleanup; + } + + if (globus_gsi_proxy_create_signed( dst_proxy_handle, src_handle, &dst_handle)) { + grid_proxy_errmsg = "unable to generate proxy"; + goto cleanup; + } + + if (globus_gsi_cred_write_proxy( dst_handle, dst_filename )) { + grid_proxy_errmsg = "unable to write proxy file"; + goto cleanup; + } + + rc = 0; + + cleanup: + if (src_handle) { + globus_gsi_cred_handle_destroy(src_handle); + } + if (dst_handle) { + globus_gsi_cred_handle_destroy(dst_handle); + } + if ( dst_handle ) { + globus_gsi_proxy_handle_destroy( dst_proxy_handle ); + } + + return rc; +} static char * limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) @@ -2598,7 +2755,6 @@ limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) int seconds_left, hours_left, minutes_left; char *limcommand; int res; - char* globuslocation; char *limit_command_output; int tmpfd; exec_cmd_t exe_command = EXEC_CMD_DEFAULT; @@ -2639,31 +2795,15 @@ limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) if (error_message) *error_message = errmsg; else if (errmsg) free(errmsg); return NULL; } - - globuslocation = (getenv("GLOBUS_LOCATION") ? 
getenv("GLOBUS_LOCATION") : "/opt/globus"); - exe_command.command = make_message("%s/bin/grid-proxy-info -timeleft -file %s", - globuslocation, proxy_name); - if (exe_command.command == NULL) + else { - fprintf(stderr, "blahpd: out of memory\n"); - exit(1); + close(tmpfd); } - res = execute_cmd(&exe_command); - free(exe_command.command); - if (res != 0) - { - perror("blahpd error invoking grid-proxy-info"); - char * errmsg = make_message("blahpd error invoking grid-proxy-info; " - "exit code %d from grid-proxy-info"); - if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); - if (error_message && errmsg) *error_message = errmsg; else if (errmsg) free(errmsg); - return(NULL); - } - else - { - seconds_left = atoi(exe_command.output); - cleanup_cmd(&exe_command); + seconds_left = grid_proxy_info( proxy_name ); + if ( seconds_left < 0 ) { + perror("blahpd error reading proxy lifetime"); + return NULL; } limit_command_output = make_message("%s_XXXXXX", limited_proxy_name); @@ -2686,18 +2826,9 @@ limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) get_lock_on_limited_proxy = config_test_boolean(config_get("blah_get_lock_on_limited_proxies",blah_config_handle)); - if (seconds_left <= 0) - { + if (seconds_left <= 0) { /* Something's wrong with the current proxy - use defaults */ - exe_command.command = make_message("%s/bin/grid-proxy-init -limited -cert %s -key %s -out %s", - globuslocation, proxy_name, proxy_name, limit_command_output); - } - else - { - hours_left = (int)(seconds_left/3600); - minutes_left = (int)((seconds_left%3600)/60) + 1; - exe_command.command = make_message("%s/bin/grid-proxy-init -limited -valid %d:%d -cert %s -key %s -out %s", - globuslocation, hours_left, minutes_left, proxy_name, proxy_name, limit_command_output); + seconds_left = 12*60*60; } if ((limit_command_output == limited_proxy_name) && @@ -2728,8 +2859,7 @@ limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) } } - res = execute_cmd(&exe_command); - free(exe_command.command); + res = grid_proxy_init( proxy_name, limit_command_output, seconds_left ); if ((limit_command_output == limited_proxy_name) && get_lock_on_limited_proxy) @@ -2746,29 +2876,6 @@ limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) return(NULL); } - /* If exitcode != 0 there may be a problem due to a warning by grid-proxy-init but */ - /* the call may have been successful. We just check the temporary proxy */ - if (exe_command.exit_code != 0) - { - int orig_exit_code = exe_command.exit_code; - cleanup_cmd(&exe_command); - exe_command.command = make_message("%s/bin/grid-proxy-info -f %s", globuslocation, limit_command_output); - res = execute_cmd(&exe_command); - free(exe_command.command); - if (res != 0 || exe_command.exit_code != 0) - { - char * errmsg = make_message("Failed to create limited proxy %s (grid-proxy-init " - "exit_code = %d; grid-proxy-info exit code %d)", limit_command_output, orig_exit_code, res != 0 ? 
res : exe_command.exit_code); - if (limit_command_output != limited_proxy_name) - free(limit_command_output); - if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); - if (error_message && errmsg) *error_message= errmsg; else if(errmsg) free(errmsg); - return(NULL); - } - } - - cleanup_cmd(&exe_command); - if (limit_command_output != limited_proxy_name) { if (get_lock_on_limited_proxy) From 53e9083bc5bc7f9ae4a2f70ca36e75f8a1fa8828 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 14 May 2013 21:24:06 -0500 Subject: [PATCH 052/169] Pass through memory requests. Equivalent to request-memory.patch from HTCondor. --- src/scripts/blah_common_submit_functions.sh | 3 ++- src/scripts/condor_submit.sh | 8 +++++++- src/scripts/pbs_submit.sh | 8 ++++++++ src/server.c | 3 ++- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 88d70841..57755b31 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -268,7 +268,7 @@ function bls_parse_submit_options () ############################################################### # Parse parameters ############################################################### - while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:D:" arg + while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:D:m:" arg do case "$arg" in a) bls_opt_xtra_args="$OPTARG" ;; @@ -299,6 +299,7 @@ function bls_parse_submit_options () R) bls_opt_outputflstringremap="$OPTARG" ;; C) bls_opt_req_file="$OPTARG";; D) bls_opt_run_dir="$OPTARG";; + m) bls_opt_req_mem="$OPTARG";; -) break ;; ?) echo $usage_string exit 1 ;; diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index d2302ea3..334c85fe 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -53,7 +53,7 @@ mpinodes=0 # Name of local requirements file: currently unused req_file="" -while getopts "a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:" arg +while getopts "a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:m:" arg do case "$arg" in a) xtra_args="$OPTARG" ;; @@ -78,6 +78,7 @@ do R) remaps="$OPTARG" ;; C) req_file="$OPTARG" ;; D) run_dir="$OPTARG" ;; + m) req_mem="$OPTARG" ;; -) break ;; ?) 
echo $usage_string exit 1 ;; @@ -240,6 +241,11 @@ then echo -e $xtra_args >> $submit_file fi +if [ "x$req_mem" != "x"] +then + echo "request_memory = $req_mem" >> $submit_file +fi + cat >> $submit_file << EOF # We insist on new style quoting in Condor arguments = $arguments diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 24c88a8c..54419fa8 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -111,6 +111,14 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh +if [ "x$bls_opt_req_mem" != "x" ] +then + # Different schedulers require different memory checks + echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file + echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file +fi + bls_set_up_local_and_extra_args # Write PBS directives according to command line options diff --git a/src/server.c b/src/server.c index 2c137a5a..bdc6157f 100644 --- a/src/server.c +++ b/src/server.c @@ -1308,7 +1308,8 @@ cmd_submit_job(void *args) (set_cmd_bool_option (&command, cad, "StageCmd", "-s", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_string_option(&command, cad, "ClientJobId","-j", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_string_option(&command, cad, "JobDirectory","-D", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || - (set_cmd_string_option(&command, cad, "BatchExtraSubmitArgs", "-a", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) + (set_cmd_string_option(&command, cad, "BatchExtraSubmitArgs", "-a", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || + (set_cmd_int_option(&command, cad, "RequestMemory", "-m", INT_NOQUOTE) == C_CLASSAD_OUT_OF_MEMORY)) // (set_cmd_string_option(&command, cad, "Args", "--", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) { /* PUSH A FAILURE */ From 7a5a30a4bdaf5604fea74c9dd92eebc3ed91b445 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 14 May 2013 21:26:35 -0500 Subject: [PATCH 053/169] Properly exit on error for SGE. Port of sge_helper.error-exit.patch from HTCondor. --- src/scripts/sge_helper | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/sge_helper b/src/scripts/sge_helper index 725f1347..55d83d26 100755 --- a/src/scripts/sge_helper +++ b/src/scripts/sge_helper @@ -203,6 +203,7 @@ foreach my $i ( 0 .. $#results ) { print "[ ", map( "$_ = $results[$i]->{$_}; ", keys %{$results[$i]} ), "]\n"; } else { print "Error\n"; + exit ( 1 ); } } elsif ( $jobstatus ) { printf( "%s %d %d %s %s OK\n", From 3f1bea0df2d9758b29481ad5e090bf50258a63bd Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Tue, 17 Sep 2013 11:11:01 -0500 Subject: [PATCH 054/169] Fixing debug output for lock acquisition. --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 4806a77c..264ca306 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -127,7 +127,7 @@ def ExclusiveLock(fd, timeout=120): raise sleeptime = random.random() log("Unable to acquire lock, try %i; will sleep for %.2f " \ - "seconds and try for %.2f more seconds." % (tries, sleeptime, max_time - time.time()-starttime)) + "seconds and try for %.2f more seconds." 
% (tries, sleeptime, max_time - (time.time()-starttime))) tries += 1 time.sleep(sleeptime) From 2bc743ebb5a6c327c07c5db97b0b2a483129d602 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Tue, 17 Sep 2013 11:11:51 -0500 Subject: [PATCH 055/169] Removing dup2 call since the blahp doesn't like closing stderr --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 264ca306..17fdcd5d 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -78,7 +78,7 @@ def initLog(): else: filename = "/dev/null" fd = open(filename, "a") - os.dup2(fd.fileno(), 2) + #os.dup2(fd.fileno(), 2) # Something else from a prior life - see gratia-probe-common's GratiaWrapper.py def ExclusiveLock(fd, timeout=120): From 1dbd0d73382a9dceddd532282a0a216eea9aa501 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Fri, 20 Sep 2013 08:11:06 -0500 Subject: [PATCH 056/169] Do not close original stderr. --- src/scripts/pbs_status.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 4806a77c..ba7d5b27 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -78,6 +78,9 @@ def initLog(): else: filename = "/dev/null" fd = open(filename, "a") + # Do NOT close the file descriptor blahp originally hands us for stderr. + # This causes blahp to lose all status updates. + os.dup(2) os.dup2(fd.fileno(), 2) # Something else from a prior life - see gratia-probe-common's GratiaWrapper.py From 7044a693ae14fdae02f44e05c13e43351e98a9ce Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 7 Jan 2014 21:28:27 -0600 Subject: [PATCH 057/169] Cleanup original proxy afterward. --- src/scripts/blah_common_submit_functions.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 57755b31..9c1e2a5c 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -717,7 +717,11 @@ function bls_start_job_wrapper () function bls_finish_job_wrapper () { echo "cd \$old_home" - + if [ "x$bls_opt_proxy_string" != "x" ] + then + echo "rm -f $bls_opt_proxy_string" + fi + echo "" echo "exit \$user_retcode" From 6c08440dcc2a2f61916488e6c5144258c59e07d3 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 7 Jan 2014 21:29:59 -0600 Subject: [PATCH 058/169] Try fixing blahp arg escaping for SLRUM. --- src/server.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index bdc6157f..5e7ec364 100644 --- a/src/server.c +++ b/src/server.c @@ -3573,10 +3573,10 @@ char* outputfileRemaps(char *sb,char *sbrmp) #define SINGLE_QUOTE_CHAR '\'' #define DOUBLE_QUOTE_CHAR '\"' -#define CONVARG_OPENING "'\"" -#define CONVARG_OPENING_LEN 2 -#define CONVARG_CLOSING "\"'\000" -#define CONVARG_CLOSING_LEN 3 +#define CONVARG_OPENING "'\\\"" +#define CONVARG_OPENING_LEN 3 +#define CONVARG_CLOSING "\\\"'\000" +#define CONVARG_CLOSING_LEN 4 #define CONVARG_QUOTSEP "\\\"%c\\\"" #define CONVARG_QUOTSEP_LEN 5 #define CONVARG_DBLQUOTESC "\\\\\\\"" From 412b52542ac4bdb2202a519e56b0ba532612c7e4 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 7 Jan 2014 21:31:19 -0600 Subject: [PATCH 059/169] Allow PBS qstat caching to fail and still respond appropriately. 
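The cache is an optimization, so an internal cache failure should be logged and swallowed rather than reported as a job error: a direct qstat can still answer the query. Sketch of the intended flow (hypothetical wrapper; the real logic is in main() below):

    def query_with_cache_fallback(jobid):
        cache_contents = None
        try:
            cache_contents = check_cache(jobid)
        except Exception, e:
            log("1ERROR: Internal exception, %s" % str(e))  # log, keep going
        if cache_contents:
            return cache_contents
        results = qstat(jobid)  # a direct query works without the cache
        if results and jobid in results:
            return job_dict_to_string(results[jobid])
        return None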
--- src/scripts/pbs_status.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index b3b1774b..04e310bb 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -390,12 +390,13 @@ def main(): return 1 jobid = sys.argv[1].split("/")[-1].split(".")[0] log("Checking cache for jobid %s" % jobid) + cache_contents = None try: cache_contents = check_cache(jobid) except Exception, e: msg = "1ERROR: Internal exception, %s" % str(e) log(msg) - print msg + #print msg if not cache_contents: log("Jobid %s not in cache; querying PBS" % jobid) results = qstat(jobid) From 55c6aee2bbdb157b7a74b7c7c5fc8822412dc364 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 7 Jan 2014 22:05:42 -0600 Subject: [PATCH 060/169] Revert "Try fixing blahp arg escaping for SLRUM." This reverts commit 6c08440dcc2a2f61916488e6c5144258c59e07d3. --- src/server.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index 5e7ec364..bdc6157f 100644 --- a/src/server.c +++ b/src/server.c @@ -3573,10 +3573,10 @@ char* outputfileRemaps(char *sb,char *sbrmp) #define SINGLE_QUOTE_CHAR '\'' #define DOUBLE_QUOTE_CHAR '\"' -#define CONVARG_OPENING "'\\\"" -#define CONVARG_OPENING_LEN 3 -#define CONVARG_CLOSING "\\\"'\000" -#define CONVARG_CLOSING_LEN 4 +#define CONVARG_OPENING "'\"" +#define CONVARG_OPENING_LEN 2 +#define CONVARG_CLOSING "\"'\000" +#define CONVARG_CLOSING_LEN 3 #define CONVARG_QUOTSEP "\\\"%c\\\"" #define CONVARG_QUOTSEP_LEN 5 #define CONVARG_DBLQUOTESC "\\\\\\\"" From 9072066c05b3ab14efb3fc72f1be1655347b7837 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Tue, 7 Jan 2014 22:09:56 -0600 Subject: [PATCH 061/169] Revert "Pass arguments and environment to the service scripts so that they" This reverts commit beb3a447860f5b31d710d1e72e884e2cf7be8f4e. --- src/server.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index bdc6157f..c778fbab 100644 --- a/src/server.c +++ b/src/server.c @@ -3573,10 +3573,10 @@ char* outputfileRemaps(char *sb,char *sbrmp) #define SINGLE_QUOTE_CHAR '\'' #define DOUBLE_QUOTE_CHAR '\"' -#define CONVARG_OPENING "'\"" -#define CONVARG_OPENING_LEN 2 -#define CONVARG_CLOSING "\"'\000" -#define CONVARG_CLOSING_LEN 3 +#define CONVARG_OPENING "\"\\\"" +#define CONVARG_OPENING_LEN 3 +#define CONVARG_CLOSING "\\\"\"\000" +#define CONVARG_CLOSING_LEN 4 #define CONVARG_QUOTSEP "\\\"%c\\\"" #define CONVARG_QUOTSEP_LEN 5 #define CONVARG_DBLQUOTESC "\\\\\\\"" From a01cf47dba3bf2d1519e722a48809c847b4c3e3e Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Wed, 8 Jan 2014 20:46:12 -0600 Subject: [PATCH 062/169] Fix proxy renewal in the case where the default proxy-limitation-name is used (and no home directory exists). 
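HTCondor-based submission commonly leaves the limited proxy next to the delegated one as "<proxy>.lmt", so that path is now tried as a fallback when the registry's proxy link cannot be resolved. Sketch of the lookup order (Python for brevity; the helper name is hypothetical, and it assumes the registry-derived candidate is tried first, as in the C code below):

    import os

    def find_old_proxy(registry_candidate, new_proxy_file):
        for candidate in (registry_candidate, new_proxy_file + ".lmt"):
            if candidate and os.access(candidate, os.R_OK):
                return candidate
        return None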
--- src/server.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/server.c b/src/server.c index c778fbab..9926329f 100644 --- a/src/server.c +++ b/src/server.c @@ -1766,7 +1766,7 @@ cmd_status_job_all(void *args) } int -get_status_and_old_proxy(int use_glexec, char *jobDescr, +get_status_and_old_proxy(int use_glexec, char *jobDescr, const char *proxyFileName, char **status_argv, char **old_proxy, char **workernode, char **error_string) { @@ -1858,6 +1858,21 @@ get_status_and_old_proxy(int use_glexec, char *jobDescr, job_registry_free_split_id(spid); return 1; /* 'local' state */ } + // Look for the limited proxy next to the new proxy - this is a common case for HTCondor-based submission. + free(proxy_link); + if ((proxy_link = make_message("%s.lmt", proxyFileName)) == NULL) + { + fprintf(stderr, "Out of memory.\n"); + exit(MALLOC_ERROR); + } + if (access(proxy_link, R_OK) == 0) + { + *old_proxy = proxy_link; + // do not free proxy_link in this case. + free(r_old_proxy); + job_registry_free_split_id(spid); + return 1; + } free(proxy_link); free(r_old_proxy); job_registry_free_split_id(spid); @@ -1978,7 +1993,7 @@ cmd_renew_proxy(void *args) if (blah_children_count>0) check_on_children(blah_children, blah_children_count); - jobStatus=get_status_and_old_proxy(use_mapping, jobDescr, argv + CMD_RENEW_PROXY_ARGS + 1, &old_proxy, &workernode, &error_string); + jobStatus=get_status_and_old_proxy(use_mapping, jobDescr, proxyFileName, argv + CMD_RENEW_PROXY_ARGS + 1, &old_proxy, &workernode, &error_string); old_proxy_len = -1; if (old_proxy != NULL) old_proxy_len = strlen(old_proxy); if ((jobStatus < 0) || (old_proxy == NULL) || (old_proxy_len <= 0)) From 39c0009b0226cff1982313eda6071239c0efa99f Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 9 Jan 2014 08:34:11 -0600 Subject: [PATCH 063/169] Test if tracejob is available before using it. It is not present for the SLURM backend. --- src/scripts/pbs_submit.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 54419fa8..e3ad4cda 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -46,8 +46,14 @@ logpath=${pbs_spoolpath}/server_logs if [ ! -d $logpath -o ! -x $logpath ]; then - pbs_spoolpath=`${pbs_binpath}/tracejob | grep 'default prefix path'|awk -F" " '{ print $5 }'` - logpath=${pbs_spoolpath}/server_logs + if [ -x "${pbs_binpath}/tracejob" ]; then + pbs_spoolpath=`${pbs_binpath}/tracejob | grep 'default prefix path'|awk -F" " '{ print $5 }'` + logpath=${pbs_spoolpath}/server_logs + else + # EPEL defaults for torque + pbs_spoolpath=/var/lib/torque/spool + logpath=/var/lib/torque/server_logs + fi fi bls_job_id_for_renewal=PBS_JOBID From 1accc9e930532c41f730d2f7eab9fea2a34bfbdd Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 5 Sep 2014 14:39:56 -0500 Subject: [PATCH 064/169] Add lsf_status.patch to fix memory allocation errors (SOFTWARE-1589) --- src/scripts/lsf_status.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/lsf_status.sh b/src/scripts/lsf_status.sh index d31db430..f7bd89f7 100755 --- a/src/scripts/lsf_status.sh +++ b/src/scripts/lsf_status.sh @@ -229,7 +229,7 @@ END { touch $datefile;chmod 600 $datefile if [ $? 
-ne 0 ]; then - echo 'Error creating temporary file' + echo '1ERROR: Could not create temporary file' datefile="" echo "1ERROR: Job not found" break From d7184e4b168a6260b55960acc26b5ddf5f58b91e Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 5 Sep 2014 14:44:08 -0500 Subject: [PATCH 065/169] pbs_status.py fails to find qstat in a non-standard location if it isn't run in the directory that has blah_load_config.sh (SOFTWARE-1594) --- src/scripts/pbs_status.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 04e310bb..48811e64 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -249,7 +249,8 @@ def get_qstat_location(): global _qstat_location_cache if _qstat_location_cache != None: return _qstat_location_cache - if os.path.exists("blah_load_config.sh") and os.access("blah_load_config.sh", os.R_OK): + load_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'blah_load_config.sh') + if os.path.exists(load_config_path) and os.access(load_config_path, os.R_OK): cmd = 'source blah_load_config.sh && echo "$pbs_binpath/qstat"' else: cmd = 'which qstat' From 09ca872578f9789fa3b22fbfd31e3a5132e484e8 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 16 Sep 2014 13:15:57 -0500 Subject: [PATCH 066/169] Write datefile to /tmp instead of the current working directory because when it was invoked by HTCondor CE, it tried to write files into /var/log/condor-ce. --- src/scripts/lsf_status.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/lsf_status.sh b/src/scripts/lsf_status.sh index f7bd89f7..7f70a2b9 100755 --- a/src/scripts/lsf_status.sh +++ b/src/scripts/lsf_status.sh @@ -225,7 +225,7 @@ END { if [ "$cliretcode" == "1" -o "x$lsf_BLParser" != "xyes" ] ; then result="" usedBLParser="no" - datefile=blahdate_$RANDOM$RANDOM$RANDOM + datefile=/tmp/blahdate_$RANDOM$RANDOM$RANDOM touch $datefile;chmod 600 $datefile if [ $? -ne 0 ]; then From bd14f3f60f435d0ba2952e0bc2a92b73b804d357 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 26 Sep 2014 17:06:54 -0500 Subject: [PATCH 067/169] Fix pbs_status.py script to actually use the new load_config_path from d7184e4. --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 48811e64..8d4bf9bf 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -251,7 +251,7 @@ def get_qstat_location(): return _qstat_location_cache load_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'blah_load_config.sh') if os.path.exists(load_config_path) and os.access(load_config_path, os.R_OK): - cmd = 'source blah_load_config.sh && echo "$pbs_binpath/qstat"' + cmd = 'source %s && echo "$pbs_binpath/qstat"' % load_config_path else: cmd = 'which qstat' child_stdout = os.popen(cmd) From 96fde44f851816ec3cad94d0e5f4e7c4617afaec Mon Sep 17 00:00:00 2001 From: Francesco Prelz Date: Fri, 13 Mar 2015 16:11:17 +0100 Subject: [PATCH 068/169] Re-added fix for Savannah #27966, that popped out from a 2007 commit. 
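For context: lsf.conf is a plain KEY=VALUE file, so with illustrative values
such as

    LSB_SHAREDIR=/usr/share/lsf/work
    LSF_CONFDIR=/usr/share/lsf/conf

the restored lines below reduce to

    lsf_confdir=/usr/share/lsf/conf
    . /usr/share/lsf/conf/profile.lsf

i.e. the LSF profile is sourced (when present) before lsid is invoked, so the
submit script no longer depends on the caller having a working LSF environment.
The values above are made-up examples; only the parsing pattern comes from the
script.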
--- src/scripts/lsf_submit.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/lsf_submit.sh b/src/scripts/lsf_submit.sh index 09b34d54..b7b34bdb 100755 --- a/src/scripts/lsf_submit.sh +++ b/src/scripts/lsf_submit.sh @@ -51,6 +51,9 @@ conffile=$lsf_confpath/lsf.conf lsf_base_path=`cat $conffile|grep LSB_SHAREDIR| awk -F"=" '{ print $2 }'` +lsf_confdir=`cat $conffile|grep LSF_CONFDIR| awk -F"=" '{ print $2 }'` +[ -f ${lsf_confdir}/profile.lsf ] && . ${lsf_confdir}/profile.lsf + lsf_clustername=`${lsf_binpath}/lsid | grep 'My cluster name is'|awk -F" " '{ print $5 }'` logpath=$lsf_base_path/$lsf_clustername/logdir From 333a62fcfd1207ba2aee8b6c69521150821a1274 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 30 Mar 2015 15:46:15 -0500 Subject: [PATCH 069/169] Fix condor_submit syntax error --- src/scripts/condor_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 334c85fe..c96595bf 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -183,7 +183,7 @@ if [ ${#remap_files[@]} -gt 0 ] ; then if [ ! -z "${remap_files[0]}" ] ; then map=${remap_files[$i]} else - map=${output_files$i]} + map=${output_files[$i]} fi transfer_output_remaps="$transfer_output_remaps;${output_files[$i]}=$map" done From 0a03709d05d6f74e16dcbe1dce82fe8bc318535d Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 21 May 2015 13:53:26 -0500 Subject: [PATCH 070/169] Apply gittrac #5041: Blahp should check for empty job id from qsub https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5041 --- src/scripts/pbs_submit.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index e3ad4cda..67a536a7 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -213,6 +213,12 @@ fi # The job id is actually the first numbers in the string (slurm support) jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` +if [ "X$jobID" == "X" ]; then + rm -f $bls_tmp_file + echo "Error: job id missing" >&2 + echo Error # for the sake of waiting fgets in blahpd + exit 1 +fi if [ "x$pbs_nologaccess" != "xyes" -a "x$pbs_nochecksubmission" != "xyes" ]; then From 2c08bb2030ed096ee782f200b02c07cd7a43c54a Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 18 Sep 2015 11:01:54 -0500 Subject: [PATCH 071/169] Fix for job registry losing track of LSF jobs in its registry (gittrac #5062) --- src/job_registry.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/job_registry.c b/src/job_registry.c index 45515974..74b0deeb 100644 --- a/src/job_registry.c +++ b/src/job_registry.c @@ -29,6 +29,7 @@ * case is guaranteed by the invoking service). * Added job_registry_check_index_key_uniqueness. * 21-Jul-2011 Added job_registry_need_update function. + * 11-Sep-2015 Always return most recent job in job_registry_get_recnum. * * Description: * File-based container to cache job IDs and statuses to implement @@ -2141,7 +2142,8 @@ job_registry_need_update(const job_registry_entry *olde, * Binary search for an entry in the indexed, sorted job registry pointed to by * rha. The record number in the current JR cache is returned. * No file access is required. - * In case multiple entries are found, the lowest recnum is returned. + * In case multiple entries are found, the highest (most recent) recnum + * is returned. * * @param rha Pointer to a job registry handle returned by job_registry_init. 
* @param id Job id key to be looked up @@ -2171,12 +2173,12 @@ job_registry_get_recnum(const job_registry_handle *rha, /* Check for duplicates. */ for (tcur=cur-1; tcur >=0 && strcmp(rha->entries[tcur].id,id)==0; tcur--) { - if (rha->entries[tcur].recnum < found) found = rha->entries[tcur].recnum; + if (rha->entries[tcur].recnum > found) found = rha->entries[tcur].recnum; } for (tcur=cur+1;tcur < rha->n_entries && strcmp(rha->entries[tcur].id,id)==0; tcur++) { - if (rha->entries[tcur].recnum < found) found = rha->entries[tcur].recnum; + if (rha->entries[tcur].recnum > found) found = rha->entries[tcur].recnum; } break; } From 250e256e27f8c358ad74de9e0e9fec8457b22758 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 18 Sep 2015 13:21:41 -0500 Subject: [PATCH 072/169] Reduce the number of threads to 50 (SOFTWARE-1980) --- config/blah.config.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 5fc570d1..05102913 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -21,7 +21,7 @@ blah_disable_wn_proxy_renewal= blah_delegate_renewed_proxies= #max number of concurrent threads to serve commands (default = 500) -#blah_max_threaded_cmds=100 +blah_max_threaded_cmds=50 #Colon-separated list of paths that are shared among batch system #head and worker nodes. From 5c82acf54902ee05c05e423ca9e760f5ed4c6788 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 21 Sep 2015 15:20:42 -0500 Subject: [PATCH 073/169] Add support for PBS Pro (SOFTWARE-1958) --- src/scripts/pbs_status.py | 19 ++++++++++++++----- src/scripts/pbs_submit.sh | 9 +++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 8d4bf9bf..c595c024 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -37,6 +37,7 @@ import fcntl import random import struct +import subprocess import signal import tempfile @@ -223,9 +224,16 @@ def qstat(jobid=""): Returns a python dictionary with the job info. """ qstat = get_qstat_location() + command = (qstat, '--version') + qstat_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + qstat_version, _ = qstat_process.communicate() + starttime = time.time() log("Starting qstat.") - child_stdout = os.popen("%s -f -1 %s" % (qstat, jobid)) + if re.search(r'PBSPro', qstat_version) + child_stdout = os.popen("%s -f %s" % (qstat, jobid)) # -1 conflicts with -f in PBS Pro + else: + child_stdout = os.popen("%s -f -1 %s" % (qstat, jobid)) result = parse_qstat_fd(child_stdout) exit_status = child_stdout.close() log("Finished qstat (time=%f)." 
% (time.time()-starttime)) @@ -233,7 +241,7 @@ def qstat(jobid=""): exit_code = 0 if os.WIFEXITED(exit_status): exit_code = os.WEXITSTATUS(exit_status) - if exit_code == 153: # Completed + if exit_code == 153 or exit_code == 35: # Completed result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} elif exit_code == 271: # Removed result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} @@ -264,9 +272,10 @@ def get_qstat_location(): job_id_re = re.compile("\s*Job Id:\s([0-9]+)([\w\-\/.]*)") exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") -status_re = re.compile("\s*job_state = ([QRECH])") -exit_status_re = re.compile("\s*exit_status = (-?[0-9]+)") -status_mapping = {"Q": 1, "R": 2, "E": 2, "C": 4, "H": 5} +status_re = re.compile("\s*job_state = ([QREFCH])") +exit_status_re = re.compile("\s*[Ee]xit_status = (-?[0-9]+)") +status_mapping = {"Q": 1, "R": 2, "E": 2, "F": 4, "C": 4, "H": 5} + def parse_qstat_fd(fd): """ Parse the stdout fd of "qstat -f" into a python dictionary containing diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 67a536a7..f9d289ce 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -44,6 +44,9 @@ . `dirname $0`/blah_common_submit_functions.sh +qstat --version 2>&1 | grep PBSPro > /dev/null 2>&1 +is_pbs_pro=$? + logpath=${pbs_spoolpath}/server_logs if [ ! -d $logpath -o ! -x $logpath ]; then if [ -x "${pbs_binpath}/tracejob" ]; then @@ -117,7 +120,7 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh -if [ "x$bls_opt_req_mem" != "x" ] +if [ "x$bls_opt_req_mem" != "x" ] && [ "$is_pbs_pro" -neq "0" ] then # Different schedulers require different memory checks echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file @@ -132,7 +135,7 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "x$bls_opt_wholenodes" == "xyes" ] ; then +if [ "x$bls_opt_wholenodes" == "xyes" ] && [ "$is_pbs_pro" -neq "0" ]; then bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} if [[ ! -z "$bls_opt_smpgranularity" ]] ; then if [[ -z "$bls_opt_hostnumber" ]] ; then @@ -172,8 +175,6 @@ else fi # --- End of MPI directives - - # Input and output sandbox setup. if [ "x$blah_torque_multiple_staging_directive_bug" == "xyes" ]; then bls_fl_subst_and_accumulate inputsand "stagein=@@F_REMOTE@`hostname -f`:@@F_LOCAL" "," From e15e8658a71174fe824dd90738fb7ccb15ba80dd Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 23 Sep 2015 14:34:07 -0500 Subject: [PATCH 074/169] Add config option 'blah_disable_limited_proxy' to disable the limited proxy, which can cause issues with HTCondor-CE --- config/blah.config.template | 3 +++ src/server.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/config/blah.config.template b/config/blah.config.template index 05102913..552906e4 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -20,6 +20,9 @@ blah_disable_wn_proxy_renewal= #be enabled only if non-limited proxies are used for proxy renewal. (default = no) blah_delegate_renewed_proxies= +#Set to yes to disable creation of a limited proxy. 
(default = no) +blah_disable_limited_proxy= + #max number of concurrent threads to serve commands (default = 500) blah_max_threaded_cmds=50 diff --git a/src/server.c b/src/server.c index 9926329f..ee567b94 100644 --- a/src/server.c +++ b/src/server.c @@ -188,6 +188,7 @@ int enable_condor_glexec = FALSE; int require_proxy_on_submit = FALSE; int disable_wn_proxy_renewal = FALSE; int disable_proxy_user_copy = FALSE; +int disable_limited_proxy = FALSE; int synchronous_termination = FALSE; static char *mapping_parameter[MEXEC_PARAM_COUNT]; @@ -404,6 +405,7 @@ serveConnection(int cli_socket, char* cli_ip_addr) enable_condor_glexec = config_test_boolean(config_get("blah_enable_glexec_from_condor",blah_config_handle)); disable_wn_proxy_renewal = config_test_boolean(config_get("blah_disable_wn_proxy_renewal",blah_config_handle)); disable_proxy_user_copy = config_test_boolean(config_get("blah_disable_proxy_user_copy",blah_config_handle)); + disable_limited_proxy = config_test_boolean(config_get("blah_disable_limited_proxy",blah_config_handle)); /* Scan configuration for submit attributes to pass to local script */ pass_all_submit_attributes = config_test_boolean(config_get("blah_pass_all_submit_attributes",blah_config_handle)); @@ -1153,7 +1155,7 @@ cmd_submit_job(void *args) } } } - else if (proxyname != NULL) + else if (proxyname != NULL) && (disable_limited_proxy) { /* not in glexec mode: need to limit the proxy */ char *errmsg; @@ -2009,9 +2011,9 @@ cmd_renew_proxy(void *args) switch(jobStatus) { case 1: /* job queued: copy the proxy locally */ - if (!use_mapping) + if (!use_mapping) && (!disable_limited_proxy) { - limit_proxy(proxyFileName, old_proxy, NULL); /*FIXME: should check if limited proxies are enabled? */ + limit_proxy(proxyFileName, old_proxy, NULL); resultLine = make_message("%s 0 Proxy\\ renewed", reqId); } else @@ -2136,7 +2138,7 @@ cmd_send_proxy_to_worker_node(void *args) if (workernode != NULL && strcmp(workernode, "")) { - if(!use_glexec) + if(!use_glexec) && (!disable_limited_proxy) { proxyFileNameNew = limit_proxy(proxyFileName, NULL, NULL); } From 17f1d5fae6fc100652811c456f648b745f9d45bc Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 22 Sep 2015 16:52:41 -0500 Subject: [PATCH 075/169] Properly request memory in PBS Pro --- src/scripts/pbs_submit.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index f9d289ce..231ff982 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -120,12 +120,18 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh -if [ "x$bls_opt_req_mem" != "x" ] && [ "$is_pbs_pro" -neq "0" ] +if [ "x$bls_opt_req_mem" != "x" ] then # Different schedulers require different memory checks - echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + if [ "$is_pbs_pro" -eq "0" ] + then + # PBS Pro requires mem to be requested within a select statement + echo "#PBS -l select=1:mem=${bls_opt_req_mem}mb" >> $bls_tmp_file + else + echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi fi bls_set_up_local_and_extra_args From d61599f7eff07642cf43306f2eea49da1da76ac8 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 28 Sep 2015 13:23:06 -0500 Subject: [PATCH 076/169] Fix missing not modifier for limiting proxies --- src/server.c | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.c b/src/server.c index ee567b94..28276d82 100644 --- a/src/server.c +++ b/src/server.c @@ -1155,7 +1155,7 @@ cmd_submit_job(void *args) } } } - else if (proxyname != NULL) && (disable_limited_proxy) + else if (proxyname != NULL) && (!disable_limited_proxy) { /* not in glexec mode: need to limit the proxy */ char *errmsg; From 598ce4b4f0fa4e0a4d08757524a5960340de7e2a Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 29 Sep 2015 12:37:59 -0500 Subject: [PATCH 077/169] Fix syntax error --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index c595c024..d1cfa687 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -230,7 +230,7 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") - if re.search(r'PBSPro', qstat_version) + if re.search(r'PBSPro', qstat_version): child_stdout = os.popen("%s -f %s" % (qstat, jobid)) # -1 conflicts with -f in PBS Pro else: child_stdout = os.popen("%s -f -1 %s" % (qstat, jobid)) From 84d6874e6734ceb90a10d8147af0cd822a8a0e5c Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 29 Sep 2015 11:49:19 -0500 Subject: [PATCH 078/169] Fix boolean syntax errors --- src/server.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/server.c b/src/server.c index 28276d82..fecc2af0 100644 --- a/src/server.c +++ b/src/server.c @@ -1155,7 +1155,7 @@ cmd_submit_job(void *args) } } } - else if (proxyname != NULL) && (!disable_limited_proxy) + else if ((proxyname) != NULL && (!disable_limited_proxy)) { /* not in glexec mode: need to limit the proxy */ char *errmsg; @@ -2011,8 +2011,8 @@ cmd_renew_proxy(void *args) switch(jobStatus) { case 1: /* job queued: copy the proxy locally */ - if (!use_mapping) && (!disable_limited_proxy) - { + if ((!use_mapping) && (!disable_limited_proxy) + ){ limit_proxy(proxyFileName, old_proxy, NULL); resultLine = make_message("%s 0 Proxy\\ renewed", reqId); } @@ -2138,7 +2138,7 @@ cmd_send_proxy_to_worker_node(void *args) if (workernode != NULL && strcmp(workernode, "")) { - if(!use_glexec) && (!disable_limited_proxy) + if((!use_glexec) && (!disable_limited_proxy)) { proxyFileNameNew = limit_proxy(proxyFileName, NULL, NULL); } From 9cf91da8a9856d0c485fe33ccdf90e29eca25281 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 21 Oct 2015 14:53:49 -0500 Subject: [PATCH 079/169] Adding error reporting to the pbs_submit --- src/scripts/pbs_submit.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 231ff982..f62f1a6d 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -215,6 +215,8 @@ jobID=`${pbs_binpath}/qsub $bls_tmp_file` # actual submission retcode=$? 
if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file + # Echo the output from qsub onto stderr, which is captured by HTCondor + echo "Error from qsub: $jobID" >&2 exit 1 fi @@ -223,6 +225,7 @@ jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}' if [ "X$jobID" == "X" ]; then rm -f $bls_tmp_file echo "Error: job id missing" >&2 + echo "Error from qsub: $jobID" >&2 echo Error # for the sake of waiting fgets in blahpd exit 1 fi From cd1c706ca6982bee2e00403bff9dde7182048aa4 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 21 Oct 2015 14:55:03 -0500 Subject: [PATCH 080/169] Revert "Adding error reporting to the pbs_submit" Accidental push without pull request. This reverts commit 9cf91da8a9856d0c485fe33ccdf90e29eca25281. --- src/scripts/pbs_submit.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index f62f1a6d..231ff982 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -215,8 +215,6 @@ jobID=`${pbs_binpath}/qsub $bls_tmp_file` # actual submission retcode=$? if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file - # Echo the output from qsub onto stderr, which is captured by HTCondor - echo "Error from qsub: $jobID" >&2 exit 1 fi @@ -225,7 +223,6 @@ jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}' if [ "X$jobID" == "X" ]; then rm -f $bls_tmp_file echo "Error: job id missing" >&2 - echo "Error from qsub: $jobID" >&2 echo Error # for the sake of waiting fgets in blahpd exit 1 fi From a88c851115e5003542838241355a44052636ac53 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 21 Oct 2015 14:53:49 -0500 Subject: [PATCH 081/169] Adding error reporting to the pbs_submit --- src/scripts/pbs_submit.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 231ff982..f62f1a6d 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -215,6 +215,8 @@ jobID=`${pbs_binpath}/qsub $bls_tmp_file` # actual submission retcode=$? if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file + # Echo the output from qsub onto stderr, which is captured by HTCondor + echo "Error from qsub: $jobID" >&2 exit 1 fi @@ -223,6 +225,7 @@ jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}' if [ "X$jobID" == "X" ]; then rm -f $bls_tmp_file echo "Error: job id missing" >&2 + echo "Error from qsub: $jobID" >&2 echo Error # for the sake of waiting fgets in blahpd exit 1 fi From d862ec717049a1c03f70f23629cc112ae33fc4ad Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Thu, 22 Oct 2015 13:45:47 -0500 Subject: [PATCH 082/169] Fixing indention and removing extra line --- src/scripts/pbs_submit.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index f62f1a6d..2edff175 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -215,8 +215,8 @@ jobID=`${pbs_binpath}/qsub $bls_tmp_file` # actual submission retcode=$? 
if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file - # Echo the output from qsub onto stderr, which is captured by HTCondor - echo "Error from qsub: $jobID" >&2 + # Echo the output from qsub onto stderr, which is captured by HTCondor + echo "Error from qsub: $jobID" >&2 exit 1 fi @@ -224,8 +224,7 @@ fi jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` if [ "X$jobID" == "X" ]; then rm -f $bls_tmp_file - echo "Error: job id missing" >&2 - echo "Error from qsub: $jobID" >&2 + echo "Error from qsub: $jobID" >&2 echo Error # for the sake of waiting fgets in blahpd exit 1 fi From 22731b4f2185c9a598fa5c5595b266c17d409452 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 30 Nov 2015 15:57:35 -0600 Subject: [PATCH 083/169] Resync the job registry to solve CE jobs incorrectly being marked as completed --- src/job_registry.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/job_registry.c b/src/job_registry.c index 74b0deeb..a5aeff0b 100644 --- a/src/job_registry.c +++ b/src/job_registry.c @@ -2313,6 +2313,33 @@ job_registry_get(job_registry_handle *rha, } firstrec = job_registry_firstrec(rha,fd); + + /* Determine if the job registry index must be resync'd. + * The record numbers are monotonically increasing through the lifetime + * of the registry; the firstrec we read from the data file above must + * match the firstrec in our in-memory index. The firstrec on the index + * is guaranteed to change if a purge operation occurred. + */ + if (firstrec != rha->firstrec) + { + int retval = job_registry_resync(rha, fd); + if (retval < 0) // Registry failed to update. + { + fclose(fd); + return NULL; + } + if (retval > 0) // Registry has been updated; our lookup was invalid. + { + found = job_registry_lookup(rha, id); + if (found == 0) + { + errno = ENOENT; + fclose(fd); + return NULL; + } + } + } + /* Was this record just purged ? 
 */
 	if ((firstrec > rha->firstrec) && (found >= rha->firstrec) && (found < firstrec))
 	{

From 3c7fe9b56169d00918d811dcde8aaad59bbf63e3 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Fri, 4 Dec 2015 13:26:33 -0600
Subject: [PATCH 084/169] pbs_status.py fails when /tmp and /var/tmp are on different filesystems (SOFTWARE-2092)

---
 src/scripts/pbs_status.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index d1cfa687..4ffd2232 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -322,16 +322,18 @@ def fill_cache(cache_location):
     log("Starting query to fill cache.")
     results = qstat()
     log("Finished query to fill cache.")
-    (fd, filename) = tempfile.mkstemp()
+    (fd, filename) = tempfile.mkstemp(dir = "/var/tmp")
     try:
-        for key, val in results.items():
-            key = key.split(".")[0]
-            os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val)))
-        os.fsync(fd)
+        try:
+            for key, val in results.items():
+                key = key.split(".")[0]
+                os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val)))
+            os.fsync(fd)
+        except:
+            os.unlink(filename)
+            raise
+    finally:
         os.close(fd)
-    except:
-        os.unlink(filename)
-        raise
     os.rename(filename, cache_location)
     global launchtime
     launchtime = time.time()

From 3568e2087b6fe42e2b7c7c6988a97c33ab3bb180 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Tue, 15 Dec 2015 09:49:12 -0600
Subject: [PATCH 085/169] Disable limited proxies when using glexec

---
 src/server.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/server.c b/src/server.c
index fecc2af0..0b78df31 100644
--- a/src/server.c
+++ b/src/server.c
@@ -966,11 +966,12 @@ cmd_set_glexec_dn(void *args)
 		/* proxt4 must be limited for subsequent submission */
 		if(argv[3][0]=='0')
 		{
-			if((proxynameNew = limit_proxy(proxt4, NULL, NULL)) == NULL)
+			if (((proxynameNew = limit_proxy(proxt4, NULL, NULL)) == NULL) ||
+			    (disable_limited_proxy))
 			{
 				free(mapping_parameter[MEXEC_PARAM_DELEGCRED]);
 				mapping_parameter[MEXEC_PARAM_DELEGCRED] = NULL;
-				result = strdup("F Cannot\\ limit\\ proxy\\ file");
+				result = strdup("F Not\\ limiting\\ proxy\\ file");
 			}
 			else mapping_parameter[MEXEC_PARAM_SRCPROXY] = proxynameNew;
 
@@ -2020,7 +2021,7 @@ cmd_renew_proxy(void *args)
 			{
 				exe_command.delegation_type = atoi(argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGTYPE]);
 				exe_command.delegation_cred = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGCRED];
-				if (use_glexec)
+				if ((use_glexec) && (disable_limited_proxy))
 				{
 					exe_command.source_proxy = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_SRCPROXY];
 				} else {

From 9f6a85458aafca2e5abb6161e393184a452b99b7 Mon Sep 17 00:00:00 2001
From: Derek Weitzel
Date: Thu, 4 Feb 2016 09:53:30 -0600
Subject: [PATCH 086/169] Gather completion statistics from Slurm jobs.

The ExitCode, CPU user time, and maximum memory usage are gathered for
completed Slurm jobs. They are propagated through the blahp and GAHP to
be inserted into the job's final ClassAd.
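The numbers come from sacct in parseable mode (-P, pipe-delimited); the
relevant columns are summed over the rows it prints. A sketch of the
interaction, with invented values:

    $ sacct -j 12345 -l --noconvert -P
    AveCPU|AllocCPUS|MaxRSS|ExitCode|...
    00:02:30|4|204800K|0:0|...

    # RemoteUserCpu = AveCPU in seconds * AllocCPUS = 150 * 4 = 600
    # ImageSize     = MaxRSS with the trailing 'K' stripped  = 204800
    # ExitCode      = the part of ExitCode before ':'        = 0

Real sacct output carries many more columns; only the ones read by the
parser are shown here.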
--- src/scripts/pbs_status.py | 71 ++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 4ffd2232..e7d0d1a3 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -40,6 +40,8 @@ import subprocess import signal import tempfile +import pickle +import csv cache_timeout = 60 @@ -247,8 +249,51 @@ def qstat(jobid=""): result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} else: raise Exception("qstat failed with exit code %s" % str(exit_status)) + + # If the job has completed... + if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'): + # Get the finished job stats and update the result + finished_job_stats = get_finished_job_stats(jobid) + result[jobid].update(finished_job_stats) + return result + +def convert_cpu_to_seconds(cpu_string): + import re + h,m,s = re.split(':',cpu_string) + return int(h) * 3600 + int(m) * 60 + int(s) + +def get_finished_job_stats(jobid): + """ + Get a completed job's statistics such as used RAM and cpu usage. + """ + + # List the attributes that we want + return_dict = { "ImageSize": 0, "ExitCode": 0, "RemoteUserCpu": 0 } + # First, determine if this is a pbs or slurm machine. + + + # Next, query the appropriate interfaces for the completed job information + # TODO: fix for pbs + log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + child_stdout = os.popen("sacct -j %s -l --noconvert -P" % (str(jobid))) + + reader = csv.DictReader(child_stdout, delimiter="|") + # Slurm can return more than 1 row, for some odd reason. + # so sum up relevant values + for row in reader: + if row["AveCPU"] is not "": + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + if row["MaxRSS"] is not "": + # Remove the trailing 'K' + return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + if row["ExitCode"] is not "": + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + + return return_dict + + _qstat_location_cache = None def get_qstat_location(): """ @@ -323,11 +368,14 @@ def fill_cache(cache_location): results = qstat() log("Finished query to fill cache.") (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + # Open the file with a proper python file object + f = os.fdopen(fd, "w") + writer = csv.writer(f, delimiter='\t') try: try: for key, val in results.items(): key = key.split(".")[0] - os.write(fd, "%s: %s\n" % (key, job_dict_to_string(val))) + writer.writerow([key, pickle.dumps(val)]) os.fsync(fd) except: os.unlink(filename) @@ -340,11 +388,10 @@ def fill_cache(cache_location): cache_line_re = re.compile("([0-9]+[\.\w\-]+):\s+(.+)") def cache_to_status(jobid, fd): - for line in fd.readlines(): - line = line.strip() - m = cache_line_re.match(line) - if m and m.group(1) == jobid: - return m.group(2) + reader = csv.reader(fd, delimiter='\t') + for row in reader: + if row[0] == jobid: + return pickle.loads(row[1]) def check_cache(jobid, recurse=True): uid = os.geteuid() @@ -394,6 +441,8 @@ def check_cache(jobid, recurse=True): return None return cache_to_status(jobid, fd) +job_status_re = re.compile(".*JobStatus=(\d+);.*") + def main(): initLog() @@ -421,8 +470,13 @@ def main(): print "0%s" % job_dict_to_string(results[jobid]) else: log("Jobid %s in cache." 
% jobid) - log("0%s" % cache_contents) - print "0%s" % cache_contents + log("0%s" % job_dict_to_string(cache_contents)) + + if cache_contents["JobStatus"] == '4' or cache_contents["JobStatus"] == '3': + finished_job_stats = get_finished_job_stats(jobid) + cache_contents.update(finished_job_stats) + + print "0%s" % job_dict_to_string(cache_contents) return 0 if __name__ == "__main__": @@ -433,4 +487,3 @@ def main(): except Exception, e: print "1ERROR: %s" % str(e).replace("\n", "\\n") sys.exit(0) - From 38383d38dcc00a02e120846ebefefa03f501054f Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Fri, 5 Feb 2016 07:34:08 -0600 Subject: [PATCH 087/169] First iteration of detecting actual cluster type. A new file, cluster_type, is created in the cache directory which contains the type of underlying cluster, right now either slurm or pbs. This new file is only written when the cache is being written. --- src/scripts/pbs_status.py | 95 ++++++++++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 16 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index e7d0d1a3..c3f4fd50 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -259,11 +259,35 @@ def qstat(jobid=""): return result +def which(program): + """ + Determine if the program is in the path. + + arg program: name of the program to search + returns: full path to executable, or None if executable is not found + """ + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + def convert_cpu_to_seconds(cpu_string): import re h,m,s = re.split(':',cpu_string) return int(h) * 3600 + int(m) * 60 + int(s) +_cluster_type_cache = None def get_finished_job_stats(jobid): """ Get a completed job's statistics such as used RAM and cpu usage. @@ -272,24 +296,47 @@ def get_finished_job_stats(jobid): # List the attributes that we want return_dict = { "ImageSize": 0, "ExitCode": 0, "RemoteUserCpu": 0 } # First, determine if this is a pbs or slurm machine. + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + cluster_type_file = os.path.join(cache_dir, "cluster_type") + global _cluster_type_cache + if not _cluster_type_cache: + # Look for the special file, cluster_type + if os.path.exists(cluster_type_file): + _cluster_type_cache = open(cluster_type_file).read() + else: + # No idea what type of cluster is running, not set, so give up + log("cluster_type file is not present, not checking for completed job statistics") + return return_dict + # Slurm completion + if _cluster_type_cache == "slurm": - # Next, query the appropriate interfaces for the completed job information - # TODO: fix for pbs - log("Querying sacct for completed job for jobid: %s" % (str(jobid))) - child_stdout = os.popen("sacct -j %s -l --noconvert -P" % (str(jobid))) - - reader = csv.DictReader(child_stdout, delimiter="|") - # Slurm can return more than 1 row, for some odd reason. 
- # so sum up relevant values - for row in reader: - if row["AveCPU"] is not "": - return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) - if row["MaxRSS"] is not "": - # Remove the trailing 'K' - return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) - if row["ExitCode"] is not "": - return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + # Next, query the appropriate interfaces for the completed job information + # TODO: fix for pbs + log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + child_stdout = os.popen("sacct -j %s -l --noconvert -P" % (str(jobid))) + + try: + reader = csv.DictReader(child_stdout, delimiter="|") + except Exception, e: + log("Unable to read in CSV output from sacct: %s" str(e)) + + # Slurm can return more than 1 row, for some odd reason. + # so sum up relevant values + for row in reader: + if row["AveCPU"] is not "": + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + if row["MaxRSS"] is not "": + # Remove the trailing 'K' + return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + if row["ExitCode"] is not "": + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + + # PBS completion + elif _cluster_type_cache == "pbs": + pass return return_dict @@ -383,6 +430,22 @@ def fill_cache(cache_location): finally: os.close(fd) os.rename(filename, cache_location) + + # Create the cluster_type file + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + cluster_type_file = os.path.join(cache_dir, "cluster_type") + (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + global _cluster_type_cache + if which("sacct"): + os.write(fd, "slurm") + _cluster_type_cache = "slurm" + else: + log("Unable to find cluster type") + os.close(fd) + os.rename(filename, cluster_type_file) + global launchtime launchtime = time.time() From 46a0184c71d6ab07dbb486a16deaee8a5208277e Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Mon, 8 Feb 2016 13:18:58 -0600 Subject: [PATCH 088/169] Fix bug when sacct doesn't reply. When sacct doesn't reply, either because the DB is busy or some other bottleneck, the pbs_status.py finish function should gracefully return nothing. --- src/scripts/pbs_status.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index c3f4fd50..22047a28 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -321,7 +321,8 @@ def get_finished_job_stats(jobid): try: reader = csv.DictReader(child_stdout, delimiter="|") except Exception, e: - log("Unable to read in CSV output from sacct: %s" str(e)) + log("Unable to read in CSV output from sacct: %s" % str(e)) + return return_dict # Slurm can return more than 1 row, for some odd reason. 
# so sum up relevant values From b9709751d6ada22dae17afed1789f44dbda8f41b Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 10 Feb 2016 14:56:33 -0600 Subject: [PATCH 089/169] Handle LSF suspended states (SOFTWARE-2168) --- src/scripts/lsf_status.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/lsf_status.sh b/src/scripts/lsf_status.sh index 7f70a2b9..dada5844 100755 --- a/src/scripts/lsf_status.sh +++ b/src/scripts/lsf_status.sh @@ -319,15 +319,15 @@ $0 ~ rex_finished { } $0 ~ rex_uhold { - jobstatus = 5 + jobstatus = 7 } $0 ~ rex_phold { - jobstatus = 5 + jobstatus = 1 } $0 ~ rex_shold { - jobstatus = 5 + jobstatus = 7 } END { From 71cbdb188ab782cf0bf6420f223101f18dbfa1d1 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 10 Feb 2016 14:59:05 -0600 Subject: [PATCH 090/169] Re-apply source path fixes that were overwritten in 708ae5d (SOFTWARE-2199) --- src/scripts/sge_cancel.sh | 2 +- src/scripts/sge_hold.sh | 2 +- src/scripts/sge_resume.sh | 2 +- src/scripts/sge_status.sh | 5 ++--- src/scripts/sge_submit.sh | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/scripts/sge_cancel.sh b/src/scripts/sge_cancel.sh index c631f6c1..821d4b61 100755 --- a/src/scripts/sge_cancel.sh +++ b/src/scripts/sge_cancel.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. `dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_hold.sh b/src/scripts/sge_hold.sh index 677e11fb..67ee17df 100755 --- a/src/scripts/sge_hold.sh +++ b/src/scripts/sge_hold.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. `dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_resume.sh b/src/scripts/sge_resume.sh index cfcda85c..525dab3c 100755 --- a/src/scripts/sge_resume.sh +++ b/src/scripts/sge_resume.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. `dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_status.sh b/src/scripts/sge_status.sh index edeb05f5..3633dc49 100755 --- a/src/scripts/sge_status.sh +++ b/src/scripts/sge_status.sh @@ -20,10 +20,9 @@ # -#[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config . `dirname $0`/blah_load_config.sh -sge_helper_path=${GLITE_LOCATION:-/opt/glite}/bin +sge_helper_path=${blah_libexec_directory} usage_string="Usage: $0 [-w] [-n]" @@ -68,7 +67,7 @@ tmpid=`echo "$@"|sed 's/.*\/.*\///g'` jobid=${tmpid}.${sge_cellname:-default} -blahp_status=`exec ${sge_helper_path:-/opt/glite/bin}/sge_helper --status $getwn $jobid` +blahp_status=`exec ${sge_helper_path}/sge_helper --status $getwn $jobid` retcode=$? 
# Now see if we need to run qstat 'manually' diff --git a/src/scripts/sge_submit.sh b/src/scripts/sge_submit.sh index 2d1b0fd1..941e8fb8 100755 --- a/src/scripts/sge_submit.sh +++ b/src/scripts/sge_submit.sh @@ -64,7 +64,7 @@ cat > $bls_tmp_file << end_of_preamble end_of_preamble #local batch system-specific file output must be added to the submit file -local_submit_attributes_file=${GLITE_LOCATION:-/opt/glite}/bin/sge_local_submit_attributes.sh +local_submit_attributes_file=${blah_libexec_directory}/sge_local_submit_attributes.sh if [ -r $local_submit_attributes_file ] ; then echo \#\!/bin/sh > $bls_opt_tmp_req_file if [ ! -z $bls_opt_req_file ] ; then From fbbbdc1671c2fffd930846b37c98584ed044b379 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 18 Feb 2016 15:49:37 -0600 Subject: [PATCH 091/169] Add ability to wrap the payload job. Provide hook for wrapping the payload job with an admin-controlled script on the worker node. This will help us run in places like NERSC where we need to run a separate script to launch parallel versions of the payload. --- src/scripts/blah_common_submit_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 9c1e2a5c..f60f2320 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -664,7 +664,7 @@ function bls_start_job_wrapper () echo "\$new_home/`basename $bls_opt_the_command` $bls_arguments &" echo "fi" else - echo "$bls_opt_the_command $bls_arguments &" + echo "$blah_job_wrapper $bls_opt_the_command $bls_arguments &" fi echo "job_pid=\$!" From e7f5c619d0663ffa0207f9afcf8e4ef0e8679c91 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 18 Feb 2016 16:02:15 -0600 Subject: [PATCH 092/169] Add env var for number of nodes Adds an environment variable for number of nodes; allows more flexibility in the blah_job_wrapper. 
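blah_job_wrapper is expected to come from blah.config like the other blah_*
settings; when it is left unset the generated command line is unchanged. A
hypothetical configuration:

    # blah.config (example value, not shipped with the blahp)
    blah_job_wrapper=/usr/local/bin/site-launcher

makes the generated batch script start the payload as

    /usr/local/bin/site-launcher /path/of/the/job arguments... &

leaving the site-provided script in full control of how the user job is
executed.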
--- src/scripts/blah_common_submit_functions.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index f60f2320..c36f0b94 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -664,6 +664,7 @@ function bls_start_job_wrapper () echo "\$new_home/`basename $bls_opt_the_command` $bls_arguments &" echo "fi" else + echo "\$NODE_COUNT=$$bls_opt_mpinodes" echo "$blah_job_wrapper $bls_opt_the_command $bls_arguments &" fi From c49b3e1186d434945b05258089a6f53947946b54 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Thu, 18 Feb 2016 16:02:39 -0600 Subject: [PATCH 093/169] Fix typo --- src/scripts/blah_common_submit_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index c36f0b94..399ba81a 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -664,7 +664,7 @@ function bls_start_job_wrapper () echo "\$new_home/`basename $bls_opt_the_command` $bls_arguments &" echo "fi" else - echo "\$NODE_COUNT=$$bls_opt_mpinodes" + echo "\$NODE_COUNT=$bls_opt_mpinodes" echo "$blah_job_wrapper $bls_opt_the_command $bls_arguments &" fi From c3627533fec560eca927a0ae82b4be73ca60057a Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 3 Mar 2016 16:29:49 -0600 Subject: [PATCH 094/169] Use the pbs_binpath when checking for PBS Pro --- src/scripts/pbs_submit.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 2edff175..ff8b090a 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -44,9 +44,6 @@ . `dirname $0`/blah_common_submit_functions.sh -qstat --version 2>&1 | grep PBSPro > /dev/null 2>&1 -is_pbs_pro=$? - logpath=${pbs_spoolpath}/server_logs if [ ! -d $logpath -o ! -x $logpath ]; then if [ -x "${pbs_binpath}/tracejob" ]; then @@ -120,6 +117,8 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh +${pbs_binpath}/qstat --version | grep PBSPro +is_pbs_pro=$? if [ "x$bls_opt_req_mem" != "x" ] then # Different schedulers require different memory checks From ae02a7e3b0950efaa833005a6cc6caa446417aed Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 7 Mar 2016 13:48:59 -0600 Subject: [PATCH 095/169] Disable MPI completely for PBS Pro --- src/scripts/pbs_submit.sh | 72 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index ff8b090a..5d4d7829 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -140,43 +140,45 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "x$bls_opt_wholenodes" == "xyes" ] && [ "$is_pbs_pro" -neq "0" ]; then - bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} - if [[ ! -z "$bls_opt_smpgranularity" ]] ; then - if [[ -z "$bls_opt_hostnumber" ]] ; then - echo "#PBS -l nodes=1:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file +if [ "$is_pbs_pro" -neq "0" ]; then + if [ "x$bls_opt_wholenodes" == "xyes" ]; then + bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} + if [[ ! 
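Once the node count is visible in the job environment, a wrapper can size its
launch step with no extra plumbing. A minimal hypothetical launcher (only a
sketch; nothing in this commit prescribes a particular one):

    #!/bin/sh
    # Site launcher: run the payload across the node count requested
    # through the blahp; NODE_COUNT is set from bls_opt_mpinodes.
    exec mpirun -np "${NODE_COUNT:-1}" "$@"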
-z "$bls_opt_smpgranularity" ]] ; then + if [[ -z "$bls_opt_hostnumber" ]] ; then + echo "#PBS -l nodes=1:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + else + echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + fi + echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file + else + if [[ ! -z "$bls_opt_hostnumber" ]] ; then + if [[ $bls_opt_mpinodes -gt 0 ]] ; then + r=$((bls_opt_mpinodes % bls_opt_hostnumber)) + (( r )) && mpireminder="+$r:ppn=$bls_opt_hostsmpsize" + echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=${bls_opt_hostsmpsize}${mpireminder}" >> $bls_tmp_file + else + echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + fi + echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file + fi + fi else - echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + if [[ ! -z "$bls_opt_smpgranularity" ]] ; then + n=$((bls_opt_mpinodes / bls_opt_smpgranularity)) + r=$((bls_opt_mpinodes % bls_opt_smpgranularity)) + (( r )) && mpireminder="+1:ppn=$r" + echo "#PBS -l nodes=$n:ppn=${bls_opt_smpgranularity}${mpireminder}" >> $bls_tmp_file + else + if [[ ! -z "$bls_opt_hostnumber" ]] ; then + n=$((bls_opt_mpinodes / bls_opt_hostnumber)) + r=$((bls_opt_mpinodes % bls_opt_hostnumber)) + (( r )) && mpireminder="+$r:ppn=$((n+1))" + echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=$n$mpireminder" >> $bls_tmp_file + elif [[ $bls_opt_mpinodes -gt 0 ]] ; then + echo "#PBS -l nodes=$bls_opt_mpinodes" >> $bls_tmp_file + fi + fi fi - echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file - else - if [[ ! -z "$bls_opt_hostnumber" ]] ; then - if [[ $bls_opt_mpinodes -gt 0 ]] ; then - r=$((bls_opt_mpinodes % bls_opt_hostnumber)) - (( r )) && mpireminder="+$r:ppn=$bls_opt_hostsmpsize" - echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=${bls_opt_hostsmpsize}${mpireminder}" >> $bls_tmp_file - else - echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file - fi - echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file - fi - fi -else - if [[ ! -z "$bls_opt_smpgranularity" ]] ; then - n=$((bls_opt_mpinodes / bls_opt_smpgranularity)) - r=$((bls_opt_mpinodes % bls_opt_smpgranularity)) - (( r )) && mpireminder="+1:ppn=$r" - echo "#PBS -l nodes=$n:ppn=${bls_opt_smpgranularity}${mpireminder}" >> $bls_tmp_file - else - if [[ ! -z "$bls_opt_hostnumber" ]] ; then - n=$((bls_opt_mpinodes / bls_opt_hostnumber)) - r=$((bls_opt_mpinodes % bls_opt_hostnumber)) - (( r )) && mpireminder="+$r:ppn=$((n+1))" - echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=$n$mpireminder" >> $bls_tmp_file - elif [[ $bls_opt_mpinodes -gt 0 ]] ; then - echo "#PBS -l nodes=$bls_opt_mpinodes" >> $bls_tmp_file - fi - fi fi # --- End of MPI directives From 5c60008610a5a356689dddb25e31d7ce2b347d5f Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 7 Mar 2016 14:08:31 -0600 Subject: [PATCH 096/169] Build PBS select statement piece-by-piece --- src/scripts/pbs_submit.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 5d4d7829..ea7e60cb 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -119,16 +119,20 @@ bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attr ${pbs_binpath}/qstat --version | grep PBSPro is_pbs_pro=$? +# Begin building the select statement: select=x where x is the number of 'chunks' +# to request. 
Chunk requests should precede any resource requests (resource +# requests are order independent). An example from the PBS Pro manual: +# #PBS -l select=2:ncpus=8:mpiprocs=8:mem=6gb:interconnect=10g,walltime=16:00:00 +# Only one chunk is required for OSG needs at this time. +pbs_select="#PBS -l select=1" + if [ "x$bls_opt_req_mem" != "x" ] then # Different schedulers require different memory checks echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file - if [ "$is_pbs_pro" -eq "0" ] - then - # PBS Pro requires mem to be requested within a select statement - echo "#PBS -l select=1:mem=${bls_opt_req_mem}mb" >> $bls_tmp_file - else + pbs_select="$pbs_select:mem=${bls_opt_req_mem}mb" + if [ "$is_pbs_pro" -neq "0" ]; then echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file fi fi @@ -140,7 +144,9 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "$is_pbs_pro" -neq "0" ]; then +if [ "$is_pbs_pro" -eq "0" ]; then + pbs_select="$pbs_select:ncpus=1" +else if [ "x$bls_opt_wholenodes" == "xyes" ]; then bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} if [[ ! -z "$bls_opt_smpgranularity" ]] ; then @@ -198,6 +204,10 @@ else [ -z "$bls_fl_subst_and_accumulate_result" ] || echo "#PBS -W stageout=\\'$bls_fl_subst_and_accumulate_result\\'" >> $bls_tmp_file fi +if [ "$is_pbs_pro" -eq "0" ]; then + echo $pbs_select >> $bls_tmp_file +fi + echo "#PBS -m n" >> $bls_tmp_file bls_add_job_wrapper From d8f9709527f418232542ced3083133230944ed03 Mon Sep 17 00:00:00 2001 From: Brian Bockelman Date: Fri, 25 Mar 2016 15:55:18 -0500 Subject: [PATCH 097/169] Fix bash syntax error --- src/scripts/blah_common_submit_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 399ba81a..42e95ba9 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -664,7 +664,7 @@ function bls_start_job_wrapper () echo "\$new_home/`basename $bls_opt_the_command` $bls_arguments &" echo "fi" else - echo "\$NODE_COUNT=$bls_opt_mpinodes" + echo "export NODE_COUNT=$bls_opt_mpinodes" echo "$blah_job_wrapper $bls_opt_the_command $bls_arguments &" fi From 5c7257fa99112ee57d48290b69d24c3482d400c8 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 6 Apr 2016 17:39:42 -0500 Subject: [PATCH 098/169] Support dynamic assignment of env variables (SOFTWARE-2221) --- src/scripts/blah_common_submit_functions.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 9c1e2a5c..0f6d22cb 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -613,6 +613,12 @@ function bls_start_job_wrapper () fi fi + JOB_ENV="/var/lib/osg/osg-job-environment.conf" + LOCAL_JOB_ENV="/var/lib/osg/osg-local-job-environment.conf" + for fname in $JOB_ENV $LOCAL_JOB_ENV; do + test -r $fname && echo "`grep -G \"^[^# ]\" $fname`" + done + echo "old_home=\`pwd\`" # Set the temporary home (including cd'ing into it) if [ "x$bls_opt_run_dir" != "x" ] ; then From 5ea956fbb5f182353f672605657804c9abef89b0 Mon Sep 17 00:00:00 2001 From: "Gabriel A. 
von Winckler" Date: Thu, 14 Apr 2016 15:50:39 -0300 Subject: [PATCH 099/169] Fix pbs_status.py to be compatible with Slurm < 15.8 --- src/scripts/pbs_status.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 22047a28..9a10e3d9 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -317,9 +317,17 @@ def get_finished_job_stats(jobid): # TODO: fix for pbs log("Querying sacct for completed job for jobid: %s" % (str(jobid))) child_stdout = os.popen("sacct -j %s -l --noconvert -P" % (str(jobid))) + sacct_data = child_stdout.readlines() + ret = child_stdout.close() + + if ret: + # retry without --noconvert for slurm < 15.8 + child_stdout = os.popen("sacct -j %s -l -P" % (str(jobid))) + sacct_data = child_stdout.readlines() + child_stdout.close() try: - reader = csv.DictReader(child_stdout, delimiter="|") + reader = csv.DictReader(sacct_data, delimiter="|") except Exception, e: log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict From 8e157201fbb3a9798bf0770da54608985960e5d6 Mon Sep 17 00:00:00 2001 From: "Gabriel A. von Winckler" Date: Thu, 14 Apr 2016 15:50:54 -0300 Subject: [PATCH 100/169] Fix close() in pbs_status.py --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 9a10e3d9..a5975253 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -437,7 +437,7 @@ def fill_cache(cache_location): os.unlink(filename) raise finally: - os.close(fd) + f.close() os.rename(filename, cache_location) # Create the cluster_type file From 470c7207858e823ee23cbaaa20bf3ec085e3f2f2 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 12 Apr 2016 17:46:19 -0500 Subject: [PATCH 101/169] Fix mem requests (SOFTWARE-2260) --- config/blah.config.template | 3 +++ src/scripts/pbs_submit.sh | 29 +++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/config/blah.config.template b/config/blah.config.template index 552906e4..189e5911 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -96,6 +96,9 @@ pbs_nologaccess=yes #locally from the logs if BLParser is not present pbs_fallback=no +#Set to 'yes' to request pvmem when submitting jobs +pbs_set_pvmem=no + ##LSF common variables diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index ea7e60cb..f4aee87b 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -117,7 +117,7 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh -${pbs_binpath}/qstat --version | grep PBSPro +${pbs_binpath}/qstat --version 2>&1 | grep PBSPro > /dev/null 2>&1 is_pbs_pro=$? # Begin building the select statement: select=x where x is the number of 'chunks' # to request. Chunk requests should precede any resource requests (resource @@ -126,15 +126,20 @@ is_pbs_pro=$? # Only one chunk is required for OSG needs at this time. 
pbs_select="#PBS -l select=1" -if [ "x$bls_opt_req_mem" != "x" ] -then - # Different schedulers require different memory checks - echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file - echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file - pbs_select="$pbs_select:mem=${bls_opt_req_mem}mb" - if [ "$is_pbs_pro" -neq "0" ]; then - echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file - fi +if [ "x$bls_opt_req_mem" != "x" ]; then + # Max amount of virtual memory allocated to a single process + if [[ "x$pbs_set_pvmem" == "xyes" ]]; then + echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi + # Max amount of physical memory allocated to a single process + if [[ "$bls_opt_smpgranularity" == 1 ]]; then + echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi + # Total amount of memory allocated to the job + pbs_select="$pbs_select:mem=${bls_opt_req_mem}mb" + if [ "$is_pbs_pro" != 0 ]; then + echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi fi bls_set_up_local_and_extra_args @@ -144,7 +149,7 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "$is_pbs_pro" -eq "0" ]; then +if [ "$is_pbs_pro" == 0 ]; then pbs_select="$pbs_select:ncpus=1" else if [ "x$bls_opt_wholenodes" == "xyes" ]; then @@ -204,7 +209,7 @@ else [ -z "$bls_fl_subst_and_accumulate_result" ] || echo "#PBS -W stageout=\\'$bls_fl_subst_and_accumulate_result\\'" >> $bls_tmp_file fi -if [ "$is_pbs_pro" -eq "0" ]; then +if [ "$is_pbs_pro" == 0 ]; then echo $pbs_select >> $bls_tmp_file fi From a825612991a9a03d68b8f55d6510e0791e5958f7 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 18 Apr 2016 11:25:34 -0500 Subject: [PATCH 102/169] Copy blahp SLURM scripts from Condor (SOFTWARE-2256) --- src/CMakeLists.txt | 2 + src/scripts/Makefile.am | 2 + src/scripts/slurm_cancel.sh | 60 ++++++++++++++++++ src/scripts/slurm_hold.sh | 46 ++++++++++++++ src/scripts/slurm_resume.sh | 40 ++++++++++++ src/scripts/slurm_status.sh | 120 ++++++++++++++++++++++++++++++++++++ src/scripts/slurm_submit.sh | 106 +++++++++++++++++++++++++++++++ 7 files changed, 376 insertions(+) create mode 100644 src/scripts/slurm_cancel.sh create mode 100644 src/scripts/slurm_hold.sh create mode 100644 src/scripts/slurm_resume.sh create mode 100644 src/scripts/slurm_status.sh create mode 100644 src/scripts/slurm_submit.sh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f22041ad..575efc63 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -136,6 +136,8 @@ set(blah_scripts scripts/sge_submit.sh scripts/sge_filestaging scripts/sge_hold.sh scripts/sge_status.sh scripts/runcmd.pl.template scripts/sge_local_submit_attributes.sh + scripts/slurm_cancel.sh scripts/slurm_resume.sh scripts/slurm_status.sh + scripts/slurm_submit.sh scripts/pbs_status.py ) diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 987b55f6..367c8990 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -37,6 +37,8 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ condor_cancel.sh condor_status.sh condor_submit.sh condor_hold.sh condor_resume.sh \ sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \ sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ + slurm_cancel.sh slurm_resume.sh slurm_status.sh \ + slurm_submit.sh \ pbs_status.py EXTRA_DIST = $(bin_SCRIPTS) diff --git a/src/scripts/slurm_cancel.sh 
b/src/scripts/slurm_cancel.sh new file mode 100644 index 00000000..1e555bcb --- /dev/null +++ b/src/scripts/slurm_cancel.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# File: slurm_cancel.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +jnr=0 +jc=0 +for job in $@ ; do + jnr=$(($jnr+1)) +done +for job in $@ ; do + requested=`echo $job | sed 's/^.*\///'` + cmdout=`${slurm_binpath}/scancel $requested 2>&1` + retcode=$? + # If the job is already completed or no longer in the queue, + # treat it as successfully deleted. + if echo "$cmdout" | grep -q 'Invalid job id specified' ; then + retcode=0 + if [ "$retcode" == "0" ] ; then + if [ "$jnr" == "1" ]; then + echo " 0 No\\ error" + else + echo .$jc" 0 No\\ error" + fi + else + escaped_cmdout=`echo $cmdout|sed "s/ /\\\\\ /g"` + if [ "$jnr" == "1" ]; then + echo " $retcode $escaped_cmdout" + else + echo .$jc" $retcode $escaped_cmdout" + fi + fi + jc=$(($jc+1)) +done + diff --git a/src/scripts/slurm_hold.sh b/src/scripts/slurm_hold.sh new file mode 100644 index 00000000..67f41f6f --- /dev/null +++ b/src/scripts/slurm_hold.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# File: slurm_hold.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +requested=`echo $1 | sed 's/^.*\///'` + +cmdout=`${slurm_binpath}/scontrol hold $requested 2>&1` +retcode=$? +if echo "$cmdout" | grep -q 'Job is no longer pending execution' ; then + cmdout=`${slurm_binpath}/scontrol requeuehold $requested 2>&1` + retcode=$? 
+fi + +if [ "$retcode" == "0" ] ; then + echo " 0 No\\ error" + exit 0 +else + echo " 1 Error" + exit 1 +fi diff --git a/src/scripts/slurm_resume.sh b/src/scripts/slurm_resume.sh new file mode 100644 index 00000000..188b22cf --- /dev/null +++ b/src/scripts/slurm_resume.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# File: slurm_resume.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +requested=`echo $1 | sed 's/^.*\///'` +${slurm_binpath}/scontrol release $requested >&/dev/null + +if [ "$?" == "0" ]; then + echo " 0 No\\ error" + exit 0 +else + echo " 1 Error" + exit 1 +fi + diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh new file mode 100644 index 00000000..1b96a9e4 --- /dev/null +++ b/src/scripts/slurm_status.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# File: slurm_status.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +usage_string="Usage: $0 [-w] [-n]" + +#echo $0 "$@" >>~/slurm.debug + +############################################################### +# Parse parameters +############################################################### + +while getopts "wn" arg +do + case "$arg" in + w) getwn="yes" ;; + n) ;; + + -) break ;; + ?) echo $usage_string + exit 1 ;; + esac +done + +shift `expr $OPTIND - 1` + +pars=$* +proxy_dir=~/.blah_jobproxy_dir + +for reqfull in $pars ; do + reqjob=`echo $reqfull | sed -e 's/^.*\///'` + + staterr=/tmp/${reqjob}_staterr + +#echo "running: ${slurm_binpath}/scontrol show job $reqjob" >>~/slurm.debug + result=`${slurm_binpath}/scontrol show job $reqjob 2>$staterr` + stat_exit_code=$? 
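# (Annotation, not part of the patch: the awk program below picks through
# `scontrol show job` output, whose relevant lines typically look like
#     JobState=COMPLETED Reason=None Dependency=(null)
#     Requeue=1 Restarts=0 BatchFlag=1 ExitCode=0:0
# though the exact field layout varies somewhat between Slurm releases; the
# parser keys on the JobState= and ExitCode= tokens rather than line positions.)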
+#echo "stat_exit_code=$stat_exit_code" >>~/slurm.debug + result=`echo "$result" | awk -v job_id=$reqjob -v proxy_dir=$proxy_dir ' +BEGIN { + blah_status = 4 + slurm_status = "" + exit_code = "0" +} + +/JobState=/ { + slurm_status = substr( $1, index( $1, "=" ) + 1 ) +} + +/ExitCode=/ { + if ( split( $4, tmp, "[=:]" ) == 3 ) { + exit_code = tmp[2] + } +} + +END { + if ( slurm_status ~ "BOOT_FAIL" ) { blah_status = 4 } + if ( slurm_status ~ "CANCELLED" ) { blah_status = 3 } + if ( slurm_status ~ "COMPLETED" ) { blah_status = 4 } + if ( slurm_status ~ "CONFIGURING" ) { blah_status = 1 } + if ( slurm_status ~ "COMPLETING" ) { blah_status = 2 } + if ( slurm_status ~ "FAILED" ) { blah_status = 4 } + if ( slurm_status ~ "NODE_FAIL" ) { blah_status = 4 } + if ( slurm_status ~ "PENDING" ) { blah_status = 1 } + if ( slurm_status ~ "PREEMPTED" ) { blah_status = 4 } + if ( slurm_status ~ "RUNNING" ) { blah_status = 2 } + if ( slurm_status ~ "SPECIAL_EXIT" ) { blah_status = 4 } + if ( slurm_status ~ "STOPPED" ) { blah_status = 2 } + if ( slurm_status ~ "SUSPENDED" ) { blah_status = 2 } + + print "[BatchJobId=\"" job_id "\";JobStatus=" blah_status ";" + if ( blah_status == 4 ) { + print "ExitCode=" exit_code ";" + } + print "]\n" + if ( blah_status == 3 || blah_status == 4 ) { + #system( "rm " proxy_dir "/" job_id ".proxy 2>/dev/null" ) + } +} +' +` +#echo result=$result >>~/slurm.debug + errout=`cat $staterr` + rm -f $staterr 2>/dev/null + + if echo "$errout" | grep -q "Invalid job id specified" ; then + stat_exit_code=0 + fi + if [ $stat_exit_code -eq 0 ] ; then + echo 0${result} +#echo 0${result} >>~/slurm.debug + else + echo 1Error: ${errout} +#echo 1Error: ${errout} >>~/slurm.debug + fi + +done + +exit 0 diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh new file mode 100644 index 00000000..a798f132 --- /dev/null +++ b/src/scripts/slurm_submit.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# File: slurm_submit.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Description: +# Submission script for SLURM, to be invoked by blahpd server. +# Usage: +# slurm_submit.sh -c [-i ] [-o ] [-e ] [-w working dir] [-- command's arguments] +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. 
`dirname $0`/blah_common_submit_functions.sh + +# Default values for configuration variables +slurm_std_storage=${slurm_std_storage:-/dev/null} +slurm_binpath=${slurm_binpath:-/usr/bin} + +bls_parse_submit_options "$@" + +bls_setup_all_files + +# Write wrapper preamble +cat > $bls_tmp_file << end_of_preamble +#!/bin/bash +# SLURM job wrapper generated by `basename $0` +# on `/bin/date` +# +# stgcmd = $bls_opt_stgcmd +# proxy_string = $bls_opt_proxy_string +# proxy_local_file = $bls_proxy_local_file +# +# SLURM directives: +#SBATCH -o $slurm_std_storage +#SBATCH -e $slurm_std_storage +end_of_preamble + +#local batch system-specific file output must be added to the submit file +bls_local_submit_attributes_file=${blah_bin_directory}/slurm_local_submit_attributes.sh + +if [ "x$bls_opt_req_mem" != "x" ] +then + # Different schedulers require different memory checks + echo "#SBATCH --mem=${bls_opt_req_mem}" >> $bls_tmp_file +fi + +bls_set_up_local_and_extra_args + +# Simple support for multi-cpu attributes +if [[ $bls_opt_mpinodes -gt 1 ]] ; then + echo "#SBATCH -N $bls_opt_mpinodes" >> $bls_tmp_file +fi + + +# Input and output sandbox setup. +# Assume all filesystems are shared. + +bls_add_job_wrapper + + +############################################################### +# Submit the script +############################################################### + +datenow=`date +%Y%m%d` +jobID=`${slurm_binpath}/sbatch $bls_tmp_file` # actual submission +retcode=$? +cp $bls_tmp_file ~/bls_tmp_file.$$ +if [ "$retcode" != "0" ] ; then + rm -f $bls_tmp_file + exit 1 +fi + +# The job id is actually the first numbers in the string (slurm support) +jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` +if [ "X$jobID" == "X" ]; then + rm -f $bls_tmp_file + echo "Error: job id missing" >&2 + echo Error # for the sake of waiting fgets in blahpd + exit 1 +fi + +# Compose the blahp jobID ("slurm/" + datenow + pbs jobid) +blahp_jobID="slurm/`basename $datenow`/$jobID" + +echo "BLAHP_JOBID_PREFIX$blahp_jobID" + +bls_wrap_up_submit + +exit $retcode From 532fb7f68a0d945381568dddb6b05de4a2ccc1e0 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 20 Apr 2016 12:27:40 -0500 Subject: [PATCH 103/169] Add default SLURM config --- config/blah.config.template | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 552906e4..b4497ef4 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -303,7 +303,11 @@ sge_rootpath=$SGE_ROOT ##SLURM #path to the slurm executables -slurm_binpath= +#default: /usr/bin +slurm_binpath=/usr/bin + +#default: /dev/null +slurm_std_storage=/dev/null ## #####BNotifier subsection From 3e780c20b116b964b3673b8b797e3f94d7efb4d6 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 18 May 2016 10:17:42 -0500 Subject: [PATCH 104/169] Add HTCondor's patch for multicore HTCondor (SOFTWARE-2303) --- src/scripts/condor_submit.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index c96595bf..8b55cb9f 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -47,13 +47,13 @@ original_args="$@" # script debug flag: currently unused debug=no -# number of MPI nodes: currently unused -mpinodes=0 +# number of MPI nodes: interpretted as a core count for vanilla universe +mpinodes=1 # Name of local requirements file: currently unused req_file="" -while getopts 
"a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:m:" arg +while getopts "a:i:o:de:j:n:N:z:h:S:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:m:" arg do case "$arg" in a) xtra_args="$OPTARG" ;; @@ -66,6 +66,10 @@ do V) environment="$OPTARG";; c) command="$OPTARG" ;; n) mpinodes="$OPTARG" ;; + N) hostsmpsize="$OPTARG";; + z) wholenodes="$OPTARG";; + h) hostnumber="$OPTARG";; + S) smpgranularity="$OPTARG";; w) workdir="$OPTARG";; x) proxy_file="$OPTARG" ;; u) proxy_subject="$OPTARG" ;; @@ -247,6 +251,7 @@ then fi cat >> $submit_file << EOF +request_cpus = $mpinodes # We insist on new style quoting in Condor arguments = $arguments input = $stdin From 47eaf9a06fc682a33eaf204b4096fec7f2c5615e Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 21 Jun 2016 15:28:16 -0500 Subject: [PATCH 105/169] Fix Slurm file leak (SOFTWARE-2367) --- src/scripts/slurm_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index a798f132..64834798 100644 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -81,7 +81,7 @@ bls_add_job_wrapper datenow=`date +%Y%m%d` jobID=`${slurm_binpath}/sbatch $bls_tmp_file` # actual submission retcode=$? -cp $bls_tmp_file ~/bls_tmp_file.$$ + if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file exit 1 From 17bcdfef93ca484f40f5315162b7c44b607ffefe Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 22 Jun 2016 10:55:40 -0500 Subject: [PATCH 106/169] Print sbatch errors to stderr for handling by HTCondor's gridmanager --- src/scripts/slurm_submit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 64834798..9e67467c 100644 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -84,6 +84,7 @@ retcode=$? 
if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file + echo "Error from sbatch: $jobID" >&2 exit 1 fi From e053390d8eaf7b98b2e97c16f4923ce898a86162 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 22 Jun 2016 14:13:09 -0500 Subject: [PATCH 107/169] Fix bad spacing on request memory if statement --- src/scripts/condor_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 8b55cb9f..10849855 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -245,7 +245,7 @@ then echo -e $xtra_args >> $submit_file fi -if [ "x$req_mem" != "x"] +if [ "x$req_mem" != "x" ] then echo "request_memory = $req_mem" >> $submit_file fi From 6ed421fa3101a272e79fe2eb630c32b212e2712e Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 23 Jun 2016 13:21:53 -0500 Subject: [PATCH 108/169] Set executable bit on scripts --- src/scripts/lsf_hold.sh | 0 src/scripts/lsf_resume.sh | 0 src/scripts/pbs_hold.sh | 0 src/scripts/pbs_resume.sh | 0 src/scripts/slurm_cancel.sh | 0 src/scripts/slurm_hold.sh | 0 src/scripts/slurm_resume.sh | 0 src/scripts/slurm_status.sh | 0 src/scripts/slurm_submit.sh | 0 9 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 src/scripts/lsf_hold.sh mode change 100644 => 100755 src/scripts/lsf_resume.sh mode change 100644 => 100755 src/scripts/pbs_hold.sh mode change 100644 => 100755 src/scripts/pbs_resume.sh mode change 100644 => 100755 src/scripts/slurm_cancel.sh mode change 100644 => 100755 src/scripts/slurm_hold.sh mode change 100644 => 100755 src/scripts/slurm_resume.sh mode change 100644 => 100755 src/scripts/slurm_status.sh mode change 100644 => 100755 src/scripts/slurm_submit.sh diff --git a/src/scripts/lsf_hold.sh b/src/scripts/lsf_hold.sh old mode 100644 new mode 100755 diff --git a/src/scripts/lsf_resume.sh b/src/scripts/lsf_resume.sh old mode 100644 new mode 100755 diff --git a/src/scripts/pbs_hold.sh b/src/scripts/pbs_hold.sh old mode 100644 new mode 100755 diff --git a/src/scripts/pbs_resume.sh b/src/scripts/pbs_resume.sh old mode 100644 new mode 100755 diff --git a/src/scripts/slurm_cancel.sh b/src/scripts/slurm_cancel.sh old mode 100644 new mode 100755 diff --git a/src/scripts/slurm_hold.sh b/src/scripts/slurm_hold.sh old mode 100644 new mode 100755 diff --git a/src/scripts/slurm_resume.sh b/src/scripts/slurm_resume.sh old mode 100644 new mode 100755 diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh old mode 100644 new mode 100755 diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh old mode 100644 new mode 100755 From 898f72b024bd7535c5e370b241276c9e04dd8648 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 23 Jun 2016 13:59:06 -0500 Subject: [PATCH 109/169] Package slurm_hold.sh (SOFTWARE-2375) --- src/CMakeLists.txt | 2 +- src/scripts/Makefile.am | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 575efc63..bec5df47 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -137,7 +137,7 @@ set(blah_scripts scripts/sge_status.sh scripts/runcmd.pl.template scripts/sge_local_submit_attributes.sh scripts/slurm_cancel.sh scripts/slurm_resume.sh scripts/slurm_status.sh - scripts/slurm_submit.sh + scripts/slurm_hold.sh scripts/slurm_submit.sh scripts/pbs_status.py ) diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 367c8990..23c397cb 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -37,7 +37,7 @@ libexec_SCRIPTS 
= blah_load_config.sh blah_common_submit_functions.sh \ condor_cancel.sh condor_status.sh condor_submit.sh condor_hold.sh condor_resume.sh \ sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \ sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ - slurm_cancel.sh slurm_resume.sh slurm_status.sh \ + slurm_cancel.sh slurm_hold.sh slurm_resume.sh slurm_status.sh \ slurm_submit.sh \ pbs_status.py From 42e738e839d89a54a90d210fcc96dc4659a79b92 Mon Sep 17 00:00:00 2001 From: efajardo Date: Wed, 20 Jul 2016 14:25:42 -0700 Subject: [PATCH 110/169] Updated the changes from HTCondor ticket 5722 for SOFTWARE-2399 --- src/scripts/Makefile.am | 3 +- src/scripts/pbs_status.sh | 4 + src/scripts/slurm_status.py | 518 ++++++++++++++++++++++++++++++++++++ src/scripts/slurm_status.sh | 4 + 4 files changed, 528 insertions(+), 1 deletion(-) create mode 100644 src/scripts/slurm_status.py diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 23c397cb..c4b0a0be 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -39,6 +39,7 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ slurm_cancel.sh slurm_hold.sh slurm_resume.sh slurm_status.sh \ slurm_submit.sh \ - pbs_status.py + pbs_status.py \ + slurm_status.py EXTRA_DIST = $(bin_SCRIPTS) diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index a58ec725..51cda061 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -33,6 +33,10 @@ . `dirname $0`/blah_load_config.sh +if [ -x ${blah_libexec_directory}/pbs_status.py ] ; then + exec ${blah_libexec_directory}/pbs_status.py "$@" +fi + if [ "x$job_registry" != "x" ] ; then ${blah_sbin_directory}/blah_job_registry_lkup $@ exit 0 diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py new file mode 100644 index 00000000..6ea9831d --- /dev/null +++ b/src/scripts/slurm_status.py @@ -0,0 +1,518 @@ +#!/usr/bin/python + +# File: slurm_status.py +# +# Author: Brian Bockelman (bbockelm@cse.unl.edu) +# Jaime Frey (jfrey@cs.wisc.edu) +# +# Copyright (c) University of Nebraska-Lincoln. 2012 +# University of Wisconsin-Madison. 2016 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Query SLURM for the status of a given job + +Internally, it creates a cache of the SLURM response for all jobs and +will reuse this for subsequent queries. 
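(Reader's note, summarizing the code below: the cache is a tab-separated file
of jobid / pickled-dictionary rows kept at
/var/tmp/slurm_cache_<username>/blahp_results_cache, and it is rebuilt whenever
it is older than cache_timeout, 60 seconds.)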
+""" + +import os +import re +import pwd +import sys +import time +import errno +import fcntl +import random +import struct +import subprocess +import signal +import tempfile +import pickle +import csv + +cache_timeout = 60 + +launchtime = time.time() + +def log(msg): + """ + A very lightweight log - not meant to be used in production, but helps + when debugging scale tests + """ + print >> sys.stderr, time.strftime("%x %X"), os.getpid(), msg + +def createCacheDir(): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "slurm_cache_%s" % username) + + try: + os.mkdir(cache_dir, 0755) + except OSError, oe: + if oe.errno != errno.EEXIST: + raise + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + + return cache_dir + +def initLog(): + """ + Determine whether to create a logfile based on the presence of a file + in the user's slurm cache directory. If so, make the logfile there. + """ + cache_dir = createCacheDir() + if os.path.exists(os.path.join(cache_dir, "slurm_status.debug")): + filename = os.path.join(cache_dir, "slurm_status.log") + else: + filename = "/dev/null" + fd = open(filename, "a") + # Do NOT close the file descriptor blahp originally hands us for stderr. + # This causes blahp to lose all status updates. + os.dup(2) + os.dup2(fd.fileno(), 2) + +# Something else from a prior life - see gratia-probe-common's GratiaWrapper.py +def ExclusiveLock(fd, timeout=120): + """ + Grabs an exclusive lock on fd + + If the lock is owned by another process, and that process is older than the + timeout, then the other process will be signaled. If the timeout is + negative, then the other process is never signaled. + + If we are unable to hold the lock, this call will not block on the lock; + rather, it will throw an exception. + + By default, the timeout is 120 seconds. + """ + + # POSIX file locking is cruelly crude. There's nothing to do besides + # try / sleep to grab the lock, no equivalent of polling. + # Why hello, thundering herd. + + # An alternate would be to block on the lock, and use signals to interupt. + # This would mess up Gratia's flawed use of signals already, and not be + # able to report on who has the lock. I don't like indefinite waits! + max_time = 30 + starttime = time.time() + tries = 1 + while time.time() - starttime < max_time: + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + if check_lock(fd, timeout): + time.sleep(.2) # Fast case; however, we have *no clue* how + # long it takes to clean/release the old lock. + # Nor do we know if we'd get it if we did + # fcntl.lockf w/ blocking immediately. Blech. + # Check again immediately, especially if this was the last + # iteration in the for loop. + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + sleeptime = random.random() + log("Unable to acquire lock, try %i; will sleep for %.2f " \ + "seconds and try for %.2f more seconds." % (tries, sleeptime, max_time - (time.time()-starttime))) + tries += 1 + time.sleep(sleeptime) + + log("Fatal exception - Unable to acquire lock") + raise Exception("Unable to acquire lock") + +def check_lock(fd, timeout): + """ + For internal use only. + + Given a fd that is locked, determine which process has the lock. 
+ Kill said process if it is older than "timeout" seconds. + This will log the PID of the "other process". + """ + + pid = get_lock_pid(fd) + if pid == os.getpid(): + return True + + if timeout < 0: + log("Another process, %d, holds the cache lock." % pid) + return False + + try: + age = get_pid_age(pid) + except: + log("Another process, %d, holds the cache lock." % pid) + log("Unable to get the other process's age; will not time it out.") + return False + + log("Another process, %d (age %d seconds), holds the cache lock." % (pid, age)) + + if age > timeout: + os.kill(pid, signal.SIGKILL) + else: + return False + + return True + +linux_struct_flock = "hhxxxxqqixxxx" +try: + os.O_LARGEFILE +except AttributeError: + start_len = "hhlli" + +def get_lock_pid(fd): + # For reference, here's the definition of struct flock on Linux + # (/usr/include/bits/fcntl.h). + # + # struct flock + # { + # short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ + # short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ + # __off_t l_start; /* Offset where the lock begins. */ + # __off_t l_len; /* Size of the locked area; zero means until EOF. */ + # __pid_t l_pid; /* Process holding the lock. */ + # }; + # + # Note that things are different on Darwin + # Assuming off_t is unsigned long long, pid_t is int + try: + if sys.platform == "darwin": + arg = struct.pack("QQihh", 0, 0, 0, fcntl.F_WRLCK, 0) + else: + arg = struct.pack(linux_struct_flock, fcntl.F_WRLCK, 0, 0, 0, 0) + result = fcntl.fcntl(fd, fcntl.F_GETLK, arg) + except IOError, ie: + if ie.errno != errno.EINVAL: + raise + log("Unable to determine which PID has the lock due to a " \ + "python portability failure. Contact the developers with your" \ + " platform information for support.") + return False + if sys.platform == "darwin": + _, _, pid, _, _ = struct.unpack("QQihh", result) + else: + _, _, _, _, pid = struct.unpack(linux_struct_flock, result) + return pid + +def get_pid_age(pid): + now = time.time() + st = os.stat("/proc/%d" % pid) + return now - st.st_ctime + +def call_scontrol(jobid=""): + """ + Call scontrol directly for a jobid. + If none is specified, query all jobid's. + + Returns a python dictionary with the job info. + """ + scontrol = get_slurm_location('scontrol') + + starttime = time.time() + log("Starting scontrol.") + child_stdout = os.popen("%s show job %s" % (scontrol, jobid)) + result = parse_scontrol_fd(child_stdout) + exit_status = child_stdout.close() + log("Finished scontrol (time=%f)." % (time.time()-starttime)) + if exit_status: + exit_code = 0 + if os.WIFEXITED(exit_status): + exit_code = os.WEXITSTATUS(exit_status) + if exit_code == 1: # Completed + result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} + elif exit_code == 271: # Removed + result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} + else: + raise Exception("scontrol failed with exit code %s" % str(exit_status)) + + # If the job has completed... + if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'): + # Get the finished job stats and update the result + finished_job_stats = get_finished_job_stats(jobid) + result[jobid].update(finished_job_stats) + + return result + + +def which(program): + """ + Determine if the program is in the path. 
+ + arg program: name of the program to search + returns: full path to executable, or None if executable is not found + """ + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + +def convert_cpu_to_seconds(cpu_string): + import re + h,m,s = re.split(':',cpu_string) + return int(h) * 3600 + int(m) * 60 + int(s) + +def get_finished_job_stats(jobid): + """ + Get a completed job's statistics such as used RAM and cpu usage. + """ + + # First, list the attributes that we want + return_dict = { "ImageSize": 0, "ExitCode": 0, "RemoteUserCpu": 0 } + + # Next, query the appropriate interfaces for the completed job information + sacct = get_slurm_location('sacct') + log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + child_stdout = os.popen("%s -j %s -l --noconvert -P" % (sacct, str(jobid))) + sacct_data = child_stdout.readlines() + ret = child_stdout.close() + + if ret: + # retry without --noconvert for slurm < 15.8 + child_stdout = os.popen("sacct -j %s -l -P" % (str(jobid))) + sacct_data = child_stdout.readlines() + child_stdout.close() + + try: + reader = csv.DictReader(sacct_data, delimiter="|") + except Exception, e: + log("Unable to read in CSV output from sacct: %s" % str(e)) + return return_dict + + # Slurm can return more than 1 row, for some odd reason. + # so sum up relevant values + for row in reader: + if row["AveCPU"] is not "": + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + if row["MaxRSS"] is not "": + # Remove the trailing 'K' + return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + if row["ExitCode"] is not "": + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + + return return_dict + + +_slurm_location_cache = None +def get_slurm_location(program): + """ + Locate the copy of the slurm bin the blahp configuration wants to use. + """ + global _slurm_location_cache + if _slurm_location_cache != None: + return os.path.join(_slurm_location_cache, program) + load_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'blah_load_config.sh') + if os.path.exists(load_config_path) and os.access(load_config_path, os.R_OK): + cmd = 'source %s && echo "${slurm_binpath:-/usr/bin}/%s"' % (load_config_path, program) + else: + cmd = 'which %s' % program + child_stdout = os.popen(cmd) + output = child_stdout.read() + location = output.split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine scontrol location: %s" % output) + _slurm_location_cache = os.path.dirname(location) + return location + +job_id_re = re.compile("JobId=([0-9]+) .*") +exec_host_re = re.compile("\s*BatchHost=([\w\-.]+)") +status_re = re.compile("\s*JobState=([\w]+) .*") +exit_status_re = re.compile(".* ExitCode=(-?[0-9]+:[0-9]+)") +status_mapping = {"BOOT_FAIL": 4, "CANCELLED": 3, "COMPLETED": 4, "CONFIGURING": 1, "COMPLETING": 2, "FAILED": 4, "NODE_FAIL": 4, "PENDING": 1, "PREEMPTED": 4, "RUNNING": 2, "SPECIAL_EXIT": 4, "STOPPED": 2, "SUSPENDED": 2} + +def parse_scontrol_fd(fd): + """ + Parse the stdout fd of "scontrol show job" into a python dictionary + containing the information we need. 
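    (Reader's note: each record, as scontrol typically prints it, opens with a
    line such as "JobId=123456 JobName=test" and continues with indented
    attribute lines carrying BatchHost=, JobState= and ExitCode= fields, which
    the regular expressions defined above pick apart.)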
+ """ + job_info = {} + cur_job_id = None + cur_job_info = {} + for line in fd: + line = line.strip() + m = job_id_re.match(line) + if m: + if cur_job_id: + job_info[cur_job_id] = cur_job_info + cur_job_id = m.group(1) + #print cur_job_id, line + cur_job_info = {"BatchJobId": '"%s"' % cur_job_id} + continue + if cur_job_id == None: + continue + m = exec_host_re.match(line) + if m: + cur_job_info["WorkerNode"] = '"' + m.group(1) + '"' + continue + m = status_re.match(line) + if m: + status = status_mapping.get(m.group(1), 0) + if status != 0: + cur_job_info["JobStatus"] = str(status) + continue + m = exit_status_re.match(line) + if m: + cur_job_info["ExitCode"] = ' %s' % m.group(1).split(":")[0] + continue + if cur_job_id: + job_info[cur_job_id] = cur_job_info + return job_info + +def job_dict_to_string(info): + result = ["%s=%s;" % (i[0], i[1]) for i in info.items()] + return "[" + " ".join(result) + " ]" + +def fill_cache(cache_location): + log("Starting query to fill cache.") + results = call_scontrol() + log("Finished query to fill cache.") + (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + # Open the file with a proper python file object + f = os.fdopen(fd, "w") + writer = csv.writer(f, delimiter='\t') + try: + try: + for key, val in results.items(): + key = key.split(".")[0] + writer.writerow([key, pickle.dumps(val)]) + os.fsync(fd) + except: + os.unlink(filename) + raise + finally: + f.close() + os.rename(filename, cache_location) + + global launchtime + launchtime = time.time() + +cache_line_re = re.compile("([0-9]+[\.\w\-]+):\s+(.+)") +def cache_to_status(jobid, fd): + reader = csv.reader(fd, delimiter='\t') + for row in reader: + if row[0] == jobid: + return pickle.loads(row[1]) + +def check_cache(jobid, recurse=True): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "slurm_cache_%s" % username) + if recurse: + try: + s = os.stat(cache_dir) + except OSError, oe: + if oe.errno != 2: + raise + os.mkdir(cache_dir, 0755) + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + cache_location = os.path.join(cache_dir, "blahp_results_cache") + try: + fd = open(cache_location, "a+") + except IOError, ie: + if ie.errno != 2: + raise + # Create an empty file so we can hold the file lock + fd = open(cache_location, "w+") + ExclusiveLock(fd) + # If someone grabbed the lock between when we opened and tried to + # acquire, they may have filled the cache + if os.stat(cache_location).st_size == 0: + fill_cache(cache_location) + fd.close() + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + ExclusiveLock(fd) + s = os.fstat(fd.fileno()) + if s.st_uid != uid: + raise Exception("Unable to check cache file because it is owned by UID %d" % s.st_uid) + if (s.st_size == 0) or (launchtime - s.st_mtime > cache_timeout): + # If someone filled the cache between when we opened the file and + # grabbed the lock, we may not need to fill the cache. 
+        s2 = os.stat(cache_location)
+        if (s2.st_size == 0) or (launchtime - s2.st_mtime > cache_timeout):
+            fill_cache(cache_location)
+        if recurse:
+            return check_cache(jobid, recurse=False)
+        else:
+            return None
+    return cache_to_status(jobid, fd)
+
+job_status_re = re.compile(".*JobStatus=(\d+);.*")
+
+def main():
+    initLog()
+
+    if len(sys.argv) != 2:
+        print "1Usage: slurm_status.py slurm/<date>/<jobid>"
+        return 1
+    jobid = sys.argv[1].split("/")[-1].split(".")[0]
+    log("Checking cache for jobid %s" % jobid)
+    cache_contents = None
+    try:
+        cache_contents = check_cache(jobid)
+    except Exception, e:
+        msg = "1ERROR: Internal exception, %s" % str(e)
+        log(msg)
+        #print msg
+    if not cache_contents:
+        log("Jobid %s not in cache; querying SLURM" % jobid)
+        results = call_scontrol(jobid)
+        log("Finished querying SLURM for jobid %s" % jobid)
+        if not results or jobid not in results:
+            log("1ERROR: Unable to find job %s" % jobid)
+            print "1ERROR: Unable to find job %s" % jobid
+        else:
+            log("0%s" % job_dict_to_string(results[jobid]))
+            print "0%s" % job_dict_to_string(results[jobid])
+    else:
+        log("Jobid %s in cache." % jobid)
+        log("0%s" % job_dict_to_string(cache_contents))
+
+        if cache_contents["JobStatus"] == '4' or cache_contents["JobStatus"] == '3':
+            finished_job_stats = get_finished_job_stats(jobid)
+            cache_contents.update(finished_job_stats)
+
+        print "0%s" % job_dict_to_string(cache_contents)
+    return 0
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except SystemExit:
+        raise
+    except Exception, e:
+        print "1ERROR: %s" % str(e).replace("\n", "\\n")
+        sys.exit(0)
diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh
index 1b96a9e4..ae800124 100755
--- a/src/scripts/slurm_status.sh
+++ b/src/scripts/slurm_status.sh
@@ -19,6 +19,10 @@
 
 . `dirname $0`/blah_load_config.sh
 
+if [ -x ${blah_libexec_directory}/slurm_status.py ] ; then
+  exec ${blah_libexec_directory}/slurm_status.py "$@"
+fi
+
 if [ -z "$slurm_binpath" ] ; then
   slurm_binpath=/usr/bin
 fi

From b96afb33649feef9699c1761954d928d04c1773d Mon Sep 17 00:00:00 2001
From: Edgar Fajardo
Date: Tue, 26 Jul 2016 12:22:23 -0700
Subject: [PATCH 111/169] Update slurm_cancel.sh

There was a bug caught in HTCondor ticket [5804](https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5804). This fixes it for [SOFTWARE-2404](https://jira.opensciencegrid.org/browse/SOFTWARE-2404)
---
 src/scripts/slurm_cancel.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/scripts/slurm_cancel.sh b/src/scripts/slurm_cancel.sh
index 1e555bcb..b80245e4 100755
--- a/src/scripts/slurm_cancel.sh
+++ b/src/scripts/slurm_cancel.sh
@@ -41,6 +41,7 @@ for job in $@ ; do
   # If the job is already completed or no longer in the queue,
   # treat it as successfully deleted.
if echo "$cmdout" | grep -q 'Invalid job id specified' ; then retcode=0 + fi if [ "$retcode" == "0" ] ; then if [ "$jnr" == "1" ]; then echo " 0 No\\ error" From b60fe7da3707e4f783e42f2bd9c1ea3f901d9988 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 23 May 2016 10:48:39 -0500 Subject: [PATCH 112/169] Allow users to set the SGE parallel environment policy --- config/blah.config.template | 3 +++ src/scripts/sge_submit.sh | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/blah.config.template b/config/blah.config.template index 174fdde1..909ba5b8 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -303,6 +303,9 @@ sge_cellname=$SGE_CELL sge_rootpath=$SGE_ROOT +#set the SGE parallel environment policy +sge_pe_policy=* + ##SLURM #path to the slurm executables diff --git a/src/scripts/sge_submit.sh b/src/scripts/sge_submit.sh index 941e8fb8..c95e7325 100755 --- a/src/scripts/sge_submit.sh +++ b/src/scripts/sge_submit.sh @@ -83,7 +83,8 @@ fi # Write SGE directives according to command line options # handle queue overriding [ -z "$bls_opt_queue" ] || grep -q "^#\$ -q" $bls_tmp_file || echo "#\$ -q $bls_opt_queue" >> $bls_tmp_file -[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q "^#\$ -pe *\\*" $bls_tmp_file || echo "#\$ -pe * $bls_opt_mpinodes" >>$bls_tmp_file +[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q "^#\$ -pe *\\*" $bls_tmp_file \ + || echo "#\$ -pe $sge_pe_policy $bls_opt_mpinodes" >>$bls_tmp_file # Input and output sandbox setup. bls_fl_subst_and_accumulate inputsand "@@F_REMOTE@`hostname -f`:@@F_LOCAL" "@@@" From 83ba3845ba61781bb7efde6a8da17184473baade Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 12 May 2016 11:19:48 -0500 Subject: [PATCH 113/169] Enable multicore support to PBS Pro (SOFTWARE-2326) --- src/scripts/pbs_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index f4aee87b..12bdabc8 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -150,7 +150,7 @@ bls_set_up_local_and_extra_args # Extended support for MPI attributes if [ "$is_pbs_pro" == 0 ]; then - pbs_select="$pbs_select:ncpus=1" + pbs_select="$pbs_select:ncpus=$bls_opt_smpgranularity" else if [ "x$bls_opt_wholenodes" == "xyes" ]; then bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} From 1bb0cfc303fc51b8170b839d8c0bdb113551f2a4 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 24 Aug 2016 14:41:05 -0500 Subject: [PATCH 114/169] Change from -N to -c for multi-cores per node The -N option is for the number of nodes. -c is for the number of cores per task. Since we care about the number of cores on a single node, -c is more appropriate. 
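(Reader's note, not part of the commit: a minimal sketch of how the three sbatch
flags discussed here differ, assuming stock Slurm semantics, written for an
eight-core, single-node allocation:

    #SBATCH --nodes=1          # -N: number of nodes
    #SBATCH --ntasks=8         # -n: number of tasks; each task gets one core by default
    #SBATCH --cpus-per-task=8  # -c: cores handed to each individual task

Requesting --ntasks=8 on one node and requesting --cpus-per-task=8 for a single
task both pin eight cores on one machine, which is why the follow-up patches
below move between these spellings.)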
--- src/scripts/slurm_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 9e67467c..d7ef30cc 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -64,7 +64,7 @@ bls_set_up_local_and_extra_args # Simple support for multi-cpu attributes if [[ $bls_opt_mpinodes -gt 1 ]] ; then - echo "#SBATCH -N $bls_opt_mpinodes" >> $bls_tmp_file + echo "#SBATCH -c $bls_opt_mpinodes" >> $bls_tmp_file fi From c78e2e4c2b28b73eb0e793c4f831f91b5a55ff45 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 24 Aug 2016 16:06:51 -0500 Subject: [PATCH 115/169] Use number of nodes and tasks --- src/scripts/slurm_submit.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index d7ef30cc..e9349b09 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -64,7 +64,8 @@ bls_set_up_local_and_extra_args # Simple support for multi-cpu attributes if [[ $bls_opt_mpinodes -gt 1 ]] ; then - echo "#SBATCH -c $bls_opt_mpinodes" >> $bls_tmp_file + echo "#SBATCH -N 1" >> $bls_tmp_file + echo "#SBATCH -n $bls_opt_mpinodes" >> $bls_tmp_file fi From 2bbe27a6870ee6ab1784ec71db7cf038148f7b28 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Wed, 24 Aug 2016 16:32:32 -0500 Subject: [PATCH 116/169] Expand small options for clarity --- src/scripts/slurm_submit.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index e9349b09..934b12ca 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -64,8 +64,8 @@ bls_set_up_local_and_extra_args # Simple support for multi-cpu attributes if [[ $bls_opt_mpinodes -gt 1 ]] ; then - echo "#SBATCH -N 1" >> $bls_tmp_file - echo "#SBATCH -n $bls_opt_mpinodes" >> $bls_tmp_file + echo "#SBATCH --nodes=1" >> $bls_tmp_file + echo "#SBATCH --ntasks=$bls_opt_mpinodes" >> $bls_tmp_file fi From d738b1d590aa8e456ddd3a79e4c7858fa5cadaaa Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 25 Aug 2016 11:18:46 -0500 Subject: [PATCH 117/169] Add slurm_local_submit_attributes.sh --- src/scripts/slurm_local_submit_attributes.sh | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/scripts/slurm_local_submit_attributes.sh diff --git a/src/scripts/slurm_local_submit_attributes.sh b/src/scripts/slurm_local_submit_attributes.sh new file mode 100644 index 00000000..971bcf6d --- /dev/null +++ b/src/scripts/slurm_local_submit_attributes.sh @@ -0,0 +1,28 @@ +#/bin/sh + +# This file is sourced by blahp before submitting the job to slurm +# Anything printed to stdout is included in the submit file. +# For example, to set a default walltime of 24 hours in PBS, you +# could uncomment this line: + +# echo "#SBATCH --time=24:00:00" + +# blahp allows arbitrary attributes to be passed to this script on a per-job +# basis. If you add the following to your HTCondor-G submit file: + +#+remote_cerequirements = NumJobs == 100 && foo = 5 + +# Then an environment variable, NumJobs, will be exported prior to calling this +# script and set to a value of 100. The variable foo will be set to 5. + +# You could allow users to set the walltime for the job with the following +# customization (slurm syntax given; adjust for the appropriate batch system): + +# Uncomment the else block to default to 24 hours of runtime; otherwise, the queue +# default is used. 
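# (Worked example of the mechanism described above: a grid job submitted with
#   +remote_cerequirements = Walltime == 86400
# reaches this script with Walltime=86400 in its environment, so the block
# below emits "#SBATCH --time=1440", the 86400 seconds divided down to the
# minutes that Slurm expects.)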
+if [ -n "$Walltime" ]; then + let Walltime=Walltime/60 + echo "#SBATCH --time=$Walltime" +# else +# echo "#SBATCH --time=24:00:00" +fi From 1e687fab5fb6537f905464623303b43532930822 Mon Sep 17 00:00:00 2001 From: Derek Weitzel Date: Thu, 25 Aug 2016 11:44:47 -0500 Subject: [PATCH 118/169] Add partition support to Slurm submit --- src/scripts/slurm_submit.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 9e67467c..f6852639 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -54,6 +54,9 @@ end_of_preamble #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_bin_directory}/slurm_local_submit_attributes.sh +# Handle queues and paritions (same thing in SLURM) (copied from PBS submit file) +[ -z "$bls_opt_queue" ] || grep -q "^#SBATCH --partition" $bls_tmp_file || echo "#SBATCH --partition=$bls_opt_queue" >> $bls_tmp_file + if [ "x$bls_opt_req_mem" != "x" ] then # Different schedulers require different memory checks From 041f885effb4953b88cdc81e97784d8a1ad8c7b0 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 25 Aug 2016 13:41:27 -0500 Subject: [PATCH 119/169] Fix path of slurm_local_submit_attributes.sh --- src/scripts/slurm_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 9e67467c..13331734 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -52,7 +52,7 @@ cat > $bls_tmp_file << end_of_preamble end_of_preamble #local batch system-specific file output must be added to the submit file -bls_local_submit_attributes_file=${blah_bin_directory}/slurm_local_submit_attributes.sh +bls_local_submit_attributes_file=${blah_libexec_directory}/slurm_local_submit_attributes.sh if [ "x$bls_opt_req_mem" != "x" ] then From e957b97d646427b620328cc5ec425d883ae96f75 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 25 Aug 2016 11:14:09 -0500 Subject: [PATCH 120/169] Fix issues with qstat() (SOFTWARE-2358) --- src/scripts/pbs_status.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index a5975253..50d8fac1 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -225,31 +225,30 @@ def qstat(jobid=""): Returns a python dictionary with the job info. """ - qstat = get_qstat_location() - command = (qstat, '--version') + qstat_bin = get_qstat_location() + command = (qstat_bin, '--version') qstat_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) qstat_version, _ = qstat_process.communicate() starttime = time.time() log("Starting qstat.") if re.search(r'PBSPro', qstat_version): - child_stdout = os.popen("%s -f %s" % (qstat, jobid)) # -1 conflicts with -f in PBS Pro + command = (qstat_bin, '-f', jobid) # -1 conflicts with -f in PBS Pro else: - child_stdout = os.popen("%s -f -1 %s" % (qstat, jobid)) - result = parse_qstat_fd(child_stdout) - exit_status = child_stdout.close() + command = (qstat_bin, '-f', '-1', jobid) + qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + qstat_out, _ = qstat_proc.communicate() + + result = parse_qstat_fd(qstat_out) log("Finished qstat (time=%f)." 
% (time.time()-starttime))
 
-    if exit_status:
-        exit_code = 0
-        if os.WIFEXITED(exit_status):
-            exit_code = os.WEXITSTATUS(exit_status)
-        if exit_code == 153 or exit_code == 35: # Completed
-            result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}}
-        elif exit_code == 271: # Removed
-            result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}}
-        else:
-            raise Exception("qstat failed with exit code %s" % str(exit_status))
-
+
+    if qstat_proc.returncode in [35, 153]: # Completed or no longer in queue (presumably completed successfully)
+        result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}}
+    elif qstat_proc.returncode == 271: # Removed
+        result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}}
+    elif qstat_proc.returncode != 0:
+        raise Exception("qstat failed with exit code %s" % str(qstat_proc.returncode))
+
     # If the job has completed...
     if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'):
         # Get the finished job stats and update the result

From ceb01bc37618fb7475cc61520df174c3889b4bf5 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Fri, 26 Aug 2016 13:31:13 -0500
Subject: [PATCH 121/169] Add slurm_local_submit_attributes.sh to makefile lists
---
 src/CMakeLists.txt | 1 +
 src/scripts/Makefile.am | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bec5df47..c83f19c6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -138,6 +138,7 @@ set(blah_scripts
     scripts/sge_local_submit_attributes.sh
     scripts/slurm_cancel.sh scripts/slurm_resume.sh scripts/slurm_status.sh
     scripts/slurm_hold.sh scripts/slurm_submit.sh
+    scripts/slurm_local_submit_attributes.sh
     scripts/pbs_status.py
     )

diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am
index c4b0a0be..2edcffc3 100644
--- a/src/scripts/Makefile.am
+++ b/src/scripts/Makefile.am
@@ -38,7 +38,7 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \
 	sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \
 	sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \
 	slurm_cancel.sh slurm_hold.sh slurm_resume.sh slurm_status.sh \
-	slurm_submit.sh \
+	slurm_submit.sh slurm_local_submit_attributes.sh \
 	pbs_status.py \
 	slurm_status.py

From e495d6cab6c74bf233f48936af784ad272271969 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Fri, 9 Sep 2016 12:52:20 -0500
Subject: [PATCH 122/169] Fix qstat parsing errors that caused issues with caching introduced by e957b97
---
 src/scripts/pbs_status.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index 50d8fac1..f47b25cf 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -239,7 +239,7 @@ def qstat(jobid=""):
     qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     qstat_out, _ = qstat_proc.communicate()
 
-    result = parse_qstat_fd(qstat_out)
+    result = parse_qstat(qstat_out)
     log("Finished qstat (time=%f)." 
% (time.time()-starttime)) if qstat_proc.returncode in [35, 153]: # Completed or no longer in queue (presumably completed successfully) @@ -376,15 +376,15 @@ def get_qstat_location(): exit_status_re = re.compile("\s*[Ee]xit_status = (-?[0-9]+)") status_mapping = {"Q": 1, "R": 2, "E": 2, "F": 4, "C": 4, "H": 5} -def parse_qstat_fd(fd): +def parse_qstat(output): """ - Parse the stdout fd of "qstat -f" into a python dictionary containing + Parse the stdout of "qstat -f" into a python dictionary containing the information we need. """ job_info = {} cur_job_id = None cur_job_info = {} - for line in fd: + for line in output.split('\n'): line = line.strip() m = job_id_re.match(line) if m: From 3545be8b05698126331bdb90cdc6efe30e24aa0d Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 19 Sep 2016 13:31:41 -0500 Subject: [PATCH 123/169] Refactor scontrol calls to use subprocess (SOFTWARE-2450) --- src/scripts/slurm_status.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index 6ea9831d..1594425c 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -229,27 +229,26 @@ def call_scontrol(jobid=""): starttime = time.time() log("Starting scontrol.") - child_stdout = os.popen("%s show job %s" % (scontrol, jobid)) - result = parse_scontrol_fd(child_stdout) - exit_status = child_stdout.close() + command = (scontrol, 'show', 'job', jobid) + scontrol_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + scontrol_out, _ = scontrol_proc.communicate() + + result = parse_scontrol(scontrol_out) log("Finished scontrol (time=%f)." % (time.time()-starttime)) - if exit_status: - exit_code = 0 - if os.WIFEXITED(exit_status): - exit_code = os.WEXITSTATUS(exit_status) - if exit_code == 1: # Completed - result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} - elif exit_code == 271: # Removed - result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} - else: - raise Exception("scontrol failed with exit code %s" % str(exit_status)) - + + if scontrol_proc.returncode == 1: # Completed + result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} + elif scontrol_proc.returncode == 271: # Removed + result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} + elif scontrol_proc.returncode != 0: + raise Exception("scontrol failed with exit code %s" % str(scontrol_proc.returncode)) + # If the job has completed... if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'): # Get the finished job stats and update the result finished_job_stats = get_finished_job_stats(jobid) result[jobid].update(finished_job_stats) - + return result @@ -349,15 +348,15 @@ def get_slurm_location(program): exit_status_re = re.compile(".* ExitCode=(-?[0-9]+:[0-9]+)") status_mapping = {"BOOT_FAIL": 4, "CANCELLED": 3, "COMPLETED": 4, "CONFIGURING": 1, "COMPLETING": 2, "FAILED": 4, "NODE_FAIL": 4, "PENDING": 1, "PREEMPTED": 4, "RUNNING": 2, "SPECIAL_EXIT": 4, "STOPPED": 2, "SUSPENDED": 2} -def parse_scontrol_fd(fd): +def parse_scontrol(output): """ - Parse the stdout fd of "scontrol show job" into a python dictionary + Parse the stdout of "scontrol show job" into a python dictionary containing the information we need. 
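    (Reader's note: scontrol prints ExitCode as a return:signal pair, e.g.
    ExitCode=0:0, and only the return half survives into the parsed dictionary,
    as the split(":") handling elsewhere in this file shows.)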
""" job_info = {} cur_job_id = None cur_job_info = {} - for line in fd: + for line in output.split('\n'): line = line.strip() m = job_id_re.match(line) if m: From 012c0fbcf1990d4002347e0207dfd765917c3cdc Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 12 Oct 2016 15:03:06 -0500 Subject: [PATCH 124/169] Fix segfault when using glexec and disabling limited proxies (SOFTWARE-2475) --- src/server.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/server.c b/src/server.c index 0b78df31..02a018ec 100644 --- a/src/server.c +++ b/src/server.c @@ -2012,21 +2012,25 @@ cmd_renew_proxy(void *args) switch(jobStatus) { case 1: /* job queued: copy the proxy locally */ - if ((!use_mapping) && (!disable_limited_proxy) - ){ + if (!use_mapping) + { + if (!disable_limited_proxy) + { limit_proxy(proxyFileName, old_proxy, NULL); - resultLine = make_message("%s 0 Proxy\\ renewed", reqId); + } + resultLine = make_message("%s 0 Proxy\\ renewed", reqId); } else { exe_command.delegation_type = atoi(argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGTYPE]); exe_command.delegation_cred = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGCRED]; - if ((use_glexec) && (disable_limited_proxy)) + if ((use_glexec) || (disable_limited_proxy)) { exe_command.source_proxy = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_SRCPROXY]; } else { - limited_proxy_name = limit_proxy(proxyFileName, NULL, NULL); - exe_command.source_proxy = limited_proxy_name; + limited_proxy_name = limit_proxy(proxyFileName, NULL, NULL); + exe_command.source_proxy = limited_proxy_name; + } exe_command.dest_proxy = old_proxy; if (exe_command.source_proxy == NULL) @@ -2139,9 +2143,16 @@ cmd_send_proxy_to_worker_node(void *args) if (workernode != NULL && strcmp(workernode, "")) { - if((!use_glexec) && (!disable_limited_proxy)) + if (!use_glexec) { + if (disable_limited_proxy) + { + proxyFileNameNew = strdup(proxyFileName); + } + else + { proxyFileNameNew = limit_proxy(proxyFileName, NULL, NULL); + } } else proxyFileNameNew = strdup(argv[CMD_SEND_PROXY_TO_WORKER_NODE_ARGS + MEXEC_PARAM_SRCPROXY + 1]); From 745cae86832cfb2c8af6f75d0c2e8ebf58cd20d4 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 26 Oct 2016 17:56:34 -0500 Subject: [PATCH 125/169] Fix broken PBS/slurm job queries when no jobid was specified When using subprocess.Popen, empty strings are respected as arguments causing some versions of qstat (UFL's 4.2.9) to fail when job IDs were not specified in the qstat call --- src/scripts/pbs_status.py | 10 +++++----- src/scripts/slurm_status.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index f47b25cf..a48d33b9 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -232,13 +232,13 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") - if re.search(r'PBSPro', qstat_version): - command = (qstat_bin, '-f', jobid) # -1 conflicts with -f in PBS Pro - else: - command = (qstat_bin, '-f', '-1', jobid) + command = (qstat_bin, '-f') + if not re.search(r'PBSPro', qstat_version): + command += ('-1',) # -1 conflicts with -f in PBS Pro + if jobid: + command += (jobid,) qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) qstat_out, _ = qstat_proc.communicate() - result = parse_qstat(qstat_out) log("Finished qstat (time=%f)." 
% (time.time()-starttime))
diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
index 1594425c..9120f7e4 100644
--- a/src/scripts/slurm_status.py
+++ b/src/scripts/slurm_status.py
@@ -229,7 +229,9 @@ def call_scontrol(jobid=""):
 
     starttime = time.time()
     log("Starting scontrol.")
-    command = (scontrol, 'show', 'job', jobid)
+    command = (scontrol, 'show', 'job')
+    if jobid:
+        command += (jobid,)
     scontrol_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
     scontrol_out, _ = scontrol_proc.communicate()
 

From 1e1e1afb88c341fcf0f46aef2b9749a8ce3544d8 Mon Sep 17 00:00:00 2001
From: Derek Weitzel
Date: Thu, 2 Feb 2017 10:45:59 -0600
Subject: [PATCH 126/169] Put all #SBATCH commands together

Slurm ignores #SBATCH directives that appear after normal commands. We
allow admins to include local attributes, which can also include
commands such as loading modules... all of which has to come after the
#SBATCH directives.

---
 src/scripts/slurm_submit.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh
index 0f0d97ae..8e78ad2c 100755
--- a/src/scripts/slurm_submit.sh
+++ b/src/scripts/slurm_submit.sh
@@ -63,14 +63,15 @@ then
     echo "#SBATCH --mem=${bls_opt_req_mem}" >> $bls_tmp_file
 fi
 
-bls_set_up_local_and_extra_args
-
 # Simple support for multi-cpu attributes
 if [[ $bls_opt_mpinodes -gt 1 ]] ; then
     echo "#SBATCH --nodes=1" >> $bls_tmp_file
     echo "#SBATCH --ntasks=$bls_opt_mpinodes" >> $bls_tmp_file
 fi
 
+# Do the local and extra args after all #SBATCH commands, otherwise slurm ignores anything
+# after a non-#SBATCH command
+bls_set_up_local_and_extra_args
 
 # Input and output sandbox setup.
 # Assume all filesystems are shared.

From 1be2d93870ce3d8036c931ffdc4d292171f00bb2 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Thu, 9 Feb 2017 15:11:37 -0600
Subject: [PATCH 127/169] Blahp python scripts should ignore optional '-w' argument (SOFTWARE-2603)

---
 src/scripts/pbs_status.py   | 9 +++++++--
 src/scripts/slurm_status.py | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index a48d33b9..12b57b0b 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -517,10 +517,15 @@ def check_cache(jobid, recurse=True):
 
 def main():
     initLog()
-    if len(sys.argv) != 2:
+    # Accept the optional -w argument, but ignore it
+    if len(sys.argv) == 2:
+        jobid_arg = sys.argv[1]
+    elif len(sys.argv) == 3 and sys.argv[1] == "-w":
+        jobid_arg = sys.argv[2]
+    else:
        print "1Usage: pbs_status.sh pbs//"
        return 1
-    jobid = sys.argv[1].split("/")[-1].split(".")[0]
+    jobid = jobid_arg.split("/")[-1].split(".")[0]
     log("Checking cache for jobid %s" % jobid)
     cache_contents = None
     try:
diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
index 9120f7e4..df93d1d1 100644
--- a/src/scripts/slurm_status.py
+++ b/src/scripts/slurm_status.py
@@ -476,10 +476,15 @@ def check_cache(jobid, recurse=True):
 
 def main():
     initLog()
-    if len(sys.argv) != 2:
+    # Accept the optional -w argument, but ignore it
+    if len(sys.argv) == 2:
+        jobid_arg = sys.argv[1]
+    elif len(sys.argv) == 3 and sys.argv[1] == "-w":
+        jobid_arg = sys.argv[2]
+    else:
        print "1Usage: slurm_status.py slurm//"
        return 1
-    jobid = sys.argv[1].split("/")[-1].split(".")[0]
+    jobid = jobid_arg.split("/")[-1].split(".")[0]
     log("Checking cache for jobid %s" % jobid)
     cache_contents = None
     try:

From c2d4b6adc2a3b70d3a2835d49aff68e4af633940 Mon Sep 17 00:00:00 2001
From: Brian
Lin Date: Wed, 15 Feb 2017 18:37:53 -0600 Subject: [PATCH 128/169] Fail gracefully when parsing unexpected sacct output (SOFTWARE-2604) --- src/scripts/pbs_status.py | 28 ++++++++++++++-------------- src/scripts/slurm_status.py | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 12b57b0b..3eed8207 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -282,9 +282,8 @@ def is_exe(fpath): return None def convert_cpu_to_seconds(cpu_string): - import re - h,m,s = re.split(':',cpu_string) - return int(h) * 3600 + int(m) * 60 + int(s) + hrs, mins, secs = re.split(':', cpu_string) + return int(hrs) * 3600 + int(mins) * 60 + int(secs) _cluster_type_cache = None def get_finished_job_stats(jobid): @@ -330,24 +329,25 @@ def get_finished_job_stats(jobid): except Exception, e: log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict - + + sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \ + convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]), + 'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')), + 'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])} # Slurm can return more than 1 row, for some odd reason. # so sum up relevant values for row in reader: - if row["AveCPU"] is not "": - return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) - if row["MaxRSS"] is not "": - # Remove the trailing 'K' - return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) - if row["ExitCode"] is not "": - return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) - - # PBS completion + for attr, func in sacct_parser.items(): + try: + return_dict[attr] = func(return_dict[attr], row) + except (ValueError, KeyError), exc: + log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc)) + + # PBS completion elif _cluster_type_cache == "pbs": pass return return_dict - _qstat_location_cache = None def get_qstat_location(): diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index df93d1d1..a73e8b00 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -278,9 +278,8 @@ def is_exe(fpath): return None def convert_cpu_to_seconds(cpu_string): - import re - h,m,s = re.split(':',cpu_string) - return int(h) * 3600 + int(m) * 60 + int(s) + hrs, mins, secs = re.split(':', cpu_string) + return int(hrs) * 3600 + int(mins) * 60 + int(secs) def get_finished_job_stats(jobid): """ @@ -308,20 +307,22 @@ def get_finished_job_stats(jobid): except Exception, e: log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict - + + sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \ + convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]), + 'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')), + 'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])} # Slurm can return more than 1 row, for some odd reason. 
# so sum up relevant values for row in reader: - if row["AveCPU"] is not "": - return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) - if row["MaxRSS"] is not "": - # Remove the trailing 'K' - return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) - if row["ExitCode"] is not "": - return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + for attr, func in sacct_parser.items(): + try: + return_dict[attr] = func(return_dict[attr], row) + except (ValueError, KeyError), exc: + log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc)) return return_dict - + _slurm_location_cache = None def get_slurm_location(program): From 915f0ebc9edb88a0eaa2ebfbbbfada4da6bd62a0 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 7 Jun 2017 11:46:10 -0500 Subject: [PATCH 129/169] Fix setting the number of CPUs per job ntasks only specifies a maximum number of tasks to be given by the slurm controller (default 1 task/node). https://slurm.schedmd.com/sbatch.html https://ticket.opensciencegrid.org/34033 --- src/scripts/slurm_submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 8e78ad2c..ae028523 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -66,7 +66,7 @@ fi # Simple support for multi-cpu attributes if [[ $bls_opt_mpinodes -gt 1 ]] ; then echo "#SBATCH --nodes=1" >> $bls_tmp_file - echo "#SBATCH --ntasks=$bls_opt_mpinodes" >> $bls_tmp_file + echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file fi # Do the local and extra args after all #SBATCH commands, otherwise slurm ignores anything From 2340c75364cd149e306f90a5514039043fdd0c96 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 7 Jun 2017 13:16:18 -0500 Subject: [PATCH 130/169] Hardcode the number of tasks instead of relying on the default --- src/scripts/slurm_submit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index ae028523..f02f31f2 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -66,6 +66,7 @@ fi # Simple support for multi-cpu attributes if [[ $bls_opt_mpinodes -gt 1 ]] ; then echo "#SBATCH --nodes=1" >> $bls_tmp_file + echo "#SBATCH --ntasks=1" >> $bls_tmp_file echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file fi From 6a71da2e32ab07474839a3ef3432d45315346be4 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 21 Mar 2017 12:49:18 -0500 Subject: [PATCH 131/169] Add config to differentiate between PBS flavors (SOFTWARE-2628) --- config/blah.config.template | 2 ++ src/scripts/blah.py | 18 ++++++++++++++++++ src/scripts/pbs_status.py | 18 ++++++++++-------- src/scripts/pbs_submit.sh | 8 +++----- 4 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 src/scripts/blah.py diff --git a/config/blah.config.template b/config/blah.config.template index 909ba5b8..2e0ec6de 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -99,6 +99,8 @@ pbs_fallback=no #Set to 'yes' to request pvmem when submitting jobs pbs_set_pvmem=no +#Set to 'yes' if you are running PBS Pro +pbs_pro=no ##LSF common variables diff --git a/src/scripts/blah.py b/src/scripts/blah.py new file mode 100644 index 00000000..5c5a3e41 --- /dev/null +++ b/src/scripts/blah.py @@ -0,0 +1,18 @@ +"""Common functions for BLAH python scripts""" + +import os +import subprocess + +def load_env(config_dir): + """Load blah.config into the environment""" + load_config_path = 
os.path.join(config_dir, 'blah_load_config.sh') + command = ['bash', '-c', 'source %s && env' % load_config_path] + try: + config_proc = subprocess.Popen(command, stdout=subprocess.PIPE) + config_out, _ = config_proc.communicate() + + for line in config_out.splitlines(): + (key, _, val) = line.partition('=') + os.environ[key] = val + except IOError: + pass diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 3eed8207..34acf673 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -43,6 +43,8 @@ import pickle import csv +import blah + cache_timeout = 60 launchtime = time.time() @@ -226,14 +228,11 @@ def qstat(jobid=""): Returns a python dictionary with the job info. """ qstat_bin = get_qstat_location() - command = (qstat_bin, '--version') - qstat_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - qstat_version, _ = qstat_process.communicate() starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if not re.search(r'PBSPro', qstat_version): + if os.environ.get('pbs_pro').lower() != 'yes': command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: command += (jobid,) @@ -357,10 +356,9 @@ def get_qstat_location(): global _qstat_location_cache if _qstat_location_cache != None: return _qstat_location_cache - load_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'blah_load_config.sh') - if os.path.exists(load_config_path) and os.access(load_config_path, os.R_OK): - cmd = 'source %s && echo "$pbs_binpath/qstat"' % load_config_path - else: + try: + cmd = os.path.join(os.environ['pbs_binpath'], 'qstat') + except KeyError: cmd = 'which qstat' child_stdout = os.popen(cmd) output = child_stdout.read() @@ -526,6 +524,10 @@ def main(): print "1Usage: pbs_status.sh pbs//" return 1 jobid = jobid_arg.split("/")[-1].split(".")[0] + + config_dir = os.path.dirname(os.path.abspath(__file__)) + blah.load_env(config_dir) + log("Checking cache for jobid %s" % jobid) cache_contents = None try: diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 12bdabc8..c04df344 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -117,8 +117,6 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh -${pbs_binpath}/qstat --version 2>&1 | grep PBSPro > /dev/null 2>&1 -is_pbs_pro=$? # Begin building the select statement: select=x where x is the number of 'chunks' # to request. Chunk requests should precede any resource requests (resource # requests are order independent). 
An example from the PBS Pro manual: @@ -137,7 +135,7 @@ if [ "x$bls_opt_req_mem" != "x" ]; then fi # Total amount of memory allocated to the job pbs_select="$pbs_select:mem=${bls_opt_req_mem}mb" - if [ "$is_pbs_pro" != 0 ]; then + if [ "x$pbs_pro" != "xyes" ]; then echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file fi fi @@ -149,7 +147,7 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "$is_pbs_pro" == 0 ]; then +if [ "x$pbs_pro" == "xyes" ]; then pbs_select="$pbs_select:ncpus=$bls_opt_smpgranularity" else if [ "x$bls_opt_wholenodes" == "xyes" ]; then @@ -209,7 +207,7 @@ else [ -z "$bls_fl_subst_and_accumulate_result" ] || echo "#PBS -W stageout=\\'$bls_fl_subst_and_accumulate_result\\'" >> $bls_tmp_file fi -if [ "$is_pbs_pro" == 0 ]; then +if [ "x$pbs_pro" == "xyes" ]; then echo $pbs_select >> $bls_tmp_file fi From a7e65523fd7d9168728f631fdab65c1846124b57 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 21 Mar 2017 14:01:32 -0500 Subject: [PATCH 132/169] Load blahp environment for slurm --- src/scripts/slurm_status.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index a73e8b00..ec9707fc 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -43,6 +43,8 @@ import pickle import csv +import blah + cache_timeout = 60 launchtime = time.time() @@ -332,10 +334,9 @@ def get_slurm_location(program): global _slurm_location_cache if _slurm_location_cache != None: return os.path.join(_slurm_location_cache, program) - load_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'blah_load_config.sh') - if os.path.exists(load_config_path) and os.access(load_config_path, os.R_OK): - cmd = 'source %s && echo "${slurm_binpath:-/usr/bin}/%s"' % (load_config_path, program) - else: + try: + cmd = os.path.join(os.environ['slurm_binpath'], program) + except KeyError: cmd = 'which %s' % program child_stdout = os.popen(cmd) output = child_stdout.read() @@ -486,6 +487,10 @@ def main(): print "1Usage: slurm_status.py slurm//" return 1 jobid = jobid_arg.split("/")[-1].split(".")[0] + + config_dir = os.path.dirname(os.path.abspath(__file__)) + blah.load_env(config_dir) + log("Checking cache for jobid %s" % jobid) cache_contents = None try: From 497fb8400077288b33c581eedaa90ca5e4324ba5 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 12 Jul 2017 10:58:23 -0500 Subject: [PATCH 133/169] Add location of blah.py module to PYTHONPATH --- src/CMakeLists.txt | 2 ++ src/scripts/Makefile.am | 1 + src/scripts/__init__.py | 0 src/scripts/pbs_status.py | 1 + src/scripts/slurm_status.py | 1 + 5 files changed, 5 insertions(+) create mode 100644 src/scripts/__init__.py diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c83f19c6..e4bc2ccb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -139,7 +139,9 @@ set(blah_scripts scripts/slurm_cancel.sh scripts/slurm_resume.sh scripts/slurm_status.sh scripts/slurm_hold.sh scripts/slurm_submit.sh scripts/slurm_local_submit_attributes.sh + scripts/blah.py scripts/__init__.py scripts/pbs_status.py + scripts/slurm_status.py ) install(FILES diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 2edcffc3..6f7db77b 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -39,6 +39,7 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ sge_hold.sh 
sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \
	slurm_cancel.sh slurm_hold.sh slurm_resume.sh slurm_status.sh \
	slurm_submit.sh slurm_local_submit_attributes.sh \
+	blah.py __init__.py \
	pbs_status.py \
	slurm_status.py
diff --git a/src/scripts/__init__.py b/src/scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index 34acf673..57a2d8d3 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -43,6 +43,7 @@ import pickle
 import csv
 
+sys.path.insert(0, os.path.dirname(__file__))
 import blah
 
 cache_timeout = 60
diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
index ec9707fc..c3c1a580 100644
--- a/src/scripts/slurm_status.py
+++ b/src/scripts/slurm_status.py
@@ -43,6 +43,7 @@ import pickle
 import csv
 
+sys.path.insert(0, os.path.dirname(__file__))
 import blah
 
 cache_timeout = 60

From 48485cda8c4265ccbd0994e16eab5431f0c65ac1 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Wed, 12 Jul 2017 16:03:14 -0500
Subject: [PATCH 134/169] Ensure that we get the absolute *status.py path

---
 src/scripts/pbs_status.py   | 2 +-
 src/scripts/slurm_status.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index 57a2d8d3..8fd30cf6 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -43,7 +43,7 @@ import pickle
 import csv
 
-sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import blah
 
 cache_timeout = 60
diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
index c3c1a580..16b1f3bb 100644
--- a/src/scripts/slurm_status.py
+++ b/src/scripts/slurm_status.py
@@ -43,7 +43,7 @@ import pickle
 import csv
 
-sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import blah
 
 cache_timeout = 60

From 87286a79173e2c4613425b5e8cd22bcfc03e74d3 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Wed, 12 Jul 2017 18:49:21 -0500
Subject: [PATCH 135/169] Properly load blah.config (SOFTWARE-2628)

Previous method didn't work because the options in blah.config weren't
ever exported so they never made it to the environment

---
 src/scripts/blah.py         | 34 +++++++++++++++++++++-------------
 src/scripts/pbs_status.py   |  7 ++++---
 src/scripts/slurm_status.py |  2 +-
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/src/scripts/blah.py b/src/scripts/blah.py
index 5c5a3e41..d0f9bea4 100644
--- a/src/scripts/blah.py
+++ b/src/scripts/blah.py
@@ -1,18 +1,26 @@
 """Common functions for BLAH python scripts"""
 
 import os
-import subprocess
+from ConfigParser import RawConfigParser
+from io import StringIO
 
-def load_env(config_dir):
-    """Load blah.config into the environment"""
-    load_config_path = os.path.join(config_dir, 'blah_load_config.sh')
-    command = ['bash', '-c', 'source %s && env' % load_config_path]
-    try:
-        config_proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-        config_out, _ = config_proc.communicate()
+class BlahConfigParser(RawConfigParser, object):
+
+    def __init__(self, path='/etc/blah.config'):
+        # RawConfigParser requires ini-style [section headers] but since
+        # blah.config is also used as a shell script we need to fake one
+        self.header = 'blahp'
+        with open(path) as f:
+            config = f.read()
+        vfile = StringIO(u'[%s]\n%s' % (self.header, config))
+
+        super(BlahConfigParser, self).__init__()
+        # TODO: readfp() is replaced by read_file() in Python 3.2+
self.readfp(vfile) + + def items(self): + return super(BlahConfigParser, self).items(self.header) + + def get(self, option): + return super(BlahConfigParser, self).get(self.header, option) - for line in config_out.splitlines(): - (key, _, val) = line.partition('=') - os.environ[key] = val - except IOError: - pass diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 8fd30cf6..044904bc 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -233,7 +233,7 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if os.environ.get('pbs_pro').lower() != 'yes': + if config.get('pbs_pro').lower() != 'yes': command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: command += (jobid,) @@ -358,7 +358,7 @@ def get_qstat_location(): if _qstat_location_cache != None: return _qstat_location_cache try: - cmd = os.path.join(os.environ['pbs_binpath'], 'qstat') + cmd = os.path.join(config.get('pbs_binpath'), 'qstat') except KeyError: cmd = 'which qstat' child_stdout = os.popen(cmd) @@ -527,7 +527,8 @@ def main(): jobid = jobid_arg.split("/")[-1].split(".")[0] config_dir = os.path.dirname(os.path.abspath(__file__)) - blah.load_env(config_dir) + global config + config = blah.BlahConfigParser() log("Checking cache for jobid %s" % jobid) cache_contents = None diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index 16b1f3bb..8fb9e553 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -336,7 +336,7 @@ def get_slurm_location(program): if _slurm_location_cache != None: return os.path.join(_slurm_location_cache, program) try: - cmd = os.path.join(os.environ['slurm_binpath'], program) + cmd = os.path.join(config.get('slurm_binpath'), program) except KeyError: cmd = 'which %s' % program child_stdout = os.popen(cmd) From 273f7bc3c6bb7f5b90e6a0908d8840797f1f764e Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 13 Jul 2017 14:00:42 -0500 Subject: [PATCH 136/169] Fix bugs in constructing the path to pbs/slurm query commands --- src/scripts/blah.py | 5 ++++- src/scripts/pbs_status.py | 14 ++++++++------ src/scripts/slurm_status.py | 13 +++++++------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/scripts/blah.py b/src/scripts/blah.py index d0f9bea4..679add99 100644 --- a/src/scripts/blah.py +++ b/src/scripts/blah.py @@ -22,5 +22,8 @@ def items(self): return super(BlahConfigParser, self).items(self.header) def get(self, option): - return super(BlahConfigParser, self).get(self.header, option) + # ConfigParser happily includes quotes in value strings, which we + # happily allow in /etc/blah.config. This causes failures when joining + # paths, for example. 
+ return super(BlahConfigParser, self).get(self.header, option).strip('"\'') diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 044904bc..01664f70 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -357,15 +357,17 @@ def get_qstat_location(): global _qstat_location_cache if _qstat_location_cache != None: return _qstat_location_cache + try: - cmd = os.path.join(config.get('pbs_binpath'), 'qstat') + location = os.path.join(config.get('pbs_binpath'), 'qstat') except KeyError: cmd = 'which qstat' - child_stdout = os.popen(cmd) - output = child_stdout.read() - location = output.split("\n")[0].strip() - if child_stdout.close(): - raise Exception("Unable to determine qstat location: %s" % output) + child_stdout = os.popen(cmd) + output = child_stdout.read() + location = output.split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine qstat location: %s" % output) + _qstat_location_cache = location return location diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index 8fb9e553..b462a298 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -336,14 +336,15 @@ def get_slurm_location(program): if _slurm_location_cache != None: return os.path.join(_slurm_location_cache, program) try: - cmd = os.path.join(config.get('slurm_binpath'), program) + location = os.path.join(config.get('slurm_binpath'), program) except KeyError: cmd = 'which %s' % program - child_stdout = os.popen(cmd) - output = child_stdout.read() - location = output.split("\n")[0].strip() - if child_stdout.close(): - raise Exception("Unable to determine scontrol location: %s" % output) + child_stdout = os.popen(cmd) + output = child_stdout.read() + location = output.split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine scontrol location: %s" % output) + _slurm_location_cache = os.path.dirname(location) return location From 9bfeb4f46e00dee8ea8086ff7dabdb6787bdcdf8 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 21 Jul 2017 17:18:30 -0500 Subject: [PATCH 137/169] Fix bug that broke shell parsing of *_binpath Because the blahp is insane and the defaults are shell commands --- src/scripts/blah.py | 5 +++++ src/scripts/pbs_status.py | 24 +++++++++++------------- src/scripts/slurm_status.py | 28 ++++++++++++++-------------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/scripts/blah.py b/src/scripts/blah.py index 679add99..57c05b4f 100644 --- a/src/scripts/blah.py +++ b/src/scripts/blah.py @@ -27,3 +27,8 @@ def get(self, option): # paths, for example. 
return super(BlahConfigParser, self).get(self.header, option).strip('"\'') + def set(self, option, value): + return super(BlahConfigParser, self).set(self.header, option, value) + + def has_option(self, option): + return super(BlahConfigParser, self).has_option(self.header, option) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 01664f70..acc646e1 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -358,18 +358,17 @@ def get_qstat_location(): if _qstat_location_cache != None: return _qstat_location_cache - try: - location = os.path.join(config.get('pbs_binpath'), 'qstat') - except KeyError: - cmd = 'which qstat' - child_stdout = os.popen(cmd) - output = child_stdout.read() - location = output.split("\n")[0].strip() - if child_stdout.close(): - raise Exception("Unable to determine qstat location: %s" % output) - - _qstat_location_cache = location - return location + if not (config.has_option('pbs_binpath') and config.get('pbs_binpath')): + config.set('pbs_binpath', '/usr/bin') + cmd = 'echo "%s/%s"' % (config.get('pbs_binpath'), 'qstat') + + child_stdout = os.popen(cmd) + output = child_stdout.read().split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine qstat location: %s" % output) + + _qstat_location_cache = output + return output job_id_re = re.compile("\s*Job Id:\s([0-9]+)([\w\-\/.]*)") exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") @@ -528,7 +527,6 @@ def main(): return 1 jobid = jobid_arg.split("/")[-1].split(".")[0] - config_dir = os.path.dirname(os.path.abspath(__file__)) global config config = blah.BlahConfigParser() diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index b462a298..bee91452 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -335,18 +335,18 @@ def get_slurm_location(program): global _slurm_location_cache if _slurm_location_cache != None: return os.path.join(_slurm_location_cache, program) - try: - location = os.path.join(config.get('slurm_binpath'), program) - except KeyError: - cmd = 'which %s' % program - child_stdout = os.popen(cmd) - output = child_stdout.read() - location = output.split("\n")[0].strip() - if child_stdout.close(): - raise Exception("Unable to determine scontrol location: %s" % output) - - _slurm_location_cache = os.path.dirname(location) - return location + + if not (config.has_option('slurm_binpath') and config.get('slurm_binpath')): + config.set('slurm_binpath', '/usr/bin') + cmd = 'echo "%s/%s"' % (config.get('slurm_binpath'), 'scontrol') + + child_stdout = os.popen(cmd) + output = child_stdout.read().split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine scontrol location: %s" % output) + + _slurm_location_cache = os.path.dirname(output) + return output job_id_re = re.compile("JobId=([0-9]+) .*") exec_host_re = re.compile("\s*BatchHost=([\w\-.]+)") @@ -490,8 +490,8 @@ def main(): return 1 jobid = jobid_arg.split("/")[-1].split(".")[0] - config_dir = os.path.dirname(os.path.abspath(__file__)) - blah.load_env(config_dir) + global config + config = blah.BlahConfigParser() log("Checking cache for jobid %s" % jobid) cache_contents = None From 429a2d5ae8c9fb7cdc15b189a77c54e8c82f10c4 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Sat, 22 Jul 2017 10:44:08 -0500 Subject: [PATCH 138/169] Set default bin paths to /usr/bin Admins should configure (manually or via osg-configure) this to a path and we shouldn't support shell evaluation in config: 1. Avoid $PATH tomfoolery 2. 
Why add the overhead of `which`? --- config/blah.config.template | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/blah.config.template b/config/blah.config.template index 2e0ec6de..e9962d52 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -80,7 +80,7 @@ blah_require_proxy_on_submit= #Path where PBS executables are located # NOTE: this path is computed many times; I worry about the overhead here. -BB, 12-13-2012 -pbs_binpath=`which qsub 2>/dev/null|sed 's|/[^/]*$||'` +pbs_binpath=/usr/bin #Path where the PBS logs are located ($pbs_spoolpath/server_logs) #pbs_spoolpath= @@ -105,7 +105,7 @@ pbs_pro=no ##LSF common variables #Path where LSF executables are located -lsf_binpath=`which bsub 2>/dev/null|sed 's|/[^/]*$||'` +lsf_binpath=/usr/bin #Path where the LSF conf file is located ($lsf_confpath/lsf.conf) lsf_confpath= @@ -285,7 +285,7 @@ tracejob_max_output= ##Condor #condor bin location -condor_binpath=`which condor_submit 2>/dev/null|sed 's|/[^/]*$||'` +condor_binpath=/usr/bin #path to condor_config #export CONDOR_CONFIG="/etc/condor/condor_config" From 517a63081e7feaccab7059846234f7d3e5206f0f Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 23 Aug 2017 15:39:13 -0500 Subject: [PATCH 139/169] Fix parsing of time fields for slurm jobs Time fields in the output of slurm's sacct have optional day and hour fields. https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=6380 --- src/scripts/slurm_status.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index bee91452..dc9acd59 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -281,8 +281,16 @@ def is_exe(fpath): return None def convert_cpu_to_seconds(cpu_string): - hrs, mins, secs = re.split(':', cpu_string) - return int(hrs) * 3600 + int(mins) * 60 + int(secs) + # The time fields in sacct's output have this format: + # [DD-[hh:]]mm:ss + # Convert that to just seconds. 
+ elem = re.split('[-:]', cpu_string) + secs = int(elem[-1]) + int(elem[-2]) * 60 + if len(elem) > 2: + secs += int(elem[-3]) * 3600 + if len(elem) > 3: + secs += int(elem[-4]) * 86400 + return secs def get_finished_job_stats(jobid): """ From ec6f1f4a232da36a15e5ed162875ce65a3cd90e0 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 28 Aug 2017 09:00:02 -0500 Subject: [PATCH 140/169] Fix uncaught exception when config doesn't contain pbs_pro https://ticket.opensciencegrid.org/34702 --- src/scripts/pbs_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index acc646e1..623378d2 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -233,7 +233,7 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if config.get('pbs_pro').lower() != 'yes': + if config.has_option('pbs_pro') and config.get('pbs_pro').lower() != 'yes': command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: command += (jobid,) From 34b3d113a82243e947af99a4811f2866736751d9 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 28 Aug 2017 18:29:56 -0500 Subject: [PATCH 141/169] Set default values in blah.config --- src/scripts/blah.py | 4 ++-- src/scripts/pbs_status.py | 7 +++---- src/scripts/slurm_status.py | 4 +--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/scripts/blah.py b/src/scripts/blah.py index 57c05b4f..b40c2749 100644 --- a/src/scripts/blah.py +++ b/src/scripts/blah.py @@ -6,7 +6,7 @@ class BlahConfigParser(RawConfigParser, object): - def __init__(self, path='/etc/blah.config'): + def __init__(self, path='/etc/blah.config', defaults=None): # RawConfigParser requires ini-style [section headers] but since # blah.config is also used as a shell script we need to fake one self.header = 'blahp' @@ -14,7 +14,7 @@ def __init__(self, path='/etc/blah.config'): config = f.read() vfile = StringIO(u'[%s]\n%s' % (self.header, config)) - super(BlahConfigParser, self).__init__() + super(BlahConfigParser, self).__init__(defaults=defaults) # TODO: readfp() is replaced by read_file() in Python 3.2+ self.readfp(vfile) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 623378d2..9e415fef 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -233,7 +233,7 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if config.has_option('pbs_pro') and config.get('pbs_pro').lower() != 'yes': + if config.get('pbs_pro').lower() != 'yes': command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: command += (jobid,) @@ -358,8 +358,6 @@ def get_qstat_location(): if _qstat_location_cache != None: return _qstat_location_cache - if not (config.has_option('pbs_binpath') and config.get('pbs_binpath')): - config.set('pbs_binpath', '/usr/bin') cmd = 'echo "%s/%s"' % (config.get('pbs_binpath'), 'qstat') child_stdout = os.popen(cmd) @@ -528,7 +526,8 @@ def main(): jobid = jobid_arg.split("/")[-1].split(".")[0] global config - config = blah.BlahConfigParser() + config = blah.BlahConfigParser(defaults={'pbs_pro': 'no', + 'pbs_binpath': '/usr/bin'}) log("Checking cache for jobid %s" % jobid) cache_contents = None diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index dc9acd59..5294a9ec 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -344,8 +344,6 @@ def get_slurm_location(program): if _slurm_location_cache != None: return 
os.path.join(_slurm_location_cache, program) - if not (config.has_option('slurm_binpath') and config.get('slurm_binpath')): - config.set('slurm_binpath', '/usr/bin') cmd = 'echo "%s/%s"' % (config.get('slurm_binpath'), 'scontrol') child_stdout = os.popen(cmd) @@ -499,7 +497,7 @@ def main(): jobid = jobid_arg.split("/")[-1].split(".")[0] global config - config = blah.BlahConfigParser() + config = blah.BlahConfigParser(defaults={'slurm_binpath': '/usr/bin'}) log("Checking cache for jobid %s" % jobid) cache_contents = None From e4a463a02c7488a025b7d4c665aec5a8cb945699 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 16 Oct 2017 17:05:48 -0500 Subject: [PATCH 142/169] Revert c2d4b6a --- src/scripts/pbs_status.py | 23 +++++++++++------------ src/scripts/slurm_status.py | 20 +++++++++----------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 9e415fef..2799ab3c 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -329,25 +329,24 @@ def get_finished_job_stats(jobid): except Exception, e: log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict - - sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \ - convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]), - 'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')), - 'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])} + # Slurm can return more than 1 row, for some odd reason. # so sum up relevant values for row in reader: - for attr, func in sacct_parser.items(): - try: - return_dict[attr] = func(return_dict[attr], row) - except (ValueError, KeyError), exc: - log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc)) - - # PBS completion + if row["AveCPU"] is not "": + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + if row["MaxRSS"] is not "": + # Remove the trailing 'K' + return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + if row["ExitCode"] is not "": + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + + # PBS completion elif _cluster_type_cache == "pbs": pass return return_dict + _qstat_location_cache = None def get_qstat_location(): diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index 5294a9ec..cda9ed48 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -318,22 +318,20 @@ def get_finished_job_stats(jobid): except Exception, e: log("Unable to read in CSV output from sacct: %s" % str(e)) return return_dict - - sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \ - convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]), - 'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')), - 'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])} + # Slurm can return more than 1 row, for some odd reason. 
# so sum up relevant values for row in reader: - for attr, func in sacct_parser.items(): - try: - return_dict[attr] = func(return_dict[attr], row) - except (ValueError, KeyError), exc: - log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc)) + if row["AveCPU"] is not "": + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + if row["MaxRSS"] is not "": + # Remove the trailing 'K' + return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + if row["ExitCode"] is not "": + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) return return_dict - + _slurm_location_cache = None def get_slurm_location(program): From 7cc0816aa3e30050eb42b7d9d06dfc259ebb2671 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Mon, 16 Oct 2017 17:07:15 -0500 Subject: [PATCH 143/169] Fix memory usage parsing for SLURM and PBS (SOFTWARE-2929) --- src/scripts/pbs_status.py | 16 ++++++++++++++-- src/scripts/slurm_status.py | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 2799ab3c..21a34f23 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -336,8 +336,20 @@ def get_finished_job_stats(jobid): if row["AveCPU"] is not "": return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) if row["MaxRSS"] is not "": - # Remove the trailing 'K' - return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + # Remove the trailing [KMGTP] and scale the value appropriately + # Note: We assume that all values will have a suffix, and we + # want the value in kilos. + value = row["MaxRSS"] + factor = 1 + if value[-1] == 'M': + factor = 1024 + elif value[-1] == 'G': + factor = 1024 * 1024 + elif value[-1] == 'T': + factor = 1024 * 1024 * 1024 + elif value[-1] == 'P': + factor = 1024 * 1024 * 1024 * 1024 + return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor if row["ExitCode"] is not "": return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index cda9ed48..98eda196 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -325,8 +325,20 @@ def get_finished_job_stats(jobid): if row["AveCPU"] is not "": return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) if row["MaxRSS"] is not "": - # Remove the trailing 'K' - return_dict["ImageSize"] += int(row["MaxRSS"].replace('K', '')) + # Remove the trailing [KMGTP] and scale the value appropriately + # Note: We assume that all values will have a suffix, and we + # want the value in kilos. 
+ value = row["MaxRSS"] + factor = 1 + if value[-1] == 'M': + factor = 1024 + elif value[-1] == 'G': + factor = 1024 * 1024 + elif value[-1] == 'T': + factor = 1024 * 1024 * 1024 + elif value[-1] == 'P': + factor = 1024 * 1024 * 1024 * 1024 + return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor if row["ExitCode"] is not "": return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) From aa755f0cb7671fb10206660654e24fbb79e09894 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 19 Oct 2017 11:35:07 -0500 Subject: [PATCH 144/169] Log any parsing exceptions --- src/scripts/pbs_status.py | 40 ++++++++++++++++++++++++------------- src/scripts/slurm_status.py | 39 +++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 21a34f23..de5a2d1a 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -334,25 +334,37 @@ def get_finished_job_stats(jobid): # so sum up relevant values for row in reader: if row["AveCPU"] is not "": - return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + try: + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + except: + log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"])) + raise if row["MaxRSS"] is not "": # Remove the trailing [KMGTP] and scale the value appropriately # Note: We assume that all values will have a suffix, and we # want the value in kilos. - value = row["MaxRSS"] - factor = 1 - if value[-1] == 'M': - factor = 1024 - elif value[-1] == 'G': - factor = 1024 * 1024 - elif value[-1] == 'T': - factor = 1024 * 1024 * 1024 - elif value[-1] == 'P': - factor = 1024 * 1024 * 1024 * 1024 - return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor + try: + value = row["MaxRSS"] + factor = 1 + if value[-1] == 'M': + factor = 1024 + elif value[-1] == 'G': + factor = 1024 * 1024 + elif value[-1] == 'T': + factor = 1024 * 1024 * 1024 + elif value[-1] == 'P': + factor = 1024 * 1024 * 1024 * 1024 + return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor + except: + log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"])) + raise if row["ExitCode"] is not "": - return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) - + try: + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + except: + log("Failed to parse ExitCode for job id %s: %s" % (jobid, row["ExitCode"])) + raise + # PBS completion elif _cluster_type_cache == "pbs": pass diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py index 98eda196..a7215836 100644 --- a/src/scripts/slurm_status.py +++ b/src/scripts/slurm_status.py @@ -323,25 +323,36 @@ def get_finished_job_stats(jobid): # so sum up relevant values for row in reader: if row["AveCPU"] is not "": - return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + try: + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + except: + log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"])) + raise if row["MaxRSS"] is not "": # Remove the trailing [KMGTP] and scale the value appropriately # Note: We assume that all values will have a suffix, and we # want the value in kilos. 
-            value = row["MaxRSS"]
-            factor = 1
-            if value[-1] == 'M':
-                factor = 1024
-            elif value[-1] == 'G':
-                factor = 1024 * 1024
-            elif value[-1] == 'T':
-                factor = 1024 * 1024 * 1024
-            elif value[-1] == 'P':
-                factor = 1024 * 1024 * 1024 * 1024
-            return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
+            try:
+                value = row["MaxRSS"]
+                factor = 1
+                if value[-1] == 'M':
+                    factor = 1024
+                elif value[-1] == 'G':
+                    factor = 1024 * 1024
+                elif value[-1] == 'T':
+                    factor = 1024 * 1024 * 1024
+                elif value[-1] == 'P':
+                    factor = 1024 * 1024 * 1024 * 1024
+                return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
+            except:
+                log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
+                raise
         if row["ExitCode"] is not "":
-            return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
-
+            try:
+                return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
+            except:
+                log("Failed to parse ExitCode for job id %s: %s" % (jobid, row["ExitCode"]))
+                raise
 
     return return_dict

From 4a5023d9c14eaa4910e05e7b9948f333988af66c Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Tue, 24 Oct 2017 10:08:23 -0500
Subject: [PATCH 145/169] Use StringIO over io.StringIO

The latter is designed for use with Python3 libs where all strings are
in unicode. https://ticket.grid.iu.edu/34702

---
 src/scripts/blah.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/scripts/blah.py b/src/scripts/blah.py
index b40c2749..99dda1ab 100644
--- a/src/scripts/blah.py
+++ b/src/scripts/blah.py
@@ -1,8 +1,8 @@
 """Common functions for BLAH python scripts"""
 
-import os
 from ConfigParser import RawConfigParser
-from io import StringIO
+# TODO: io.StringIO is preferred in Python3 since it handles unicode-encoded files
+from StringIO import StringIO
 
 class BlahConfigParser(RawConfigParser, object):
 
@@ -12,7 +12,7 @@ def __init__(self, path='/etc/blah.config', defaults=None):
         self.header = 'blahp'
         with open(path) as f:
             config = f.read()
-        vfile = StringIO(u'[%s]\n%s' % (self.header, config))
+        vfile = StringIO('[%s]\n%s' % (self.header, config))
 
         super(BlahConfigParser, self).__init__(defaults=defaults)
         # TODO: readfp() is replaced by read_file() in Python 3.2+

From 0166e96b5ac34d4600b107e1d82358564caa0533 Mon Sep 17 00:00:00 2001
From: Brian Lin
Date: Tue, 24 Oct 2017 10:10:10 -0500
Subject: [PATCH 146/169] Log tracebacks for easier debugging

---
 src/scripts/pbs_status.py   | 2 ++
 src/scripts/slurm_status.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py
index 9e415fef..789ed252 100755
--- a/src/scripts/pbs_status.py
+++ b/src/scripts/pbs_status.py
@@ -40,6 +40,7 @@ import subprocess
 import signal
 import tempfile
+import traceback
 
 import pickle
 import csv
@@ -564,5 +565,6 @@ def main():
     except SystemExit:
         raise
     except Exception, e:
+        log(traceback.format_exc())
         print "1ERROR: %s" % str(e).replace("\n", "\\n")
         sys.exit(0)
diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py
index 5294a9ec..5d10a9c1 100644
--- a/src/scripts/slurm_status.py
+++ b/src/scripts/slurm_status.py
@@ -40,6 +40,7 @@ import subprocess
 import signal
 import tempfile
+import traceback
 
 import pickle
 import csv
@@ -534,5 +535,6 @@ def main():
     except SystemExit:
         raise
     except Exception, e:
+        log(traceback.format_exc())
         print "1ERROR: %s" % str(e).replace("\n", "\\n")
         sys.exit(0)

From 379da31dd046043931807fd1f883e7dd8be1acd2 Mon Sep 17 00:00:00 2001
From: John Thiltges
Date: Thu, 30 Nov 2017 15:27:04 -0600
Subject: [PATCH 147/169] limit_proxy() is not
guaranteed to set errmsg. Initialize the variable to NULL.

---
 src/server.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server.c b/src/server.c
index 02a018ec..50fd7c35 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1159,7 +1159,7 @@ cmd_submit_job(void *args)
 	else if ((proxyname) != NULL && (!disable_limited_proxy))
 	{
 		/* not in glexec mode: need to limit the proxy */
-		char *errmsg;
+		char *errmsg = NULL;
 		if((proxynameNew = limit_proxy(proxyname, NULL, &errmsg)) == NULL)
 		{
 			/* PUSH A FAILURE */

From ac768e1b8bcbabf81c332aa906656c677c2afbb4 Mon Sep 17 00:00:00 2001
From: Andrew Melo
Date: Tue, 6 Feb 2018 23:07:06 -0600
Subject: [PATCH 148/169] Check input files exist before submitting to SLURM

For some reason, condor-ce and BLAH can get out of sync -- the net
result is that BLAH is told to submit jobs whose input files (and even
parent directories!) don't exist in condor's spool directory. These
jobs are doomed to fail and be later restarted, causing a lot of
unnecessary churn on the SLURM scheduler.

Before submitting jobs to SLURM, verify that the input files exist. If
they don't, error out early.

---
 src/scripts/blah_common_submit_functions.sh | 28 +++++++++++++++++++++
 src/scripts/slurm_submit.sh                 |  8 ++++++
 2 files changed, 36 insertions(+)

diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh
index d4d8fef8..583bca3a 100644
--- a/src/scripts/blah_common_submit_functions.sh
+++ b/src/scripts/blah_common_submit_functions.sh
@@ -130,6 +130,34 @@ function bls_fl_subst_and_accumulate ()
   done
 }
 
+function bls_fl_test_exists ()
+{
+#
+# Usage: bls_fl_test_exists container_name
+# Verifies that every container_name "@@F_LOCAL" file exists.
+# First missing file is returned in $bls_fl_test_exists_result.
+#
+  local container_name
+
+  container_name=${1:?"Missing container name argument to bls_fl_test_exists"}
+
+  local last_argument
+
+  eval "last_argument=\${bls_${container_name}_counter:=0}"
+
+  local ind
+  bls_fl_test_exists_result=
+  for (( ind=0 ; ind < $last_argument ; ind++ )) ; do
+    bls_fl_subst $container_name $ind "@@F_LOCAL"
+    if [ ! -z "$bls_fl_subst_result" -a ! -f "$bls_fl_subst_result" ] ; then
+      bls_fl_test_exists_result="${bls_fl_subst_result}"
+      return 1
+    fi
+  done
+  return 0
+}
+
+
 function bls_fl_subst_and_dump ()
 {
 #
diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh
index f02f31f2..53b26867 100755
--- a/src/scripts/slurm_submit.sh
+++ b/src/scripts/slurm_submit.sh
@@ -70,6 +70,14 @@ if [[ $bls_opt_mpinodes -gt 1 ]] ; then
     echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file
 fi
 
+# Ensure local files actually exist before submitting job. This prevents
+# unnecessary churn on the scheduler if the files don't exist.
+if ! bls_fl_test_exists inputsand ; then
+  echo "Input sandbox file doesn't exist: $bls_fl_test_exists_result" >&2
+  echo Error # for the sake of waiting fgets in blahpd
+  exit 1
+fi
+
 # Do the local and extra args after all #SBATCH commands, otherwise slurm ignores anything
 # after a non-#SBATCH command
 bls_set_up_local_and_extra_args

From 5c2c0a9534df095526cddc73a76487a438382f2b Mon Sep 17 00:00:00 2001
From: Andrew Melo
Date: Wed, 7 Feb 2018 20:26:02 -0600
Subject: [PATCH 149/169] Check workdir exists for jobs submitted to SLURM

For some reason, condor-ce is pulling the rug out of jobs in SLURM and
deleting the spool directory. This is annoying because SLURM spends a
lot of time scheduling jobs that will never work.

Fail early if we detect this.
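
For illustration only: a minimal Python sketch of the fail-early
validation these two patches introduce (hypothetical helper names, not
part of the patch series; the real checks are the shell function
bls_fl_test_exists and the workdir test in slurm_submit.sh):

    import os

    def first_missing_input(paths):
        # Return the first local input file that does not exist, else None.
        for path in paths:
            if path and not os.path.isfile(path):
                return path
        return None

    def validate_submission(workdir, input_files):
        # Fail before the job ever reaches the scheduler.
        if workdir and not os.path.isdir(workdir):
            raise RuntimeError("Workdir doesn't exist: %s" % workdir)
        missing = first_missing_input(input_files)
        if missing is not None:
            raise RuntimeError("Input sandbox file doesn't exist: %s" % missing)

Rejecting a doomed job at submit time costs one stat() per sandbox file;
letting it through costs the scheduler a full schedule/fail/resubmit cycle.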
--- src/scripts/slurm_submit.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 53b26867..ab30b3af 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -70,6 +70,14 @@ if [[ $bls_opt_mpinodes -gt 1 ]] ; then echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file fi +# Verify the workdir exists before submitting the job. If a bogus workdir is +# given, the job is hopeless +if [[ ! -z "$bls_opt_workdir" && ! -d "$bls_opt_workdir" ]] ; then + echo "Error: Workdir doesn't exist" >&2 + echo Error # for the sake of waiting fgets in blahpd + exit 1 +fi + # Ensure local files actually exist before submitting job. This prevents # unnecessary churn on the scheduler if the files don't exist. if ! bls_fl_test_exists inputsand ; then From 0941ca7883762cce37bb193dc992e2952481c2ef Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 9 Feb 2018 13:19:42 -0600 Subject: [PATCH 150/169] Save debug submit info before submission --- src/scripts/blah_common_submit_functions.sh | 61 +++++++++++---------- src/scripts/lsf_submit.sh | 1 + src/scripts/pbs_submit.sh | 1 + src/scripts/sge_submit.sh | 1 + src/scripts/slurm_submit.sh | 2 +- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index d4d8fef8..e026c81b 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -786,38 +786,39 @@ function bls_set_up_local_and_extra_args () fi } -function bls_wrap_up_submit () -{ - - if [ -d "$blah_debug_save_submit_info" -a -n "$bls_tmp_name" ]; then - # Store files used for this job in a directory - bls_info_dir="$blah_debug_save_submit_info/$bls_tmp_name.debug" - mkdir "$bls_info_dir" - if [ $? -eq 0 ]; then - # Best effort. - if [ -r "$bls_proxy_local_file" ]; then - cp "$bls_proxy_local_file" "$bls_info_dir/submit.proxy" - fi - if [ -r "$bls_opt_stdout" ]; then - ln "$bls_opt_stdout" "$bls_info_dir/job.stdout" - if [ $? -ne 0 ]; then - # If we cannot hardlink, try a soft link. - ln -s "$bls_opt_stdout" "$bls_info_dir/job.stdout" - fi - fi - if [ -r "$bls_opt_stderr" ]; then - ln "$bls_opt_stderr" "$bls_info_dir/job.stderr" - if [ $? -ne 0 ]; then - # If we cannot hardlink, try a soft link. - ln -s "$bls_opt_stderr" "$bls_info_dir/job.stderr" +function bls_save_submit () { + if [ -d "$blah_debug_save_submit_info" -a -n "$bls_tmp_name" ]; then + # Store files used for this job in a directory + bls_info_dir="$blah_debug_save_submit_info/$bls_tmp_name.debug" + mkdir "$bls_info_dir" + if [ $? -eq 0 ]; then + # Best effort. + if [ -r "$bls_proxy_local_file" ]; then + cp "$bls_proxy_local_file" "$bls_info_dir/submit.proxy" + fi + if [ -r "$bls_opt_stdout" ]; then + ln "$bls_opt_stdout" "$bls_info_dir/job.stdout" + if [ $? -ne 0 ]; then + # If we cannot hardlink, try a soft link. + ln -s "$bls_opt_stdout" "$bls_info_dir/job.stdout" + fi + fi + if [ -r "$bls_opt_stderr" ]; then + ln "$bls_opt_stderr" "$bls_info_dir/job.stderr" + if [ $? -ne 0 ]; then + # If we cannot hardlink, try a soft link. 
+ ln -s "$bls_opt_stderr" "$bls_info_dir/job.stderr" + fi + fi + if [ -r "$bls_tmp_file" ]; then + cp "$bls_tmp_file" "$bls_info_dir/submit.script" + fi fi - fi - if [ -r "$bls_tmp_file" ]; then - cp "$bls_tmp_file" "$bls_info_dir/submit.script" - fi - fi - fi + fi +} +function bls_wrap_up_submit () +{ bls_fl_clear inputsand bls_fl_clear outputsand bls_fl_clear inputcopy diff --git a/src/scripts/lsf_submit.sh b/src/scripts/lsf_submit.sh index b7b34bdb..fb95fb4b 100755 --- a/src/scripts/lsf_submit.sh +++ b/src/scripts/lsf_submit.sh @@ -161,6 +161,7 @@ echo " cd \$CERN_STARTER_ORIGINAL_CWD" >> $bls_tmp_file echo "fi" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit # Let the wrap script be at least 1 second older than logfile # for subsequent "find -newer" command to work diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index c04df344..b26d18ae 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -214,6 +214,7 @@ fi echo "#PBS -m n" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit # Let the wrap script be at least 1 second older than logfile # for subsequent "find -newer" command to work diff --git a/src/scripts/sge_submit.sh b/src/scripts/sge_submit.sh index c95e7325..d04d719f 100755 --- a/src/scripts/sge_submit.sh +++ b/src/scripts/sge_submit.sh @@ -94,6 +94,7 @@ bls_fl_subst_and_accumulate outputsand "@@F_REMOTE@`hostname -f`:@@F_LOCAL" "@@@ echo "#$ -m n" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit ############################################################### # Submit the script diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index f02f31f2..a9d9a894 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -78,7 +78,7 @@ bls_set_up_local_and_extra_args # Assume all filesystems are shared. bls_add_job_wrapper - +bls_save_submit ############################################################### # Submit the script From 55a09bd8074d3c9364a12fe7fde22ba355278d7b Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 27 Feb 2018 14:37:41 -0600 Subject: [PATCH 151/169] Check workdir and input files before job submission Use the pre-existing workdir test function, which catches permission issues with the workdir as well as non-existence. Do this all before we write out the job wrapper because why bother? condor_submit doesn't benefit from this since it doesn't use bls_add_job_wrapper (or any other common submit functions for that matter) --- src/scripts/blah_common_submit_functions.sh | 15 +++++++++++++-- src/scripts/slurm_submit.sh | 16 ---------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 72115f13..8c449e71 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -768,8 +768,10 @@ function bls_finish_job_wrapper () fi } -function bls_test_working_dir () +function bls_test_input_files () { + # Verify the workdir can be accessed before submitting the job. If a bogus workdir is + # given, the job is hopeless if [ "x$bls_opt_workdir" != "x" ]; then cd $bls_opt_workdir elif [ "x$blah_set_default_workdir_to_home" == "xyes" ]; then @@ -782,13 +784,22 @@ function bls_test_working_dir () rm -f $bls_tmp_file exit 1 fi + + # Ensure local files actually exist. When called before job submission, this prevents + # unnecessary churn on the scheduler if the files don't exist. + if ! 
bls_fl_test_exists inputsand ; then + echo "Input sandbox file doesn't exist: $bls_fl_test_exists_result" >&2 + echo Error # for the sake of waiting fgets in blahpd + rm -rf $bls_tmp_file + exit 1 + fi } function bls_add_job_wrapper () { + bls_test_input_files bls_start_job_wrapper >> $bls_tmp_file bls_finish_job_wrapper >> $bls_tmp_file - bls_test_working_dir } function bls_set_up_local_and_extra_args () diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index 4ba65996..a9d9a894 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -70,22 +70,6 @@ if [[ $bls_opt_mpinodes -gt 1 ]] ; then echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file fi -# Verify the workdir exists before submitting the job. If a bogus workdir is -# given, the job is hopeless -if [[ ! -z "$bls_opt_workdir" && ! -d "$bls_opt_workdir" ]] ; then - echo "Error: Workdir doesn't exist" >&2 - echo Error # for the sake of waiting fgets in blahpd - exit 1 -fi - -# Ensure local files actually exist before submitting job. This prevents -# unnecessary churn on the scheduler if the files don't exist. -if ! bls_fl_test_exists inputsand ; then - echo "Input sandbox file doesn't exist: $bls_fl_test_exists_result" >&2 - echo Error # for the sake of waiting fgets in blahpd - exit 1 -fi - # Do the local and extra args after all #SBATCH commands, otherwise slurm ignores anything # after a non-#SBATCH command bls_set_up_local_and_extra_args From fda81bec8fa2ab8082042d4e764be0de9e47761e Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 13 Mar 2018 16:35:46 -0500 Subject: [PATCH 152/169] Address review comments https://github.com/osg-bosco/BLAH/pull/69 --- src/scripts/blah_common_submit_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 8c449e71..451d1601 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -790,7 +790,7 @@ function bls_test_input_files () if ! 
bls_fl_test_exists inputsand ; then echo "Input sandbox file doesn't exist: $bls_fl_test_exists_result" >&2 echo Error # for the sake of waiting fgets in blahpd - rm -rf $bls_tmp_file + rm -f "$bls_tmp_file" exit 1 fi } From dc1d3da4c2920f7c918d4f138ac34ff9001be96f Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Tue, 29 May 2018 17:01:41 -0500 Subject: [PATCH 153/169] tell wordexp not to do command substitution --- src/blparser_master.c | 2 +- src/mapped_exec.c | 2 +- src/mtsafe_popen.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blparser_master.c b/src/blparser_master.c index 1c73baee..e06c688b 100644 --- a/src/blparser_master.c +++ b/src/blparser_master.c @@ -144,7 +144,7 @@ check_on_children_args(const struct blah_managed_child *children, const int coun fret = fork(); if (fret == 0) { - if((j = wordexp(children[i].exefile, &args, 0))) + if((j = wordexp(children[i].exefile, &args, WRDE_NOCMD))) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", children[i].exefile, j); return; diff --git a/src/mapped_exec.c b/src/mapped_exec.c index 1f74502e..82bb72cf 100644 --- a/src/mapped_exec.c +++ b/src/mapped_exec.c @@ -430,7 +430,7 @@ execute_cmd(exec_cmd_t *cmd) } /* Do the shell expansion */ - if(wordexp_err = wordexp(command, &args, 0)) + if(wordexp_err = wordexp(command, &args, WRDE_NOCMD)) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", command, wordexp_err); return(1); diff --git a/src/mtsafe_popen.c b/src/mtsafe_popen.c index aab48bc8..b5e0a50d 100644 --- a/src/mtsafe_popen.c +++ b/src/mtsafe_popen.c @@ -308,7 +308,7 @@ exe_getouterr(char *const command, char *const environment[], char **cmd_output, envcopy[envcopy_size + i] = (char *)NULL; /* Do the shell expansion */ - if(i = wordexp(command, &args, 0)) + if(i = wordexp(command, &args, WRDE_NOCMD)) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", command, i); return(1); From 7148c91b0df21e99c50d12c35c68a676ecd520b7 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Wed, 6 Jun 2018 12:27:08 -0500 Subject: [PATCH 154/169] failures after fork() should _exit() return is especially bad since it leaves both the parent and child alive --- src/blparser_master.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blparser_master.c b/src/blparser_master.c index e06c688b..71340ef1 100644 --- a/src/blparser_master.c +++ b/src/blparser_master.c @@ -147,15 +147,15 @@ check_on_children_args(const struct blah_managed_child *children, const int coun if((j = wordexp(children[i].exefile, &args, WRDE_NOCMD))) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", children[i].exefile, j); - return; - } + _exit(1); + } /* Child process. Exec exe file. 
*/ if (execv(args.we_wordv[0], args.we_wordv) < 0) { fprintf(stderr,"Cannot exec %s: %s\n", children[i].exefile, strerror(errno)); - exit(1); + _exit(1); } /* Free the wordexp'd args */ wordfree(&args); From a6438eb7f7cf89e195a25f4f95f57828ea9c2237 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Thu, 15 Mar 2018 11:12:20 -0500 Subject: [PATCH 155/169] Add blahp.spec for OSG github builds --- rpm/blahp.spec | 407 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 407 insertions(+) create mode 100644 rpm/blahp.spec diff --git a/rpm/blahp.spec b/rpm/blahp.spec new file mode 100644 index 00000000..32d43cb4 --- /dev/null +++ b/rpm/blahp.spec @@ -0,0 +1,407 @@ +# Have gitrev be the short hash or branch name if doing a prerelease build +#define gitrev +%define bl_sysconfdir %{_sysconfdir}/%{name} +%define bl_libexecdir %{_libexecdir}/%{name} + +Name: blahp +Version: 1.18.37.bosco +Release: 1%{?gitrev:.%{gitrev}}%{?dist} +Summary: gLite BLAHP daemon + +Group: System/Libraries +License: Apache 2.0 +URL: https://github.com/osg-bosco/BLAH + +# Generated with: +# git archive v1_18_bosco | gzip -9 > %{name}-%{version}.tar.gz +# +# Pre-release build tarballs should be generated with: +# git archive %{gitrev} | gzip -9 > %{name}-%{version}-%{gitrev}.tar.gz +Source0: %{name}-%{version}%{?gitrev:-%{gitrev}}.tar.gz + +BuildRequires: automake +BuildRequires: autoconf +BuildRequires: libtool +BuildRequires: glite-build-common-cpp +BuildRequires: condor-classads-devel +BuildRequires: globus-gss-assist-devel +BuildRequires: globus-gsi-credential-devel +BuildRequires: globus-gsi-proxy-core-devel +BuildRequires: globus-gsi-cert-utils-devel +BuildRequires: docbook-style-xsl, libxslt + +#Requires(post): chkconfig +#Requires(preun): chkconfig +#Requires(preun): initscripts +#Requires(postun): initscripts + +%description +%{summary} + +%prep +%setup -c -n %{name}-%{version} + +%build +./bootstrap +%if 0%{?rhel} >= 7 +export CPPFLAGS="-I/usr/include/classad -std=c++11" +export LDFLAGS="-lclassad -lglobus_gsi_credential -lglobus_common -lglobus_gsi_proxy_core" +%else +export CPPFLAGS="-I/usr/include/classad" +export LDFLAGS="-lclassad" +%endif +%configure --with-classads-prefix=/usr --with-globus-prefix=/usr --with-glite-location=/usr +unset CPPFLAGS +unset LDFLAGS +make %{?_smp_mflags} + +%install +make install DESTDIR=$RPM_BUILD_ROOT + +rm -f $RPM_BUILD_ROOT%{_libdir}/*.la +rm -f $RPM_BUILD_ROOT%{_libdir}/*.a + +# Move all the blahp scripts into /usr/libexec/blahp +mkdir blahp +mv $RPM_BUILD_ROOT%{_libexecdir}/* blahp +install -m 0755 -d -p $RPM_BUILD_ROOT%{bl_libexecdir}/ +mv blahp/* $RPM_BUILD_ROOT%{bl_libexecdir}/ + +# Correct the config file location +install -m 0755 -d -p $RPM_BUILD_ROOT%{_sysconfdir} +mv $RPM_BUILD_ROOT%{_sysconfdir}/blah.config.template $RPM_BUILD_ROOT%{_sysconfdir}/blah.config +mv $RPM_BUILD_ROOT%{_sysconfdir}/blparser.conf.template $RPM_BUILD_ROOT%{_sysconfdir}/blparser.conf +echo "blah_libexec_directory=/usr/libexec/blahp" >> $RPM_BUILD_ROOT%{_sysconfdir}/blah.config + +# Insert appropriate templates for LSF, SGE, Slurm, and HTCondor; admins will need to change these +install -m 0755 -d -p $RPM_BUILD_ROOT%{bl_sysconfdir} + +for batch_system in sge slurm; do + mv $RPM_BUILD_ROOT%{bl_libexecdir}/${batch_system}_local_submit_attributes.sh $RPM_BUILD_ROOT%{bl_sysconfdir}/ +done + +for batch_system in lsf condor; do +cat > $RPM_BUILD_ROOT%{bl_sysconfdir}/${batch_system}_local_submit_attributes.sh << EOF +#/bin/sh + +# This file is sourced by blahp before submitting the job to ${i} 
+# Anything printed to stdout is included in the submit file. +# For example, to set a default walltime of 24 hours in PBS, you +# could uncomment this line: + +# echo "#PBS -l walltime=24:00:00" + +# blahp allows arbitrary attributes to be passed to this script on a per-job +# basis. If you add the following to your HTCondor-G submit file: + +#+remote_cerequirements = NumJobs == 100 && foo = 5 + +# Then an environment variable, NumJobs, will be exported prior to calling this +# script and set to a value of 100. The variable foo will be set to 5. + +# You could allow users to set the walltime for the job with the following +# customization (PBS syntax given; adjust for the appropriate batch system): + +#if [ -n "\$Walltime" ]; then +# echo "#PBS -l walltime=\$Walltime" +#else +# echo "#PBS -l walltime=24:00:00" +#fi + +EOF +done + +# A more appropriate template for PBS; actually does something +cat > $RPM_BUILD_ROOT%{bl_sysconfdir}/pbs_local_submit_attributes.sh << EOF +#/bin/sh + +# This file is sourced by blahp before submitting the job to PBS +# Anything printed to stdout is included in the submit file. +# For example, to set a default walltime of 24 hours in PBS, you +# could uncomment this line: + +# echo "#PBS -l walltime=24:00:00" + +# blahp allows arbitrary attributes to be passed to this script on a per-job +# basis. If you add the following to your HTCondor-G submit file: + +#+remote_cerequirements = NumJobs == 100 && foo = 5 + +# Then an environment variable, NumJobs, will be exported prior to calling this +# script and set to a value of 100. The variable foo will be set to 5. + +# You could allow users to set the walltime for the job with the following +# customization (PBS syntax given; adjust for the appropriate batch system): + +# Uncomment the else block to default to 24 hours of runtime; otherwise, the queue +# default is used. 
+if [ -n "\$Walltime" ]; then + echo "#PBS -l walltime=\$Walltime" +#else +# echo "#PBS -l walltime=24:00:00" +fi + +EOF + +# Create local_submit_attributes.sh symlinks in /etc/blahp +for batch_system in pbs sge slurm lsf condor; do + ln -s %{bl_sysconfdir}/${batch_system}_local_submit_attributes.sh \ + $RPM_BUILD_ROOT%{bl_libexecdir}/${batch_system}_local_submit_attributes.sh +done + +mv $RPM_BUILD_ROOT%{_docdir}/glite-ce-blahp-@PVER@ $RPM_BUILD_ROOT%{_docdir}/%{name}-%{version} + +%post + +if [ $1 -eq 1 ] ; then + /sbin/chkconfig --add glite-ce-blah-parser +fi + +%preun + +if [ $1 -eq 0 ] ; then + /sbin/service glite-ce-blah-parser stop >/dev/null 2>&1 + /sbin/chkconfig --del glite-ce-blah-parser +fi + +%files +%defattr(-,root,root,-) +%{_bindir}/* +%{_sbindir}/* +%{_libexecdir}/%{name} +%{_docdir}/%{name}-%{version} +%config(noreplace) %{_sysconfdir}/blparser.conf +%config(noreplace) %{_sysconfdir}/blah.config +%dir %{_sysconfdir}/%{name} +%config(noreplace) %{bl_sysconfdir}/*.sh +%{_mandir}/man1/* +%{_initrddir}/glite-ce-* + +%changelog +* Wed Jun 13 2018 Carl Edquist - 1.18.37.bosco-1 +- Disable command substitution in shell word expansion (SOFTWARE-3288) + +* Thu Mar 15 2018 Brian Lin - 1.18.36.bosco-1 +- Verify input file existence before submission (SOFTWARE-3154) +- Save debugging dirs if job submission fails (SOFTWARE-2827) + +* Fri Dec 1 2017 Brian Lin - 1.18.35.bosco-1 +- Fix segfault when submitting jobs with limited proxies + +* Tue Oct 31 2017 Brian Lin - 1.18.34.bosco-1 +- Fix memory usage parsing for SLURM and PBS (SOFTWARE-2929) +- Fix UnicodeDecodeError when reading blah.config (SOFTWARE-2953) + +* Tue Aug 29 2017 Brian Lin - 1.18.33.bosco-1 +- Fix bug that caused jobs submitted to PBS batch systems to be held + with "Error parsing classad or job not found" (SOFTWARE-2875) +- Fix parsing of time fields for slurm jobs (SOFTWARE-2871) + +* Tue Jul 25 2017 Brian Lin - 1.18.32.bosco-1 +- Fix bug that broke shell parsing of `*_binpath` config values +- Set default bin paths to `/usr/bin` to remove the overhead of `which` for each PBS, LSF, and SGE call. 
+ +* Tue Jul 11 2017 Brian Lin - 1.18.31.bosco-1 +- Add blahp configuration to differentiate PBS flavors (SOFTWARE-2628) + +* Thu Mar 16 2017 Brian Lin - 1.18.30.bosco-1 +- Fix multicore request for SLURM batch systems (SOFTWARE-2774) + +* Thu Mar 16 2017 Brian Lin - 1.18.29.bosco-2 +- Rebuild against condor-8.7.1 + +* Thu Mar 1 2017 Brian Lin - 1.18.29.bosco-1 +- Blahp python scripts should ignore optional '-w' argument (SOFTWARE-2603) +- Fail gracefully when encountering unexpected sacct output (SOFTWARE-2604) +- Some #SBATCH commands are being ignored (SOFTWARE-2605) + +* Tue Feb 28 2017 Edgar Fajardo - 1.18.28.bosco-5 +- Build against condor-8.6.1 + +* Thu Jan 26 2017 Brian Lin - 1.18.28.bosco-4 +- Build against condor-8.7.0 + +* Thu Jan 26 2017 Brian Lin - 1.18.28.bosco-3 +- Build against condor-8.4.11 + +* Mon Dec 19 2016 Brian Lin - 1.18.28.bosco-2 +- Build against condor-8.4.10 + +* Thu Oct 27 2016 Brian Lin - 1.18.28.bosco-1 +- Fixed incompatibility between blahp_results_cache and torque-4.2.9 + that caused jobs to be held when performing status updates on + HTCondor-CE (SOFTWARE-2516) + +* Thu Oct 20 2016 Brian Lin - 1.18.27.bosco-1 +- Fix segfault when using glexec and disabling limited proxies (SOFTWARE-2475) + +* Fri Sep 23 2016 Brian Lin - 1.18.26.bosco-1 +- Refactor scontrol calls to use subprocess (SOFTWARE-2450) + +* Fri Sep 09 2016 Brian Lin - 1.18.25.bosco-1 +- Fix qstart parsing errors that caused blank caches + +* Fri Aug 26 2016 Brian Lin - 1.18.24.bosco-1 +- Fixed slurm multicore requests in slurm_submit.sh +- Added slurm_submit_attributes.sh +- Enabled multicore support to PBS Pro (SOFTWARE-2326) +- Allow users to set the SGE parallel environment policy (SOFTWARE-2334) +- Fixed issues with qstat() (SOFTWARE-2358) + +* Tue Jul 26 2016 Edgar Fajardo - 1.18.23.bosco-1 +- Fixed a bug in HTConodor Ticket-5804. (SOFTWARE-2404) + +* Thu Jul 21 2016 Edgar Fajardo - 1.18.22.bosco-2 +- The code was taken from the osg-bosco instead of Edgar's fork. + +* Wed Jul 20 2016 Edgar Fajardo - 1.18.22.bosco-1 +- Merge HTCondor Ticket-5722. Cache output of slurm-status. 
(SOFTWARE-2399) + +* Thu Jun 23 2016 Brian Lin - 1.18.21.bosco-1 +- Fix Slurm file leak (SOFTWARE-2367) +- Package slurm_hold.sh (SOFTWARE-2375) + +* Fri Jun 03 2016 Brian Lin - 1.18.20.bosco-1 +- Add multicore HTCondor support (SOFTWARE-2303) +- Support dynamic assignment of env variables (SOFTWARE-2221) + +* Mon May 02 2016 Matyas Selmeci - 1.18.19.bosco-2 +- Built against HTCondor 8.5.4 (SOFTWARE-2307) + +* Mon Apr 25 2016 Brian Lin - 1.18.19.bosco-1 +- Add SLURM support (SOFTWARE-2256) +- Fix mem requests (SOFTWARE-2260) + +* Fri Feb 26 2016 Brian Lin - 1.18.18.bosco-1 +- Bug fixes for PBS installations without qstat in their PATH + +* Mon Feb 22 2016 Brian Lin - 1.18.17.bosco-1 +- Re-apply lost SGE script changes (SOFTWARE-2199) +- Handle LSF suspended states (SOFTWARE-2168) +- Modify BLAHP to report gratia necessary attributes (SOFTWARE-2019) + +* Thu Dec 16 2015 Brian Lin - 1.18.16.bosco-1 +- Allow for disabling limited proxies in glexec +- Fix bug in pbs_status.py when /tmp/ and /var/tmp were on different filesystems +- Resync job registry to prevent jobs from being incorrectly marked as completed + +* Mon Nov 23 2015 Edgar Fajardo - 1.18.15.bosco-2 +- Built against HTCondor 8.5.1 SOFTWARE-2077 + +* Wed Nov 11 2015 Carl Edquist - 1.18.15.bosco-3 +- Build against condor 8.4.2 (SOFTWARE-2084) + +* Mon Nov 2 2015 Edgar Fajardo - 1.18.15.bosco-2 +- Build aginst condor 8.4.0 (SOFTWARE-2084) + +* Tue Oct 27 2015 Jeff Dost - 1.18.15.bosco-1 +- Build against HTCondor 8.4.1 (SOFTWARE-2084) +- Added error reporting to pbs_submit + +* Fri Oct 23 2015 Edgar Fajardo - 1.18.15.bosco-1 +- Built against HTCOndor 8.5.0 SOFTWARE-2077 +- Added error reporting to pbs_submit + +* Tue Sep 29 2015 Brian Lin - 1.18.14.bosco-1 +- Added PBS Pro support (SOFTWARE-1958) +- Fix for job registry losing track of LSF jobs in its registry (gittrac #5062) +- Added 'blah_disable_limited_proxies' to disable creation of limited proxies +- Reduce 'blah_max_threaded_commands' to 50 (SOFTWARE-1980) + +* Mon Aug 31 2015 Carl Edquist - 1.18.13.bosco-4 +- Rebuild against HTCondor 8.3.8 (SOFTWARE-1995) + +* Mon Jul 20 2015 Mátyás Selmeci 1.18.13.bosco-3 +- bump to rebuild + +* Thu Jun 25 2015 Brian Lin - 1.18.13.bosco-2 +- Rebuild against HTCondor 8.3.6 + +* Thu May 28 2015 Brian Lin - 1.18.13.bosco-1 +- Fixes to PBS and HTCondor submission + +* Tue Apr 28 2015 Brian Lin - 1.18.12.bosco-2 +- Rebuild against HTCondor 8.3.5 + +* Mon Mar 30 2015 Brian Lin - 1.18.12.bosco-1 +- Source profile.lsf for LSF job submission + +* Wed Dec 03 2014 Mátyás Selmeci 1.18.11.bosco-4 +- Fix syntax error in condor_submit.sh +- Source OSG job environment variables in generated submit scripts for pbs, + lsf, sge, and slurm jobmanagers (SOFTWARE-1709) + +* Mon Oct 27 2014 Brian Lin - 1.18.11.bosco-3 +- Rebuild against condor-8.2.3 + +* Mon Oct 20 2014 Carl Edquist - 1.18.11.bosco-2 +- Build fixes for el7 (SOFTWARE-1604) + +* Mon Sep 29 2014 Brian Lin - 1.18.11.bosco-1 +- Fix bug in PBS status script + +* Thu Sep 25 2014 Brian Lin - 1.18.10.bosco-1 +- Fixes to LSF scripts pushed upstream (SOFTWARE-1589, creating a temp file in /tmp) +- Fix to PBS script that tracks job status (SOFTWARE-1594) + +* Mon Aug 25 2014 Brian Lin - 1.18.9.bosco-2 +- Fix for memory allocation failure when tracking LSF jobs (SOFTWARE-1589) + +* Thu Jan 09 2014 Brian Bockelman - 1.18.9.bosco-1 +- Fix proxy renewal in the case where no home directory exists. +- Improve packaging of local customization scripts and include defaults. 
+ These are now marked as config files and places in /etc. +- Change name of documentation directory to reflect RPM name. + +* Tue Jan 07 2014 Brian Bockelman - 1.18.8.bosco-1 +- Fixes from PBS testing. Blahp now handles multiple arguments correctly + and the wrapper script will remove the job proxy after it finishes. + +* Wed Oct 30 2013 Matyas Selmeci - 1.18.7.bosco-2 +- Bump to rebuild against condor-7.8.8-x (OSG-3.1) and condor-8.0.4-x (OSG 3.2) + +* Fri Sep 20 2013 Brian Bockelman - 1.18.7.bosco-1 +- Do not close stderr fd from the blah. + +* Tue May 14 2013 Brian Bockelman - 1.18.5.bosco-1 +- Alter the pbs_status.py locking algorithm to add random component to + sleeps between poll. + +* Thu Jan 17 2013 Derek Weitzel - 1.18.4.bosco-1 +- Fixing pbs_status.py via upstream SOFTWARE-905 + +* Thu Dec 13 2012 Brian Bockelman 1.18.3.bosco-1.osg +- Merge BOSCO and OSG distribution of blahp. + +* Wed Dec 05 2012 John Thiltges 1.18.0.4-9.osg +- Fix pbs_status.sh in spec file + +* Fri Oct 12 2012 Brian Bockelman - 1.18.0.4-8.osg +- Pull in all remaining patches from the OSG-CE work. +- Fix non-standard qstat locations. +- Fix arg escaping in Condor. +- Fix submissions with a relative proxy path. +- Release bumped a few extra versions to stay in line with the Caltech Koji. + +* Wed Aug 29 2012 Matyas Selmeci - 1.18.0.4-5.osg +- Fixed paths in init script +- Added default options for condor + +* Wed Jul 25 2012 Matyas Selmeci - 1.18.0.4-4.osg +- Disable autostart of blah parser + +* Thu May 31 2012 Brian Bockelman - 1.18.0.4-3 +- Add caching for PBS script. + +* Mon May 28 2012 Brian Bockelman -1.18.0.4-2 +- Import patches from Condor team. + +* Mon May 28 2012 Brian Bockelman -1.18.0.4-1 +- Update to latest upstream. + +* Fri Sep 16 2011 Brian Bockelman - 1.16.1-3 +- Rev bump for GT 5.2 recompile. + +* Wed Jan 05 2011 Brian Bockelman 1.16.1-1 +- Initial RPM packaging + From 01069deaa645fd89c11c5c276157d826322bac87 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Fri, 31 Aug 2018 12:33:04 -0500 Subject: [PATCH 156/169] Disable blahp proxy renewal/limited proxies We're doing this in the default config template instead of the code itself so that existing configs aren't affected --- config/blah.config.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/blah.config.template b/config/blah.config.template index e9962d52..6b5b50a3 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -12,7 +12,7 @@ BLAHPD_ACCOUNTING_INFO_LOG= #Set to yes if you wish to disable BLAH's machinery for transferring #or delegating proxies to the worker node where a job is running. (default = no) -blah_disable_wn_proxy_renewal= +blah_disable_wn_proxy_renewal=yes #Set to yes to enable delegation (instead of copy) of renewed proxies #to worker nodes. NOTE: limited *and* delegated proxes are not @@ -21,7 +21,7 @@ blah_disable_wn_proxy_renewal= blah_delegate_renewed_proxies= #Set to yes to disable creation of a limited proxy. (default = no) -blah_disable_limited_proxy= +blah_disable_limited_proxy=yes #max number of concurrent threads to serve commands (default = 500) blah_max_threaded_cmds=50 From b4c54ad535b1637ee0c79a3b3bd21b65521ccd34 Mon Sep 17 00:00:00 2001 From: Matyas Selmeci Date: Mon, 10 Sep 2018 13:00:14 -0500 Subject: [PATCH 157/169] Revert "Add blahp.spec for OSG github builds" This reverts commit a6438eb7f7cf89e195a25f4f95f57828ea9c2237. We'll set up the github build stuff once we're shipping from the opensciencegrid org. 
--- rpm/blahp.spec | 407 ------------------------------------------------- 1 file changed, 407 deletions(-) delete mode 100644 rpm/blahp.spec diff --git a/rpm/blahp.spec b/rpm/blahp.spec deleted file mode 100644 index 32d43cb4..00000000 --- a/rpm/blahp.spec +++ /dev/null @@ -1,407 +0,0 @@ -# Have gitrev be the short hash or branch name if doing a prerelease build -#define gitrev -%define bl_sysconfdir %{_sysconfdir}/%{name} -%define bl_libexecdir %{_libexecdir}/%{name} - -Name: blahp -Version: 1.18.37.bosco -Release: 1%{?gitrev:.%{gitrev}}%{?dist} -Summary: gLite BLAHP daemon - -Group: System/Libraries -License: Apache 2.0 -URL: https://github.com/osg-bosco/BLAH - -# Generated with: -# git archive v1_18_bosco | gzip -9 > %{name}-%{version}.tar.gz -# -# Pre-release build tarballs should be generated with: -# git archive %{gitrev} | gzip -9 > %{name}-%{version}-%{gitrev}.tar.gz -Source0: %{name}-%{version}%{?gitrev:-%{gitrev}}.tar.gz - -BuildRequires: automake -BuildRequires: autoconf -BuildRequires: libtool -BuildRequires: glite-build-common-cpp -BuildRequires: condor-classads-devel -BuildRequires: globus-gss-assist-devel -BuildRequires: globus-gsi-credential-devel -BuildRequires: globus-gsi-proxy-core-devel -BuildRequires: globus-gsi-cert-utils-devel -BuildRequires: docbook-style-xsl, libxslt - -#Requires(post): chkconfig -#Requires(preun): chkconfig -#Requires(preun): initscripts -#Requires(postun): initscripts - -%description -%{summary} - -%prep -%setup -c -n %{name}-%{version} - -%build -./bootstrap -%if 0%{?rhel} >= 7 -export CPPFLAGS="-I/usr/include/classad -std=c++11" -export LDFLAGS="-lclassad -lglobus_gsi_credential -lglobus_common -lglobus_gsi_proxy_core" -%else -export CPPFLAGS="-I/usr/include/classad" -export LDFLAGS="-lclassad" -%endif -%configure --with-classads-prefix=/usr --with-globus-prefix=/usr --with-glite-location=/usr -unset CPPFLAGS -unset LDFLAGS -make %{?_smp_mflags} - -%install -make install DESTDIR=$RPM_BUILD_ROOT - -rm -f $RPM_BUILD_ROOT%{_libdir}/*.la -rm -f $RPM_BUILD_ROOT%{_libdir}/*.a - -# Move all the blahp scripts into /usr/libexec/blahp -mkdir blahp -mv $RPM_BUILD_ROOT%{_libexecdir}/* blahp -install -m 0755 -d -p $RPM_BUILD_ROOT%{bl_libexecdir}/ -mv blahp/* $RPM_BUILD_ROOT%{bl_libexecdir}/ - -# Correct the config file location -install -m 0755 -d -p $RPM_BUILD_ROOT%{_sysconfdir} -mv $RPM_BUILD_ROOT%{_sysconfdir}/blah.config.template $RPM_BUILD_ROOT%{_sysconfdir}/blah.config -mv $RPM_BUILD_ROOT%{_sysconfdir}/blparser.conf.template $RPM_BUILD_ROOT%{_sysconfdir}/blparser.conf -echo "blah_libexec_directory=/usr/libexec/blahp" >> $RPM_BUILD_ROOT%{_sysconfdir}/blah.config - -# Insert appropriate templates for LSF, SGE, Slurm, and HTCondor; admins will need to change these -install -m 0755 -d -p $RPM_BUILD_ROOT%{bl_sysconfdir} - -for batch_system in sge slurm; do - mv $RPM_BUILD_ROOT%{bl_libexecdir}/${batch_system}_local_submit_attributes.sh $RPM_BUILD_ROOT%{bl_sysconfdir}/ -done - -for batch_system in lsf condor; do -cat > $RPM_BUILD_ROOT%{bl_sysconfdir}/${batch_system}_local_submit_attributes.sh << EOF -#/bin/sh - -# This file is sourced by blahp before submitting the job to ${i} -# Anything printed to stdout is included in the submit file. -# For example, to set a default walltime of 24 hours in PBS, you -# could uncomment this line: - -# echo "#PBS -l walltime=24:00:00" - -# blahp allows arbitrary attributes to be passed to this script on a per-job -# basis. 
If you add the following to your HTCondor-G submit file: - -#+remote_cerequirements = NumJobs == 100 && foo = 5 - -# Then an environment variable, NumJobs, will be exported prior to calling this -# script and set to a value of 100. The variable foo will be set to 5. - -# You could allow users to set the walltime for the job with the following -# customization (PBS syntax given; adjust for the appropriate batch system): - -#if [ -n "\$Walltime" ]; then -# echo "#PBS -l walltime=\$Walltime" -#else -# echo "#PBS -l walltime=24:00:00" -#fi - -EOF -done - -# A more appropriate template for PBS; actually does something -cat > $RPM_BUILD_ROOT%{bl_sysconfdir}/pbs_local_submit_attributes.sh << EOF -#/bin/sh - -# This file is sourced by blahp before submitting the job to PBS -# Anything printed to stdout is included in the submit file. -# For example, to set a default walltime of 24 hours in PBS, you -# could uncomment this line: - -# echo "#PBS -l walltime=24:00:00" - -# blahp allows arbitrary attributes to be passed to this script on a per-job -# basis. If you add the following to your HTCondor-G submit file: - -#+remote_cerequirements = NumJobs == 100 && foo = 5 - -# Then an environment variable, NumJobs, will be exported prior to calling this -# script and set to a value of 100. The variable foo will be set to 5. - -# You could allow users to set the walltime for the job with the following -# customization (PBS syntax given; adjust for the appropriate batch system): - -# Uncomment the else block to default to 24 hours of runtime; otherwise, the queue -# default is used. -if [ -n "\$Walltime" ]; then - echo "#PBS -l walltime=\$Walltime" -#else -# echo "#PBS -l walltime=24:00:00" -fi - -EOF - -# Create local_submit_attributes.sh symlinks in /etc/blahp -for batch_system in pbs sge slurm lsf condor; do - ln -s %{bl_sysconfdir}/${batch_system}_local_submit_attributes.sh \ - $RPM_BUILD_ROOT%{bl_libexecdir}/${batch_system}_local_submit_attributes.sh -done - -mv $RPM_BUILD_ROOT%{_docdir}/glite-ce-blahp-@PVER@ $RPM_BUILD_ROOT%{_docdir}/%{name}-%{version} - -%post - -if [ $1 -eq 1 ] ; then - /sbin/chkconfig --add glite-ce-blah-parser -fi - -%preun - -if [ $1 -eq 0 ] ; then - /sbin/service glite-ce-blah-parser stop >/dev/null 2>&1 - /sbin/chkconfig --del glite-ce-blah-parser -fi - -%files -%defattr(-,root,root,-) -%{_bindir}/* -%{_sbindir}/* -%{_libexecdir}/%{name} -%{_docdir}/%{name}-%{version} -%config(noreplace) %{_sysconfdir}/blparser.conf -%config(noreplace) %{_sysconfdir}/blah.config -%dir %{_sysconfdir}/%{name} -%config(noreplace) %{bl_sysconfdir}/*.sh -%{_mandir}/man1/* -%{_initrddir}/glite-ce-* - -%changelog -* Wed Jun 13 2018 Carl Edquist - 1.18.37.bosco-1 -- Disable command substitution in shell word expansion (SOFTWARE-3288) - -* Thu Mar 15 2018 Brian Lin - 1.18.36.bosco-1 -- Verify input file existence before submission (SOFTWARE-3154) -- Save debugging dirs if job submission fails (SOFTWARE-2827) - -* Fri Dec 1 2017 Brian Lin - 1.18.35.bosco-1 -- Fix segfault when submitting jobs with limited proxies - -* Tue Oct 31 2017 Brian Lin - 1.18.34.bosco-1 -- Fix memory usage parsing for SLURM and PBS (SOFTWARE-2929) -- Fix UnicodeDecodeError when reading blah.config (SOFTWARE-2953) - -* Tue Aug 29 2017 Brian Lin - 1.18.33.bosco-1 -- Fix bug that caused jobs submitted to PBS batch systems to be held - with "Error parsing classad or job not found" (SOFTWARE-2875) -- Fix parsing of time fields for slurm jobs (SOFTWARE-2871) - -* Tue Jul 25 2017 Brian Lin - 1.18.32.bosco-1 -- Fix bug that broke shell 
parsing of `*_binpath` config values -- Set default bin paths to `/usr/bin` to remove the overhead of `which` for each PBS, LSF, and SGE call. - -* Tue Jul 11 2017 Brian Lin - 1.18.31.bosco-1 -- Add blahp configuration to differentiate PBS flavors (SOFTWARE-2628) - -* Thu Mar 16 2017 Brian Lin - 1.18.30.bosco-1 -- Fix multicore request for SLURM batch systems (SOFTWARE-2774) - -* Thu Mar 16 2017 Brian Lin - 1.18.29.bosco-2 -- Rebuild against condor-8.7.1 - -* Thu Mar 1 2017 Brian Lin - 1.18.29.bosco-1 -- Blahp python scripts should ignore optional '-w' argument (SOFTWARE-2603) -- Fail gracefully when encountering unexpected sacct output (SOFTWARE-2604) -- Some #SBATCH commands are being ignored (SOFTWARE-2605) - -* Tue Feb 28 2017 Edgar Fajardo - 1.18.28.bosco-5 -- Build against condor-8.6.1 - -* Thu Jan 26 2017 Brian Lin - 1.18.28.bosco-4 -- Build against condor-8.7.0 - -* Thu Jan 26 2017 Brian Lin - 1.18.28.bosco-3 -- Build against condor-8.4.11 - -* Mon Dec 19 2016 Brian Lin - 1.18.28.bosco-2 -- Build against condor-8.4.10 - -* Thu Oct 27 2016 Brian Lin - 1.18.28.bosco-1 -- Fixed incompatibility between blahp_results_cache and torque-4.2.9 - that caused jobs to be held when performing status updates on - HTCondor-CE (SOFTWARE-2516) - -* Thu Oct 20 2016 Brian Lin - 1.18.27.bosco-1 -- Fix segfault when using glexec and disabling limited proxies (SOFTWARE-2475) - -* Fri Sep 23 2016 Brian Lin - 1.18.26.bosco-1 -- Refactor scontrol calls to use subprocess (SOFTWARE-2450) - -* Fri Sep 09 2016 Brian Lin - 1.18.25.bosco-1 -- Fix qstart parsing errors that caused blank caches - -* Fri Aug 26 2016 Brian Lin - 1.18.24.bosco-1 -- Fixed slurm multicore requests in slurm_submit.sh -- Added slurm_submit_attributes.sh -- Enabled multicore support to PBS Pro (SOFTWARE-2326) -- Allow users to set the SGE parallel environment policy (SOFTWARE-2334) -- Fixed issues with qstat() (SOFTWARE-2358) - -* Tue Jul 26 2016 Edgar Fajardo - 1.18.23.bosco-1 -- Fixed a bug in HTConodor Ticket-5804. (SOFTWARE-2404) - -* Thu Jul 21 2016 Edgar Fajardo - 1.18.22.bosco-2 -- The code was taken from the osg-bosco instead of Edgar's fork. - -* Wed Jul 20 2016 Edgar Fajardo - 1.18.22.bosco-1 -- Merge HTCondor Ticket-5722. Cache output of slurm-status. 
(SOFTWARE-2399) - -* Thu Jun 23 2016 Brian Lin - 1.18.21.bosco-1 -- Fix Slurm file leak (SOFTWARE-2367) -- Package slurm_hold.sh (SOFTWARE-2375) - -* Fri Jun 03 2016 Brian Lin - 1.18.20.bosco-1 -- Add multicore HTCondor support (SOFTWARE-2303) -- Support dynamic assignment of env variables (SOFTWARE-2221) - -* Mon May 02 2016 Matyas Selmeci - 1.18.19.bosco-2 -- Built against HTCondor 8.5.4 (SOFTWARE-2307) - -* Mon Apr 25 2016 Brian Lin - 1.18.19.bosco-1 -- Add SLURM support (SOFTWARE-2256) -- Fix mem requests (SOFTWARE-2260) - -* Fri Feb 26 2016 Brian Lin - 1.18.18.bosco-1 -- Bug fixes for PBS installations without qstat in their PATH - -* Mon Feb 22 2016 Brian Lin - 1.18.17.bosco-1 -- Re-apply lost SGE script changes (SOFTWARE-2199) -- Handle LSF suspended states (SOFTWARE-2168) -- Modify BLAHP to report gratia necessary attributes (SOFTWARE-2019) - -* Thu Dec 16 2015 Brian Lin - 1.18.16.bosco-1 -- Allow for disabling limited proxies in glexec -- Fix bug in pbs_status.py when /tmp/ and /var/tmp were on different filesystems -- Resync job registry to prevent jobs from being incorrectly marked as completed - -* Mon Nov 23 2015 Edgar Fajardo - 1.18.15.bosco-2 -- Built against HTCondor 8.5.1 SOFTWARE-2077 - -* Wed Nov 11 2015 Carl Edquist - 1.18.15.bosco-3 -- Build against condor 8.4.2 (SOFTWARE-2084) - -* Mon Nov 2 2015 Edgar Fajardo - 1.18.15.bosco-2 -- Build aginst condor 8.4.0 (SOFTWARE-2084) - -* Tue Oct 27 2015 Jeff Dost - 1.18.15.bosco-1 -- Build against HTCondor 8.4.1 (SOFTWARE-2084) -- Added error reporting to pbs_submit - -* Fri Oct 23 2015 Edgar Fajardo - 1.18.15.bosco-1 -- Built against HTCOndor 8.5.0 SOFTWARE-2077 -- Added error reporting to pbs_submit - -* Tue Sep 29 2015 Brian Lin - 1.18.14.bosco-1 -- Added PBS Pro support (SOFTWARE-1958) -- Fix for job registry losing track of LSF jobs in its registry (gittrac #5062) -- Added 'blah_disable_limited_proxies' to disable creation of limited proxies -- Reduce 'blah_max_threaded_commands' to 50 (SOFTWARE-1980) - -* Mon Aug 31 2015 Carl Edquist - 1.18.13.bosco-4 -- Rebuild against HTCondor 8.3.8 (SOFTWARE-1995) - -* Mon Jul 20 2015 Mátyás Selmeci 1.18.13.bosco-3 -- bump to rebuild - -* Thu Jun 25 2015 Brian Lin - 1.18.13.bosco-2 -- Rebuild against HTCondor 8.3.6 - -* Thu May 28 2015 Brian Lin - 1.18.13.bosco-1 -- Fixes to PBS and HTCondor submission - -* Tue Apr 28 2015 Brian Lin - 1.18.12.bosco-2 -- Rebuild against HTCondor 8.3.5 - -* Mon Mar 30 2015 Brian Lin - 1.18.12.bosco-1 -- Source profile.lsf for LSF job submission - -* Wed Dec 03 2014 Mátyás Selmeci 1.18.11.bosco-4 -- Fix syntax error in condor_submit.sh -- Source OSG job environment variables in generated submit scripts for pbs, - lsf, sge, and slurm jobmanagers (SOFTWARE-1709) - -* Mon Oct 27 2014 Brian Lin - 1.18.11.bosco-3 -- Rebuild against condor-8.2.3 - -* Mon Oct 20 2014 Carl Edquist - 1.18.11.bosco-2 -- Build fixes for el7 (SOFTWARE-1604) - -* Mon Sep 29 2014 Brian Lin - 1.18.11.bosco-1 -- Fix bug in PBS status script - -* Thu Sep 25 2014 Brian Lin - 1.18.10.bosco-1 -- Fixes to LSF scripts pushed upstream (SOFTWARE-1589, creating a temp file in /tmp) -- Fix to PBS script that tracks job status (SOFTWARE-1594) - -* Mon Aug 25 2014 Brian Lin - 1.18.9.bosco-2 -- Fix for memory allocation failure when tracking LSF jobs (SOFTWARE-1589) - -* Thu Jan 09 2014 Brian Bockelman - 1.18.9.bosco-1 -- Fix proxy renewal in the case where no home directory exists. -- Improve packaging of local customization scripts and include defaults. 
- These are now marked as config files and places in /etc. -- Change name of documentation directory to reflect RPM name. - -* Tue Jan 07 2014 Brian Bockelman - 1.18.8.bosco-1 -- Fixes from PBS testing. Blahp now handles multiple arguments correctly - and the wrapper script will remove the job proxy after it finishes. - -* Wed Oct 30 2013 Matyas Selmeci - 1.18.7.bosco-2 -- Bump to rebuild against condor-7.8.8-x (OSG-3.1) and condor-8.0.4-x (OSG 3.2) - -* Fri Sep 20 2013 Brian Bockelman - 1.18.7.bosco-1 -- Do not close stderr fd from the blah. - -* Tue May 14 2013 Brian Bockelman - 1.18.5.bosco-1 -- Alter the pbs_status.py locking algorithm to add random component to - sleeps between poll. - -* Thu Jan 17 2013 Derek Weitzel - 1.18.4.bosco-1 -- Fixing pbs_status.py via upstream SOFTWARE-905 - -* Thu Dec 13 2012 Brian Bockelman 1.18.3.bosco-1.osg -- Merge BOSCO and OSG distribution of blahp. - -* Wed Dec 05 2012 John Thiltges 1.18.0.4-9.osg -- Fix pbs_status.sh in spec file - -* Fri Oct 12 2012 Brian Bockelman - 1.18.0.4-8.osg -- Pull in all remaining patches from the OSG-CE work. -- Fix non-standard qstat locations. -- Fix arg escaping in Condor. -- Fix submissions with a relative proxy path. -- Release bumped a few extra versions to stay in line with the Caltech Koji. - -* Wed Aug 29 2012 Matyas Selmeci - 1.18.0.4-5.osg -- Fixed paths in init script -- Added default options for condor - -* Wed Jul 25 2012 Matyas Selmeci - 1.18.0.4-4.osg -- Disable autostart of blah parser - -* Thu May 31 2012 Brian Bockelman - 1.18.0.4-3 -- Add caching for PBS script. - -* Mon May 28 2012 Brian Bockelman -1.18.0.4-2 -- Import patches from Condor team. - -* Mon May 28 2012 Brian Bockelman -1.18.0.4-1 -- Update to latest upstream. - -* Fri Sep 16 2011 Brian Bockelman - 1.16.1-3 -- Rev bump for GT 5.2 recompile. - -* Wed Jan 05 2011 Brian Bockelman 1.16.1-1 -- Initial RPM packaging - From 8fe02e41a5939f0c09d6dfc56e7b24844b79ad9b Mon Sep 17 00:00:00 2001 From: Suchandra Thapa Date: Thu, 1 Nov 2018 15:13:20 -1000 Subject: [PATCH 158/169] Use new condor env format --- src/scripts/condor_submit.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 10849855..1d6290e0 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -201,19 +201,27 @@ submit_file_environment="#" if [ "x$environment" != "x" ] ; then # Input format is suitable for bourne shell style assignment. Convert to -# old condor format (no double quotes in submit file). -# FIXME: probably it's better to convert everything into the 'new' Condor -# environment format. 
+# new condor format to avoid errors when things like LS_COLORS (which +# has semicolons in it) get captured eval "env_array=($environment)" submit_file_environment="" for env_var in "${env_array[@]}"; do if [ "x$submit_file_environment" == "x" ] ; then - submit_file_environment="environment = " + submit_file_environment="environment = \" ${env_var}" else - submit_file_environment="$submit_file_environment;" + # check for spaces in env_var + pattern=" " + if [[ $env_var =~ $pattern ]]; then + fixed_env_var="${env_var}'" + fixed_env_var=`echo ${fixed_env_var} | sed -e "s|=|='|"` + echo "12 -- ${fixed_env_var}" >> ~/foo + submit_file_environment="${submit_file_environment} ${fixed_env_var} " + else + submit_file_environment="${submit_file_environment} ${env_var} " + fi fi - submit_file_environment="${submit_file_environment}${env_var}" done + submit_file_environment="${submit_file_environment}\"" else if [ "x$envir" != "x" ] ; then # Old Condor format (no double quotes in submit file) From 6294ef063a196295fa097df33cde3e73594fda44 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Mon, 26 Nov 2018 13:10:28 -0600 Subject: [PATCH 159/169] cleanup $submit_file_environment calculation always map key=val -> key='val', and don't do it in a big loop. the $sq variable is used for single quotes, to avoid quoting weirdness inside of the env_array substitution lines. --- src/scripts/condor_submit.sh | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 1d6290e0..15b0fd35 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -204,24 +204,10 @@ if [ "x$environment" != "x" ] ; then # new condor format to avoid errors when things like LS_COLORS (which # has semicolons in it) get captured eval "env_array=($environment)" - submit_file_environment="" - for env_var in "${env_array[@]}"; do - if [ "x$submit_file_environment" == "x" ] ; then - submit_file_environment="environment = \" ${env_var}" - else - # check for spaces in env_var - pattern=" " - if [[ $env_var =~ $pattern ]]; then - fixed_env_var="${env_var}'" - fixed_env_var=`echo ${fixed_env_var} | sed -e "s|=|='|"` - echo "12 -- ${fixed_env_var}" >> ~/foo - submit_file_environment="${submit_file_environment} ${fixed_env_var} " - else - submit_file_environment="${submit_file_environment} ${env_var} " - fi - fi - done - submit_file_environment="${submit_file_environment}\"" + sq="'" # map key=val -> key='val' + env_array=("${env_array[@]/=/=$sq}") + env_array=("${env_array[@]/%/$sq}") + submit_file_environment="environment = \"${env_array[*]}\"" else if [ "x$envir" != "x" ] ; then # Old Condor format (no double quotes in submit file) From b6a617bbede7058cd89404506ace9f47a9172507 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Thu, 7 Feb 2019 16:44:47 -0600 Subject: [PATCH 160/169] name trap signals explicitly, for clarity (SOFTWARE-3554) but who knows what XCPU is doing in this list... 
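For reference, a minimal standalone sketch of the equivalence (illustrative
only; `scratch_dir` and `do_cleanup` are stand-ins, not code from this patch):

    #!/bin/sh
    scratch_dir=$(mktemp -d)
    do_cleanup () { rm -rf "$scratch_dir"; }  # stand-in for the wrapper's real cleanup

    # Numeric form (before): the reader must know that
    # 1 2 3 15 24 mean HUP INT QUIT TERM XCPU
    trap 'do_cleanup; exit 255' 1 2 3 15 24

    # Named form (after): the same trap, self-documenting
    trap 'do_cleanup; exit 255' HUP INT QUIT TERM XCPU
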
--- src/scripts/blah_common_submit_functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 451d1601..14e00245 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -661,8 +661,8 @@ function bls_start_job_wrapper () fi echo "mkdir \$new_home" - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home; exit 255' 1 2 3 15 24" - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home' 0" + echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home; exit 255' HUP INT QUIT TERM XCPU" + echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home' EXIT" echo "# Copy into new home any shared input sandbox file" bls_fl_subst_and_dump inputcopy "cp \"@@F_LOCAL\" \"\$new_home/@@F_REMOTE\" &> /dev/null" From a69d6fd59eed4b90fa1f27ad24d9f2cf967a0343 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Fri, 8 Feb 2019 16:47:01 -0600 Subject: [PATCH 161/169] proper escaping of $new_home (SOFTWARE-3554) --- src/scripts/blah_common_submit_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 14e00245..42f3c44f 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -660,7 +660,7 @@ function bls_start_job_wrapper () echo "new_home=\${old_home}/$run_dir" fi - echo "mkdir \$new_home" + echo 'mkdir "$new_home"' echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home; exit 255' HUP INT QUIT TERM XCPU" echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home' EXIT" From d9248c8dfd9b547c79c1513924f820413301adb7 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Fri, 8 Feb 2019 16:48:37 -0600 Subject: [PATCH 162/169] forward signals to $job_pid before waiting (SOFTWARE-3554) --- src/scripts/blah_common_submit_functions.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index 42f3c44f..12313620 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -661,8 +661,11 @@ function bls_start_job_wrapper () fi echo 'mkdir "$new_home"' - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home; exit 255' HUP INT QUIT TERM XCPU" - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home' EXIT" + echo 'job_wait_cleanup () { wait "$job_pid"; cd "$old_home"; rm -rf "$new_home"; }' + echo 'on_signal () { kill -$1 "$job_pid"; job_wait_cleanup; exit 255; }' + echo 'trap_sigs () { for sig; do trap "on_signal $sig" $sig; done; }' + echo 'trap_sigs HUP INT QUIT TERM XCPU' + echo 'trap job_wait_cleanup EXIT' echo "# Copy into new home any shared input sandbox file" bls_fl_subst_and_dump inputcopy "cp \"@@F_LOCAL\" \"\$new_home/@@F_REMOTE\" &> /dev/null" From 22693911d961044e245b860556a6a3c0b0047900 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Wed, 13 Feb 2019 19:19:24 -0600 Subject: [PATCH 163/169] condor externals: apply blahp/1.16.5.1/pbs-qdel.patch --- src/scripts/pbs_cancel.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/scripts/pbs_cancel.sh b/src/scripts/pbs_cancel.sh index f6e7439f..618dddcc 100755 --- a/src/scripts/pbs_cancel.sh +++ b/src/scripts/pbs_cancel.sh @@ -35,6 +35,13 @@ for job in $@ ; do requested=`echo $job | sed 's/^.*\///'` cmdout=`${pbs_binpath}/qdel 
$requested 2>&1` retcode=$? + # If the job is already completed or no longer in the queue, + # treat it as successfully deleted. + if echo "$cmdout" | grep -q 'Unknown Job' ; then + retcode=0 + elif echo "$cmdout" | grep -q 'Request invalid for state of job MSG=invalid state for job - COMPLETE' ; then + retcode=0 + fi if [ "$retcode" == "0" ] ; then if [ "$jnr" == "1" ]; then echo " 0 No\\ error" From ac020455dad36174d3842753d6009ee4fce373f5 Mon Sep 17 00:00:00 2001 From: Carl Edquist Date: Wed, 13 Feb 2019 19:19:24 -0600 Subject: [PATCH 164/169] condor externals: apply blahp/1.16.5.1/pbs-status.patch --- src/scripts/pbs_status.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index 51cda061..9c850f52 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -208,6 +208,9 @@ BEGIN { END { if (current_js ~ "Q") {jobstatus = 1} + if (current_js ~ "W") {jobstatus = 1} + if (current_js ~ "S") {jobstatus = 1} + if (current_js ~ "T") {jobstatus = 1} if (current_js ~ "R") {jobstatus = 2} if (current_js ~ "E") {jobstatus = 2} if (current_js ~ "C") {jobstatus = 4} From 9b9afeac0d4ccf5fd75ea561a49f6d71cad1a437 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 23 Apr 2019 16:23:34 -0500 Subject: [PATCH 165/169] Update PBS Pro qstat options for completed jobs (SOFTWARE-3675) --- src/scripts/pbs_status.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index 6457a95a..f85021ce 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -234,8 +234,10 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if config.get('pbs_pro').lower() != 'yes': - command += ('-1',) # -1 conflicts with -f in PBS Pro + if config.get('pbs_pro').lower() == 'yes': + command += ('-x',) # also query for detailed output of finished jobs + else: + command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: command += (jobid,) qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) From 331415529e8afe8819dddd4b9f40d7a80da11d81 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Wed, 8 May 2019 15:32:22 -0500 Subject: [PATCH 166/169] Only query for completed jobs with a specific job ID (SOFTWARE-3675) --- src/scripts/pbs_status.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py index f85021ce..68cd54c8 100755 --- a/src/scripts/pbs_status.py +++ b/src/scripts/pbs_status.py @@ -234,12 +234,11 @@ def qstat(jobid=""): starttime = time.time() log("Starting qstat.") command = (qstat_bin, '-f') - if config.get('pbs_pro').lower() == 'yes': - command += ('-x',) # also query for detailed output of finished jobs - else: + pbs_pro = config.get('pbs_pro').lower() == 'yes' + if not pbs_pro: command += ('-1',) # -1 conflicts with -f in PBS Pro if jobid: - command += (jobid,) + command += ('-x', jobid) if pbs_pro else (jobid,) qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) qstat_out, _ = qstat_proc.communicate() result = parse_qstat(qstat_out) From 4510dbd1d6a3e06589c0777bd3a3528b3b1b8046 Mon Sep 17 00:00:00 2001 From: Brian Lin Date: Tue, 23 Apr 2019 17:32:53 -0500 Subject: [PATCH 167/169] Use the original proxy if blahp proxy delegation is disabled (SOFTWARE-3661) Without this change, the blahp copies the job proxy into its own working dir and uses that. 
Unfortunately, it seems that the blahp relies on the BPRclient to pass
along any proxy renewals, and the BPRclient doesn't run when
"blah_disable_wn_proxy_renewal=yes"
---
 src/scripts/blah_common_submit_functions.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh
index 12313620..dde85030 100644
--- a/src/scripts/blah_common_submit_functions.sh
+++ b/src/scripts/blah_common_submit_functions.sh
@@ -514,11 +514,14 @@ function bls_setup_all_files ()
         bls_proxy_remote_file=${bls_tmp_name}.proxy
         bls_test_shared_dir "$bls_proxy_local_file"
         if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then
-            bls_fl_add_value inputcopy "$bls_proxy_local_file" "${bls_proxy_remote_file}"
+            if [ "x$bls_opt_proxyrenew" == "xyes" ] ; then
+                bls_fl_add_value inputcopy "$bls_proxy_local_file" "${bls_proxy_remote_file}"
+                bls_need_to_reset_proxy=yes
+            fi
         else
-            bls_fl_add_value inputsand "$bls_proxy_local_file" "${blah_wn_inputsandbox}${bls_proxy_remote_file}" "$bls_proxy_remote_file"
+            bls_fl_add_value inputsand "$bls_proxy_local_file" "${blah_wn_inputsandbox}${bls_proxy_remote_file}" "$bls_proxy_remote_file"
+            bls_need_to_reset_proxy=yes
         fi
-        bls_need_to_reset_proxy=yes
     fi
 fi

@@ -679,6 +682,8 @@ function bls_start_job_wrapper ()
     if [ "x$bls_need_to_reset_proxy" == "xyes" ] ; then
         echo "# Resetting proxy to local position"
         echo "export X509_USER_PROXY=\$new_home/${bls_proxy_remote_file}"
+    elif [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] ; then
+        echo "export X509_USER_PROXY=${bls_proxy_local_file}"
     fi

     # Add the command (with full path if not staged)

From d42bd8049c5ff73c6e7298afb0052dec0bc7efa3 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Thu, 9 May 2019 18:46:00 -0500
Subject: [PATCH 168/169] escape single-quote and double-quote characters in environment

---
 src/scripts/condor_submit.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh
index 15b0fd35..194bb1b9 100755
--- a/src/scripts/condor_submit.sh
+++ b/src/scripts/condor_submit.sh
@@ -204,9 +204,14 @@ if [ "x$environment" != "x" ] ; then
 # new condor format to avoid errors when things like LS_COLORS (which
 # has semicolons in it) get captured
     eval "env_array=($environment)"
-    sq="'" # map key=val -> key='val'
+    dq='"'
+    sq="'"
+    # map key=val -> key='val'
     env_array=("${env_array[@]/=/=$sq}")
     env_array=("${env_array[@]/%/$sq}")
+    # escape single-quote and double-quote characters (by doubling them)
+    env_array=("${env_array[@]//$sq/$sq$sq}")
+    env_array=("${env_array[@]//$dq/$dq$dq}")
     submit_file_environment="environment = \"${env_array[*]}\""
 else
     if [ "x$envir" != "x" ] ; then

From 4e67757db61241ac8953294edacb4a6a352bdfd5 Mon Sep 17 00:00:00 2001
From: Carl Edquist
Date: Wed, 15 May 2019 14:53:33 -0500
Subject: [PATCH 169/169] escape existing quote characters first (SOFTWARE-3589)

that is, before adding surrounding quotes
---
 src/scripts/condor_submit.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh
index 194bb1b9..697286a1 100755
--- a/src/scripts/condor_submit.sh
+++ b/src/scripts/condor_submit.sh
@@ -206,12 +206,12 @@ if [ "x$environment" != "x" ] ; then
     eval "env_array=($environment)"
     dq='"'
     sq="'"
-    # map key=val -> key='val'
-    env_array=("${env_array[@]/=/=$sq}")
-    env_array=("${env_array[@]/%/$sq}")
     # escape single-quote and double-quote characters (by doubling them)
env_array=("${env_array[@]//$sq/$sq$sq}") env_array=("${env_array[@]//$dq/$dq$dq}") + # map key=val -> key='val' + env_array=("${env_array[@]/=/=$sq}") + env_array=("${env_array[@]/%/$sq}") submit_file_environment="environment = \"${env_array[*]}\"" else if [ "x$envir" != "x" ] ; then
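
A standalone sketch of the resulting pipeline, showing why the ordering
matters (the FOO/BAR values are invented for illustration; this is not part
of the patch):

    #!/bin/bash
    # $environment arrives in bourne-shell assignment form
    environment="FOO='two words' BAR=don\'t"
    eval "env_array=($environment)"    # -> "FOO=two words" "BAR=don't"
    dq='"'
    sq="'"
    # escape existing quotes first (by doubling them)...
    env_array=("${env_array[@]//$sq/$sq$sq}")
    env_array=("${env_array[@]//$dq/$dq$dq}")
    # ...then wrap each value: key=val -> key='val'
    env_array=("${env_array[@]/=/=$sq}")
    env_array=("${env_array[@]/%/$sq}")
    echo "environment = \"${env_array[*]}\""
    # prints: environment = "FOO='two words' BAR='don''t'"
    # With the pre-patch order, BAR would come out as BAR=''don''t'':
    # the doubling step would also double the quotes that were just added.

In HTCondor's "new" environment syntax the whole string is double-quoted and
a literal single quote inside a single-quoted value is written as two single
quotes, which is exactly what escape-then-wrap produces.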