diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..e6108ade --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +CMakeCache.txt +CMakeFiles +CPackSourceConfig.cmake +CPackConfig.cmake +src/*.o +Makefile +Makefile.in +config/Makefile +config/Makefile.in +src/Makefile +src/Makefile.in +aclocal.m4 +autom4te.cache +config.log +config.status +configure +doc/blah_check_config.1 +doc/blah_job_registry_add.1 +/blah_job_registry_dump.1 +doc/blah_job_registry_lkup.1 +doc/blah_job_registry_scan_by_subject.1 +doc/blah_job_registry_dump.1 +doc/blahpd.1 +libtool +project/compile +project/config.guess +project/config.sub +project/depcomp +project/install-sh +project/libtool.m4 +project/ltmain.sh +project/ltoptions.m4 +project/ltsugar.m4 +project/ltversion.m4 +project/lt~obsolete.m4 +project/missing +src/.deps/ +src/BLClient +src/BLParserLSF +src/BLParserPBS +src/BNotifier +src/BUpdaterCondor +src/BUpdaterLSF +src/BUpdaterPBS +src/BUpdaterSGE +src/autogen/ +src/blah_check_config +src/blah_job_registry_add +src/blah_job_registry_dump +src/blah_job_registry_lkup +src/blah_job_registry_purge +src/blah_job_registry_scan_by_subject +src/blahpd +src/blahpd_daemon +src/blparser_master +src/test_cmdbuffer +src/test_job_registry_access +src/test_job_registry_create +src/test_job_registry_purge +src/test_job_registry_update +src/test_job_registry_update_from_network diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..bd700f71 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,62 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 5-Nov-2012 Created + +cmake_minimum_required(VERSION 2.6) + +#project(glite-ce-blahp) +project(BLAH) + +# CPack info + +set(CPACK_RPM_PACKAGE_RELEASE "0") + +if (UNIX AND NOT APPLE) +set(CPACK_GENERATOR "STGZ;DEB;RPM") +execute_process(COMMAND uname -i + 
OUTPUT_VARIABLE local_arch OUTPUT_STRIP_TRAILING_WHITESPACE) +set(CPACK_SYSTEM_NAME "${CPACK_RPM_PACKAGE_RELEASE}.${local_arch}") +endif (UNIX AND NOT APPLE) + +# Default to /usr, but honour a prefix supplied by the user (-DCMAKE_INSTALL_PREFIX=...). +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "Installation prefix" FORCE) +endif () + +set(CPACK_PACKAGE_VENDOR "EMI") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Batch Local ASCII Helper Protocol suite") +set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/blah_description.txt") +set(CPACK_PACKAGE_VERSION_MAJOR "1") +set(CPACK_PACKAGE_VERSION_MINOR "18") +set(CPACK_PACKAGE_VERSION_PATCH "1") +set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") + +# For the following setting, see CMAKE manual, page 155 +# or http://public.kitware.com/Bug/view.php?id=7000 +set(CPACK_SET_DESTDIR ON) +set(CPACK_PACKAGE_RELOCATABLE "false") + +set(CPACK_PACKAGE_CONTACT "blah@mi.infn.it") +set(CPACK_RPM_PACKAGE_LICENSE "Apache Software License") +set(CPACK_RPM_PACKAGE_GROUP "Applications/Internet") + +set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/project/glite-ce-blahp.post) +set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/project/glite-ce-blahp.preun) + +add_subdirectory(src build) +add_subdirectory(config) +add_subdirectory(doc) + +install(FILES LICENSE + DESTINATION share/doc/${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION}) diff --git a/Makefile.am b/Makefile.am index 7e097586..db07c2bb 100755 --- a/Makefile.am +++ b/Makefile.am @@ -25,7 +25,7 @@ doc_DATA = LICENSE SUBDIRS = src config doc ## Default flags to run aclocal -ACLOCAL_AMFLAGS = -I project -I ../org.glite/project +ACLOCAL_AMFLAGS = -I project stage: @set fnord $(MAKEFLAGS); amf=$$2; \ diff --git a/blah_description.txt b/blah_description.txt new file mode 100644 index 00000000..70d04f67 --- /dev/null +++ b/blah_description.txt @@ -0,0 +1,2 @@ +The BLAHP daemon is a light component accepting commands to manage jobs on different Local Resources Management Systems + diff --git a/bootstrap 
b/bootstrap index 4d8ecb1b..32993aea 100755 --- a/bootstrap +++ b/bootstrap @@ -24,7 +24,7 @@ set -x if [ -d ../org.glite/project ]; then aclocal -I project -I ../org.glite/project else - aclocal -I project + aclocal -I project -I /usr/share/glite-build-common-cpp/m4 fi libtoolize --force diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt new file mode 100644 index 00000000..58de7947 --- /dev/null +++ b/config/CMakeLists.txt @@ -0,0 +1,30 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 5-Nov-2012 Created + +cmake_minimum_required(VERSION 2.6) + +install(FILES + blah.config.template + blparser.conf.template + DESTINATION /etc) + +install(FILES + glite-ce-blah-parser + glite-ce-check-blparser + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION /etc/rc.d/init.d) + diff --git a/config/Makefile.am b/config/Makefile.am index ac2401a6..c48e81bf 100644 --- a/config/Makefile.am +++ b/config/Makefile.am @@ -26,6 +26,7 @@ EXTRA_DIST = blah.config.template \ glite-ce-blah-parser \ glite-ce-check-blparser +initdir = rc.d/init.d bldir = $(sysconfdir)/$(initdir) bl_SCRIPTS = glite-ce-blah-parser glite-ce-check-blparser diff --git a/config/blah.config.template b/config/blah.config.template index c4fcee11..6b5b50a3 100644 --- a/config/blah.config.template +++ b/config/blah.config.template @@ -1,44 +1,140 @@ -#Supported batch systems -supported_lrms=pbs,lsf +## +#####Common section +## + +##Blah common variables + +#Supported batch systems (e.g. pbs,lsf) +supported_lrms=pbs,lsf,sge,slurm,condor #DGAS logfile BLAHPD_ACCOUNTING_INFO_LOG= #Set to yes if you wish to disable BLAH's machinery for transferring -#or delegating proxies to the worker node where a job is running. 
-blah_disable_wn_proxy_renewal=no +#or delegating proxies to the worker node where a job is running. (default = no) +blah_disable_wn_proxy_renewal=yes #Set to yes to enable delegation (instead of copy) of renewed proxies #to worker nodes. NOTE: limited *and* delegated proxes are not #accepted for default GSI authentication as of VDT 1.2, so this should -#be enabled only if non-limited proxies are used for proxy renewal. -blah_delegate_renewed_proxies=no +#be enabled only if non-limited proxies are used for proxy renewal. (default = no) +blah_delegate_renewed_proxies= + +#Set to yes to disable creation of a limited proxy. (default = no) +blah_disable_limited_proxy=yes + +#max number of concurrent threads to serve commands (default = 500) +blah_max_threaded_cmds=50 + +#Colon-separated list of paths that are shared among batch system +#head and worker nodes. +blah_shared_directories=/ + +#By default the job temporary work directory is created as a subdirectory +#of wherever the batch system is configured to land the job. +#This variable changes the location where the work directory is created. +#A shell variable escaped or in single quotes will be resolved on the +#worker node in the job environment. Non-escaped variables will be resolved +#on the submit node in the submit environment. +#blah_wn_temporary_home_dir='$GLITE_LOCATION_TMP' + +#These two attributes allow to change the directory on the worker node where +#the batch system is instructed to transfer input/output sandbox files to +#and from. +#These can be set in case the batch system default is not good enough +#(e.g.: the batch systems leaves output files behind) +#These variables can be resolved on the submit node -only-. +#blah_wn_inputsandbox=/tmp +#blah_wn_outputsandbox=/tmp + +#The following configuration attributes allow for additional +#submit command attributes to be set in the local shell callout +#for batch system customizations. 
+# +#Set this variable to pass all submit command attributes: +#blah_pass_all_submit_attributes=yes +# +#-Or- select individual attributes as follows: +#blah_pass_submit_attributes[0]="x509UserProxySubject" +#blah_pass_submit_attributes[1]="x509UserProxy" + +#timeout before blah kill a process (default 20) +blah_graceful_kill_timeout= + +#Enable condor/glexec commands (default no) +blah_enable_glexec_from_condor= + +#umask for accounting log +blah_accounting_log_umask= + +#interval between two child consecutive restart (default 150) +blah_children_restart_interval= + +#if blah requires proxy on submit (default no) +blah_require_proxy_on_submit= + +#disable proxy user copy (default no) +#blah_disable_proxy_user_copy=yes + +##PBS common variables #Path where PBS executables are located -pbs_binpath=/usr/pbs/bin +# NOTE: this path is computed many times; I worry about the overhead here. -BB, 12-13-2012 +pbs_binpath=/usr/bin #Path where the PBS logs are located ($pbs_spoolpath/server_logs) -pbs_spoolpath=/usr/spool/PBS +#pbs_spoolpath= #If it is set to yes blah does not check the jobid in the logfiles pbs_nochecksubmission= #If it is set to yes blah does NOT use log files to get job status, #but uses only standard LRMS query (qstat) -pbs_nologaccess= +pbs_nologaccess=yes #If it is set to no blah scripts for PBS will not try to read #locally from the logs if BLParser is not present pbs_fallback=no +#Set to 'yes' to request pvmem when submitting jobs +pbs_set_pvmem=no + +#Set to 'yes' if you are running PBS Pro +pbs_pro=no + +##LSF common variables + +#Path where LSF executables are located +lsf_binpath=/usr/bin + +#Path where the LSF conf file is located ($lsf_confpath/lsf.conf) +lsf_confpath= + +#If it is set to yes blah does not check the jobid in the logfiles +lsf_nochecksubmission= + +#If it is set to yes blah does NOT use log files to get job status, +#but uses only standard LRMS query (bhist) +lsf_nologaccess= + +#If it is set to no blah scripts for LSF will not 
try to read +#locally from the logs if BLParser is not present +lsf_fallback=no + +## +#####BLParser section +## + +##PBS subsection + #Set to yes to use Blah Log Parser for PBS pbs_BLParser= #Host where Blah Log Parser for PBS is running -pbs_BLPserver=127.0.0.1 +pbs_BLPserver= #Port where Blah Log Parser for PBS is running -pbs_BLPport=33332 +pbs_BLPport= #Number of Blah Log Parser to try for PBS (if it is null pbs_BLPserver and pbs_BLPport are used) pbs_num_BLParser= @@ -61,33 +157,16 @@ pbs_BLPport2= # use it as failback solution if neither 'yes' nor 'no' works for you. blah_torque_multiple_staging_directive_bug=no -#### - -#Path where LSF executables are located -lsf_binpath=/usr/local/lsf/bin - -#Path where the LSF conf file is located ($lsf_confpath/lsf.conf) -lsf_confpath=/etc - -#If it is set to yes blah does not check the jobid in the logfiles -lsf_nochecksubmission= - -#If it is set to yes blah does NOT use log files to get job status, -#but uses only standard LRMS query (bhist) -lsf_nologaccess= - -#If it is set to no blah scripts for LSF will not try to read -#locally from the logs if BLParser is not present -lsf_fallback=no +##LSF subsection #Set to yes to use Blah Log Parser for LSF lsf_BLParser= #Host where Blah Log Parser for LSF is running -lsf_BLPserver=127.0.0.1 +lsf_BLPserver= #Port where Blah Log Parser for LSF is running -lsf_BLPport=33333 +lsf_BLPport= #Number of Blah Log Parser to try for LSF (if it is null lsf_BLPserver and lsf_BLPport are used) lsf_num_BLParser= @@ -98,32 +177,28 @@ lsf_BLPport1= lsf_BLPserver2= lsf_BLPport2= -# -#LSF Updater -# -#number of logs to be read by bhist (default:3) -bhist_logs_to_read= -# -# Condor -# +## +#####BUpdater/BNotifier section +## + +#seconds to sleep in the main loop +loop_interval= + +## +#####BUpdater subsection +## + +##Common BUpdater variables #Updater location bupdater_path= -#Notifier location -bnotifier_path= - #Updater pid file bupdater_pidfile=/var/tmp/cream_tomcat_bupdater.pid -#Notifier 
pid file -bnotifier_pidfile=/var/tmp/cream_tomcat_bnotifier.pid - -#condor bin location -condor_binpath=/opt/condor-c/bin - #Registry file location -job_registry=/var/tmp/cream_tomcat_registry.db +job_registry= + #Set the following variable to 'yes' to have multiple BLAHPD instances #share the job registry -index- via mmap: job_registry_use_mmap=no @@ -140,68 +215,120 @@ bupdater_debug_level=1 #bupdater debug log file bupdater_debug_logfile=/var/tmp/bupdater.log -#bnotifier debug level -bnotifier_debug_level=1 - -#bnotifier debug log file -bnotifier_debug_logfile=/var/tmp/bnotifier.log - # purge interval purge_interval=7200 -#after that interval a bhist with -n bhist_logs_to_read is tried (default:120) -bhist_finalstate_interval=120 - #Minimum interval of time between the last update of a jobid entry and the first finalstate query try (default:30) finalstate_query_interval=30 #after that interval an unseen job is set as done (status == 4) and exitstatus == 999 (default:3600) alldone_interval=3600 -#path to condor_config -export CONDOR_CONFIG="/opt/condor-c/etc/condor_config" +#Command used to cache info about the job in the batch system +batch_command_caching_filter= -#max number of concurrent threads to serve commands (default = 500) -#blah_max_threaded_cmds=100 +#poll timeout +bupdater_child_poll_timeout= -#seconds to sleep in the main loop -#loop_interval= +#set to yes to enable the blah clustering +job_registry_add_remote= + +#time interval between consistency check of blah registry (default 3600) +bupdater_consistency_check_interval= + +##LSF + +#number of logs to be read by bhist (default:3) +bhist_logs_to_read= + +#after that interval a bhist with -n bhist_logs_to_read is tried (default:120) +bhist_finalstate_interval=120 #use the long format for bjobs command (-l instead of -w) (yes|no) (default=yes) bupdater_bjobs_long_format=yes +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) 
+lsf_batch_caching_enabled= + #use bhist to calculate suspended jobs timestamp bupdater_use_bhist_for_susp=no -#Colon-separated list of paths that are shared among batch system -#head and worker nodes. -#blah_shared_directories=/home:/users +#if set to yes bhist uses a time constraint to reduce the output (default no) +bupdater_use_bhist_time_constraint= -#By default the job temporary work directory is created as a subdirectory -#of wherever the batch system is configured to land the job. -#This variable changes the location where the work directory is created. -#A shell variable escaped or in single quotes will be resolved on the -#worker node in the job environment. Non-escaped variables will be resolved -#on the submit node in the submit environment. -#blah_wn_temporary_home_dir='$GLITE_LOCATION_TMP' +#use btools (default no) +bupdater_use_btools= -#These two attributes allow to change the directory on the worker node where -#the batch system is instructed to transfer input/output sandbox files to -#and from. -#These can be set in case the batch system default is not good enough -#(e.g.: the batch systems leaves output files behind) -#These variables can be resolved on the submit node -only-. -#blah_wn_inputsandbox=/tmp -#blah_wn_outputsandbox=/tmp +#btools path (default /usr/local/bin) +bupdater_btools_path= -#The following configuration attributes allow for additional -#submit command attributes to be set in the local shell callout -#for batch system customizations. 
-# -#Set this variable to pass all submit command attributes: -#blah_pass_all_submit_attributes=yes -# -#-Or- select individual attributes as follows: -#blah_pass_submit_attributes[0]="x509UserProxySubject" -#blah_pass_submit_attributes[1]="x509UserProxy" +#use bhist for jobs in idle state (default yes) +bupdater_use_bhist_for_idle= + +#use bhist for killed jobs (default yes) +bupdater_use_bhist_for_killed= + +##PBS + +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) +pbs_batch_caching_enabled= + +#number of logs that tracejob reads (default 2) +tracejob_logs_to_read= + +#max number of lines in tracejob output. This is done to get rid of +# a bug in pbs that causes tracejob to produce a large output (default 1000) +tracejob_max_output= + +##Condor + +#condor bin location +condor_binpath=/usr/bin + +#path to condor_config +#export CONDOR_CONFIG="/etc/condor/condor_config" +#Enable the use of the caching for the batch system commands +#(the command is specified by batch_command_caching_filter) +condor_batch_caching_enabled= + +#Whether condor_history should be used to get the final state info about the jobs. 
+bupdater_use_condor_history= + +##SGE + +sge_binpath= + +sge_cellname=$SGE_CELL + +sge_rootpath=$SGE_ROOT + +#set the SGE parallel environment policy +sge_pe_policy=* + +##SLURM + +#path to the slurm executables +#default: /usr/bin +slurm_binpath=/usr/bin + +#default: /dev/null +slurm_std_storage=/dev/null + +## +#####BNotifier subsection +## + +#Notifier location +bnotifier_path= + +#Notifier pid file +bnotifier_pidfile=/var/tmp/cream_tomcat_bnotifier.pid + +#bnotifier debug level +bnotifier_debug_level=1 + +#bnotifier debug log file +bnotifier_debug_logfile=/var/tmp/bnotifier.log diff --git a/config/glite-ce-blah-parser b/config/glite-ce-blah-parser index cbe081c9..2800790f 100755 --- a/config/glite-ce-blah-parser +++ b/config/glite-ce-blah-parser @@ -19,7 +19,7 @@ # # description: gLite CE blah parser -# chkconfig: 345 94 15 +# chkconfig: - 94 15 # Source function library . /etc/rc.d/init.d/functions @@ -39,7 +39,7 @@ else blparser_bin_directory="${GLITE_LOCATION}/bin" else # Default value when everything else fails. - blparser_bin_directory="/usr/libexec" + blparser_bin_directory="/usr/libexec/blahp" fi fi diff --git a/config/glite-ce-check-blparser b/config/glite-ce-check-blparser index 84a62220..a078ee39 100755 --- a/config/glite-ce-check-blparser +++ b/config/glite-ce-check-blparser @@ -31,7 +31,7 @@ else blparser_bin_directory="${GLITE_LOCATION}/bin" else # Default when everything else fails. - blparser_bin_directory="/usr/libexec" + blparser_bin_directory="/usr/libexec/blahp" fi fi diff --git a/configure.ac b/configure.ac index 62655be3..b3b0aab2 100755 --- a/configure.ac +++ b/configure.ac @@ -196,10 +196,25 @@ AC_HEADER_TIME dnl Checks for library functions. 
AC_CHECK_FUNCS(select socket strdup strerror bsearch vsnprintf mmap munmap) +AC_CHECK_FUNCS(dlopen, ,AC_CHECK_LIB(dl, dlopen)) -GLITE_CHECK_LIBDIR -GLITE_CHECK_INITDIR -AC_GLITE_DOCBOOK +dnl GLITE_CHECK_LIBDIR +dnl GLITE_CHECK_INITDIR +dnl AC_GLITE_DOCBOOK + +GLITE_DB_MANPAGES_STYLESHEET="/usr/share/sgml/docbook/xsl-stylesheets/manpages/docbook.xsl" + +AC_ARG_WITH(manpage_stylesheet, + [ --with-manpage-stylesheet=PATH (path to the docbook stylesheet for man pages ($GLITE_DB_MANPAGES_STYLESHEET)], + [], + with_manpage_stylesheet=$GLITE_DB_MANPAGES_STYLESHEET) + +if ! test -r "$with_manpage_stylesheet" ; then + AC_MSG_ERROR("$with_manpage_stylesheet not found. Try setting --with-manpage-stylesheet.") +fi + +GLITE_DB_MANPAGES_STYLESHEET=$with_manpage_stylesheet +AC_SUBST(GLITE_DB_MANPAGES_STYLESHEET) AC_ARG_WITH(dist_location, [ --with-dist-location=PFX prefix where DIST location is. (pwd)], @@ -221,13 +236,18 @@ AC_SUBST(DISTTAR) AC_CLASSADS([], AC_MSG_RESULT(["CLASSADS ok"]), AC_MSG_ERROR(["CLASSADS not found"])) -have_globus=yes -PKG_CHECK_MODULES(GLOBUS_GSI_CRED, globus-gsi-credential, , have_globus=no) -PKG_CHECK_MODULES(GLOBUS_GSI_PROXY, globus-gsi-proxy-core, , have_globus=no) -PKG_CHECK_MODULES(GLOBUS_GSI_UTILS, globus-gsi-cert-utils, , have_globus=no) -PKG_CHECK_MODULES(GLOBUS_GSS_ASSIST, globus-gss-assist, , have_globus=no) -PKG_CHECK_MODULES(GLOBUS_GSI_SYSCFG, globus-gsi-sysconfig, , have_globus=no) -AC_MSG_RESULT(["GLOBUS found $have_globus"]) +AC_GLOBUS([], have_globus=yes, have_globus=no) + +if test $have_globus = no; then + have_globus=yes + PKG_CHECK_MODULES(GLOBUS_GSI_CRED, globus-gsi-credential, , have_globus=no) + PKG_CHECK_MODULES(GLOBUS_GSI_PROXY, globus-gsi-proxy-core, , have_globus=no) + PKG_CHECK_MODULES(GLOBUS_GSI_UTILS, globus-gsi-cert-utils, , have_globus=no) + PKG_CHECK_MODULES(GLOBUS_GSS_ASSIST, globus-gss-assist, , have_globus=no) + PKG_CHECK_MODULES(GLOBUS_GSI_SYSCFG, globus-gsi-sysconfig, , have_globus=no) + 
PKG_CHECK_MODULES(GLOBUS_GSSAPI_GSI, globus-gssapi-gsi, , have_globus=no) +fi +AC_MSG_RESULT(["GLOBUS found: $have_globus"]) AM_CONDITIONAL([HAVE_GLOBUS], [test "x$bprserver" == "xyes" -a "x$have_globus" == "xyes"]) dnl Temporarily built with no optimisation diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt new file mode 100644 index 00000000..d091e419 --- /dev/null +++ b/doc/CMakeLists.txt @@ -0,0 +1,59 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 5-Nov-2012 Created + +cmake_minimum_required(VERSION 2.6) + +find_program(XSLTPROC_EXECUTABLE xsltproc) +if (NOT XSLTPROC_EXECUTABLE) + message(FATAL_ERROR "xsltproc not found: cannot generate the man pages") +endif () + +find_file(XSLTPROC_MANPAGE_STYLESHEET + NAMES docbook.xsl + PATHS /usr/share/sgml/docbook/xsl-stylesheets/manpages) +if (NOT XSLTPROC_MANPAGE_STYLESHEET) + message(FATAL_ERROR "docbook.xsl man page stylesheet not found") +endif () + +set(MAN1PAGES_TO_CREATE + blah_job_registry_add.1 + blah_job_registry_dump.1 + blah_job_registry_lkup.1 + blah_job_registry_scan_by_subject.1 + blah_check_config.1 + blahpd.1 +) + +# Generate each man page in the build tree from its DocBook source in the source tree, +# so that out-of-source builds work; the stylesheet is expected to write ${manpage} into the working directory. +set(MAN1PAGES_BUILT) +foreach (manpage ${MAN1PAGES_TO_CREATE}) + string(REGEX REPLACE "\\.[1-9]$" ".xml" manpage_src ${manpage}) + add_custom_command(OUTPUT ${manpage} + COMMAND ${XSLTPROC_EXECUTABLE} + ${XSLTPROC_MANPAGE_STYLESHEET} + ${CMAKE_CURRENT_SOURCE_DIR}/${manpage_src} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${manpage_src} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + VERBATIM) + set_source_files_properties(${manpage} PROPERTIES GENERATED TRUE) + list(APPEND MAN1PAGES_BUILT ${CMAKE_CURRENT_BINARY_DIR}/${manpage}) +endforeach() + +add_custom_target(all_manpages ALL + DEPENDS ${MAN1PAGES_BUILT} ) + +install(FILES ${MAN1PAGES_BUILT} DESTINATION man/man1) + diff --git a/project/classads.m4 b/project/classads.m4 new file mode 100644 index 00000000..82fa3795 --- /dev/null +++ b/project/classads.m4 @@ -0,0 +1,83 @@ +dnl Usage: +dnl AC_CLASSAD(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]]) +dnl - CLASSAD_CFLAGS (compiler flags) +dnl - CLASSAD_LIBS (linker flags, stripping and path) +dnl - CLASSAD_DL_LIBS +dnl - CLASSAD_INSTALL_PATH +dnl - CLASSAD_PATH + +AC_DEFUN([AC_CLASSADS], +[ + AC_ARG_WITH(classads_prefix, + [ 
--with-classads-prefix=PFX prefix where the Classad libraries are installed (/usr)], + [], + with_classads_prefix="/usr") + + AC_MSG_CHECKING([for CLASSAD installation]) + + CLASSAD_CFLAGS="" + if test -n "$with_classads_prefix" ; then + AC_MSG_RESULT([prefix: $with_classads_prefix]) + + ac_classads_prefix=$with_classads_prefix + if test "$with_classads_prefix" != "/usr"; then + + CLASSAD_CFLAGS="-I$with_classads_prefix/include -I$with_classads_prefix/include/classad" + CLASSAD_LIBS="-L$with_classads_prefix/lib -lclassad" + CLASSAD_DL_LIBS="-L$with_classads_prefix/lib -lclassad_dl" + else + CLASSAD_CFLAGS="-I$with_classads_prefix/include/classad" + CLASSAD_LIBS="-lclassad" + CLASSAD_DL_LIBS="-lclassad_dl" + fi + fi + + AC_LANG_SAVE + AC_LANG_CPLUSPLUS + ac_save_cppflags=$CPPFLAGS + ac_save_libs=$LIBS + CPPFLAGS="$CLASSAD_CFLAGS $CPPFLAGS" + BASE_LIBS="$LIBS" + LIBS="$CLASSAD_LIBS $LIBS" + AC_MSG_CHECKING([if a small classads program compiles]) + AC_TRY_LINK([ #include ], + [ classad::ClassAd ad; classad::ClassAdParser parser; ], + [ ac_have_classads=yes ], [ ac_have_classads=no ]) + if test x$ac_have_classads = xno ; then + CLASSAD_CFLAGS="$CLASSAD_CFLAGS -DWANT_CLASSAD_NAMESPACE -DWANT_NAMESPACES" + CLASSAD_LIBS="-L$with_classads_prefix/lib -lclassad_ns" + LIBS="$CLASSAD_LIBS $BASE_LIBS" + CPPFLAGS="$CLASSAD_CFLAGS $ac_save_cppflags" + AC_TRY_LINK([ #include ], + [ classad::ClassAd ad; classad::ClassAdParser parser; ], + [ ac_have_classads=yes ], [ ac_have_classads=no ]) + fi + AC_MSG_RESULT([$ac_have_classads]) + + CPPFLAGS=$ac_save_cppflags + LIBS=$ac_save_libs + AC_LANG_RESTORE + + CLASSAD_PATH=$with_classads_prefix + + if test x$ac_have_classads = xyes ; then + CLASSAD_INSTALL_PATH=$ac_classads_prefix + ifelse([$2], , :, [$2]) + else + AC_MSG_WARN([ + *** Cannot compile a small classads program: check whether the + *** Condor ClassADs library is installed]) + CLASSAD_CFLAGS="" + CLASSAD_LIBS="" + CLASSAD_DL_LIBS="" + CLASSAD_PATH="" + ifelse([$3], , :, [$3]) + 
fi + + AC_SUBST(CLASSAD_INSTALL_PATH) + AC_SUBST(CLASSAD_CFLAGS) + AC_SUBST(CLASSAD_LIBS) + AC_SUBST(CLASSAD_DL_LIBS) + AC_SUBST(CLASSAD_PATH) +]) + diff --git a/project/debfiles/control b/project/debfiles/control index 9f6be901..e193fd86 100644 --- a/project/debfiles/control +++ b/project/debfiles/control @@ -2,7 +2,7 @@ Source: glite-ce-blahp Section: net Priority: optional Maintainer: CREAM group -Build-Depends: debhelper (>= 8.0.0~) +Build-Depends: debhelper (>= 8.0.0~), libtool, libclassad-dev, docbook-xsl, xsltproc Standards-Version: 3.9.1 Homepage: http://glite.cern.ch/ diff --git a/project/debfiles/glite-ce-blahp.install b/project/debfiles/glite-ce-blahp.install index 16079fd9..cfdf34c9 100644 --- a/project/debfiles/glite-ce-blahp.install +++ b/project/debfiles/glite-ce-blahp.install @@ -1,3 +1,4 @@ usr/bin/* +usr/sbin/* etc/*.template usr/share/man/man1/*.1.gz diff --git a/project/debfiles/glite-ce-blahp.postinst b/project/debfiles/glite-ce-blahp.postinst new file mode 100755 index 00000000..dd3e2580 --- /dev/null +++ b/project/debfiles/glite-ce-blahp.postinst @@ -0,0 +1,36 @@ +#!/bin/sh + +set -e + +case "$1" in + configure) + + # Only act when no previously configured version is passed as $2. + if test -z "$2"; then + update-rc.d glite-ce-blah-parser start 94 3 4 5 . stop 15 3 4 5 . >/dev/null + + # Create the CREAM directories only when /etc/passwd mentions tomcat6 (POSIX test, no bash-only ==). + if grep -q tomcat6 /etc/passwd ; then + mkdir -p /var/log/cream/accounting + chown root:tomcat6 /var/log/cream/accounting + chmod 0730 /var/log/cream/accounting + + mkdir -p /var/blah + chown tomcat6:tomcat6 /var/blah + chmod 771 /var/blah + + fi + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +exit 0 + diff --git a/project/debfiles/glite-ce-blahp.prerm b/project/debfiles/glite-ce-blahp.prerm new file mode 100755 index 00000000..87783767 --- /dev/null +++ b/project/debfiles/glite-ce-blahp.prerm @@ -0,0 +1,33 @@ +#!/bin/sh + +set -e + +case "$1" in + remove) + # Under "set -e", do not abort the removal if the service cannot be stopped or unregistered. + invoke-rc.d glite-ce-blah-parser stop >/dev/null 2>&1 || true + update-rc.d glite-ce-blah-parser remove >/dev/null 2>&1 || true + + if [ -d /var/log/cream/accounting ] ; then + rm -rf /var/log/cream/accounting + fi + + if [ -d /var/blah ] ; then + rm -rf /var/blah + fi + ;; + + upgrade) + ;; + + deconfigure|failed-upgrade) + ;; + + *) + echo "prerm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +exit 0 + diff --git a/project/glite-ce-blahp_sl_any.spec b/project/glite-ce-blahp_sl_any.spec index a062bd3a..2ff4c1d5 100644 --- a/project/glite-ce-blahp_sl_any.spec +++ b/project/glite-ce-blahp_sl_any.spec @@ -42,6 +42,15 @@ if test "x%{extbuilddir}" == "x--" ; then else cp -R %{extbuilddir}/* %{buildroot} fi +strip -s %{buildroot}/usr/sbin/blah_job_registry_* +strip -s %{buildroot}/usr/sbin/blahpd_daemon +strip -s %{buildroot}/usr/sbin/blah_check_config +strip -s %{buildroot}/usr/libexec/blparser_master +strip -s %{buildroot}/usr/libexec/BLClient +strip -s %{buildroot}/usr/libexec/BUpdater* +strip -s %{buildroot}/usr/libexec/BNotifier +strip -s %{buildroot}/usr/libexec/BLParser* +strip -s %{buildroot}/usr/bin/blahpd %clean @@ -92,6 +101,6 @@ fi %doc /usr/share/man/man1/*.1.gz %changelog -* %(date +"%%a %%b %%d %%Y") CREAM group - %{version}-%{release} +* %{extcdate} CREAM group - 
%{extversion}-%{extage}.%{extdist} - %{extclog} diff --git a/project/glite.m4 b/project/glite.m4 new file mode 100644 index 00000000..55fe945f --- /dev/null +++ b/project/glite.m4 @@ -0,0 +1,38 @@ +dnl Usage: +dnl AC_GLITE +dnl - GLITE_LOCATION +dnl - GLITE_CFLAGS +dnl - DISTTAR + +AC_DEFUN([AC_GLITE], +[ + AC_ARG_WITH(glite_location, + [ --with-glite-location=PFX prefix where GLITE is installed. (/opt/glite)], + [], + with_glite_location=/opt/glite) + + dnl An explicitly empty --with-glite-location= leaves GLITE_LOCATION and GLITE_CFLAGS empty. + if test -n "$with_glite_location" ; then + GLITE_LOCATION="$with_glite_location" + GLITE_CFLAGS="-I$GLITE_LOCATION/include" + else + GLITE_LOCATION="" + GLITE_CFLAGS="" + fi + + AC_MSG_RESULT([GLITE_LOCATION set to $GLITE_LOCATION]) + + AC_SUBST(GLITE_LOCATION) + AC_SUBST(GLITE_CFLAGS) + + AC_ARG_WITH(dist_location, + [ --with-dist-location=PFX prefix where DIST location is. (pwd)], + [], + with_dist_location=$WORKDIR/../dist) + + DISTTAR=$with_dist_location + + AC_SUBST(DISTTAR) + +]) + diff --git a/project/globus.m4 b/project/globus.m4 new file mode 100644 index 00000000..9e4edfd7 --- /dev/null +++ b/project/globus.m4 @@ -0,0 +1,233 @@ +dnl Usage: +dnl AC_GLOBUS(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]]) +dnl - GLOBUS_LOCATION +dnl - GLOBUS_NOTHR_FLAVOR +dnl - GLOBUS_THR_FLAVOR +dnl - GLOBUS_NOTHR_CFLAGS +dnl - GLOBUS_THR_CFLAGS +dnl - GLOBUS_NOTHR_LIBS +dnl - GLOBUS_THR_LIBS +dnl - GLOBUS_COMMON_NOTHR_LIBS +dnl - GLOBUS_COMMON_THR_LIBS +dnl - GLOBUS_STATIC_COMMON_NOTHR_LIBS +dnl - GLOBUS_STATIC_COMMON_THR_LIBS +dnl - GLOBUS_FTP_CLIENT_NOTHR_LIBS +dnl - GLOBUS_FTP_CLIENT_THR_LIBS +dnl - GLOBUS_SSL_NOTHR_LIBS +dnl - GLOBUS_SSL_THR_LIBS +dnl - GLOBUS_STATIC_SSL_NOTHR_LIBS +dnl - GLOBUS_STATIC_SSL_THR_LIBS +dnl - GLOBUS_GSS_NOTHR_LIBS +dnl - GLOBUS_GSS_THR_LIBS +dnl - GLOBUS_LDAP_THR_LIBS + +AC_DEFUN([AC_GLOBUS], +[ + AC_ARG_WITH(globus_prefix, + [ --with-globus-prefix=PFX prefix where GLOBUS is installed. 
($GLOBUS_LOCATION or /opt/globus or pkg-config)], + [], + with_globus_prefix=${GLOBUS_LOCATION:-/opt/globus}) + + AC_ARG_WITH(globus_nothr_flavor, + [ --with-globus-nothr-flavor=flavor [default=gcc32dbg]], + [], + with_globus_nothr_flavor=${GLOBUS_FLAVOR:-gcc32dbg}) + + AC_MSG_RESULT(["GLOBUS nothread flavor is $with_globus_nothr_flavor"]) + + AC_ARG_WITH(globus_thr_flavor, + [ --with-globus-thr-flavor=flavor [default=gcc32dbgpthr]], + [], + with_globus_thr_flavor=${GLOBUS_FLAVOR:-gcc32dbgpthr}) + + AC_MSG_RESULT(["GLOBUS thread flavor is $with_globus_thr_flavor"]) + + ac_cv_globus_nothr_valid=no + ac_cv_globus_thr_valid1=no + ac_cv_globus_thr_valid2=no + + GLOBUS_NOTHR_CFLAGS="$with_globus_prefix/include/$with_globus_nothr_flavor" + GLOBUS_THR_CFLAGS="$with_globus_prefix/include/$with_globus_thr_flavor" + + ac_globus_ldlib="-L$with_globus_prefix/lib" + + GLOBUS_COMMON_NOTHR_LIBS="$ac_globus_ldlib -lglobus_common_$with_globus_nothr_flavor" + GLOBUS_COMMON_THR_LIBS="$ac_globus_ldlib -lglobus_common_$with_globus_thr_flavor" + + GLOBUS_STATIC_COMMON_NOTHR_LIBS="$with_globus_prefix/lib/libglobus_common_$with_globus_nothr_flavor.a" + GLOBUS_STATIC_COMMON_THR_LIBS="$with_globus_prefix/lib/libglobus_common_$with_globus_thr_flavor.a" + + GLOBUS_FTP_CLIENT_NOTHR_LIBS="$ac_globus_ldlib -lglobus_ftp_client_$with_globus_nothr_flavor" + GLOBUS_FTP_CLIENT_THR_LIBS="$ac_globus_ldlib -lglobus_ftp_client_$with_globus_thr_flavor" + + GLOBUS_GSS_NOTHR_LIBS="$ac_globus_ldlib -lglobus_gssapi_gsi_$with_globus_nothr_flavor -lglobus_gss_assist_$with_globus_nothr_flavor" + GLOBUS_GSS_THR_LIBS="$ac_globus_ldlib -lglobus_gssapi_gsi_$with_globus_thr_flavor -lglobus_gss_assist_$with_globus_thr_flavor" + + GLOBUS_SSL_NOTHR_LIBS="$ac_globus_ldlib -lssl_$with_globus_nothr_flavor -lcrypto_$with_globus_nothr_flavor" + GLOBUS_SSL_THR_LIBS="$ac_globus_ldlib -lssl_$with_globus_thr_flavor -lcrypto_$with_globus_thr_flavor" + + 
GLOBUS_STATIC_SSL_NOTHR_LIBS="$with_globus_prefix/lib/libssl_$with_globus_nothr_flavor.a $with_globus_prefix/lib/libcrypto_$with_globus_nothr_flavor.a" + GLOBUS_STATIC_SSL_THR_LIBS="$with_globus_prefix/lib/libssl_$with_globus_thr_flavor.a $with_globus_prefix/lib/libcrypto_$with_globus_thr_flavor.a" + + GLOBUS_LDAP_THR_LIBS="$ac_globus_ldlib -lldap_$with_globus_thr_flavor -llber_$with_globus_thr_flavor" + + dnl Needed by LCAS/LCMAPS voms plugins + GLOBUS_GSI_NOTHR_LIBS="$ac_globus_ldlib -lglobus_gsi_credential_$with_globus_nothr_flavor" + GLOBUS_GSI_THR_LIBS="$ac_globus_ldlib -lglobus_gsi_credential_$with_globus_thr_flavor" + + dnl + dnl check nothr openssl header + dnl + ac_globus_nothr_ssl="$with_globus_prefix/include/$with_globus_nothr_flavor/openssl" + + AC_MSG_CHECKING([for $ac_globus_nothr_ssl/ssl.h]) + + if test ! -f "$ac_globus_nothr_ssl/ssl.h" ; then + ac_globus_nothr_ssl="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + AC_MSG_CHECKING([for openssl nothr]) + + if test -n "$ac_globus_nothr_ssl" ; then + GLOBUS_NOTHR_CFLAGS="-I$ac_globus_nothr_ssl -I$GLOBUS_NOTHR_CFLAGS" + fi + + if test -n "$ac_globus_nothr_ssl" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_NOTHR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "ssl.h" + #include "globus_gss_assist.h" + ], + [globus_gss_assist_ex aex], + [ac_cv_globus_nothr_valid=yes], + [ac_cv_globus_nothr_valid=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_nothr_valid]) + fi + + dnl + dnl check thr openssl header + dnl + ac_globus_thr_ssl="$with_globus_prefix/include/$with_globus_thr_flavor/openssl" + + AC_MSG_CHECKING([for $ac_globus_thr_ssl/ssl.h]) + + if test ! 
-f "$ac_globus_thr_ssl/ssl.h" ; then + ac_globus_thr_ssl="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + if test -n "$ac_globus_thr_ssl" ; then + GLOBUS_THR_CFLAGS="-I$ac_globus_thr_ssl -I$GLOBUS_THR_CFLAGS" + fi + + AC_MSG_CHECKING([checking openssl thr]) + + if test -n "$ac_globus_thr_ssl" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_THR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "openssl/ssl.h" + #include "globus_gss_assist.h" + ], + [globus_gss_assist_ex aex], + [ac_cv_globus_thr_valid1=yes], + [ac_cv_globus_thr_valid1=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_thr_valid1]) + fi + + dnl + dnl check thr ldap header + dnl + ac_globus_thr_ldap="$with_globus_prefix/include/$with_globus_thr_flavor" + + AC_MSG_CHECKING([for $ac_globus_thr_ldap/lber.h]) + + if test ! -f "$ac_globus_thr_ldap/lber.h" ; then + ac_globus_thr_ldap="" + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes]) + fi + + AC_MSG_CHECKING([for ldap thr]) + + if test -n "$ac_globus_thr_ldap" ; then + dnl + dnl maybe do some complex test of globus instalation here later + dnl + ac_save_CFLAGS=$CFLAGS + CFLAGS="$GLOBUS_THR_CFLAGS $CFLAGS" + AC_TRY_COMPILE([ + #include "ldap.h" + #include "lber.h" + ], + [ + LDAPMessage *ldresult; + BerElement *ber; + ], + [ac_cv_globus_thr_valid2=yes], + [ac_cv_globus_thr_valid2=no]) + CFLAGS=$ac_save_CFLAGS + AC_MSG_RESULT([$ac_cv_globus_thr_valid2]) + fi + + if test x$ac_cv_globus_nothr_valid = xyes -a x$ac_cv_globus_thr_valid1 = xyes -a x$ac_cv_globus_thr_valid2 = xyes ; then + GLOBUS_LOCATION=$with_globus_prefix + GLOBUS_NOTHR_FLAVOR=$with_globus_nothr_flavor + GLOBUS_THR_FLAVOR=$with_globus_thr_flavor + ifelse([$2], , :, [$2]) + else + GLOBUS_NOTHR_CFLAGS="" + GLOBUS_THR_CFLAGS="" + GLOBUS_NOTHR_LIBS="" + GLOBUS_THR_LIBS="" + GLOBUS_COMMON_NOTHR_LIBS="" + GLOBUS_COMMON_THR_LIBS="" + GLOBUS_STATIC_COMMON_NOTHR_LIBS="" + 
GLOBUS_STATIC_COMMON_THR_LIBS="" + GLOBUS_FTP_CLIENT_NOTHR_LIBS="" + GLOBUS_FTP_CLIENT_THR_LIBS="" + GLOBUS_SSL_NOTHR_LIBS="" + GLOBUS_SSL_THR_LIBS="" + GLOBUS_STATIC_SSL_NOTHR_LIBS="" + GLOBUS_STATIC_SSL_THR_LIBS="" + GLOBUS_LDAP_THR_LIBS="" + ifelse([$3], , :, [$3]) + fi + + AC_SUBST(GLOBUS_LOCATION) + AC_SUBST(GLOBUS_NOTHR_FLAVOR) + AC_SUBST(GLOBUS_THR_FLAVOR) + AC_SUBST(GLOBUS_NOTHR_CFLAGS) + AC_SUBST(GLOBUS_THR_CFLAGS) + AC_SUBST(GLOBUS_NOTHR_LIBS) + AC_SUBST(GLOBUS_THR_LIBS) + AC_SUBST(GLOBUS_COMMON_NOTHR_LIBS) + AC_SUBST(GLOBUS_COMMON_THR_LIBS) + AC_SUBST(GLOBUS_STATIC_COMMON_NOTHR_LIBS) + AC_SUBST(GLOBUS_STATIC_COMMON_THR_LIBS) + AC_SUBST(GLOBUS_FTP_CLIENT_NOTHR_LIBS) + AC_SUBST(GLOBUS_FTP_CLIENT_THR_LIBS) + AC_SUBST(GLOBUS_SSL_NOTHR_LIBS) + AC_SUBST(GLOBUS_SSL_THR_LIBS) + AC_SUBST(GLOBUS_STATIC_SSL_NOTHR_LIBS) + AC_SUBST(GLOBUS_STATIC_SSL_THR_LIBS) + AC_SUBST(GLOBUS_GSS_NOTHR_LIBS) + AC_SUBST(GLOBUS_GSS_THR_LIBS) + AC_SUBST(GLOBUS_LDAP_THR_LIBS) +]) + diff --git a/project/properties.xml b/project/properties.xml index cfebbaa4..d978f1fa 100755 --- a/project/properties.xml +++ b/project/properties.xml @@ -21,7 +21,7 @@ Authors: Joachim Flammer Version info: $Id: properties.xml,v 1.8 2010/03/17 10:48:17 mezzadri Exp $ - Release: $Name: $ + Release: $Name: glite-ce-blahp_B_1_18 $ Revision history: $Log: properties.xml,v $ diff --git a/project/version.properties b/project/version.properties index 1ca4fb7d..eb61542c 100755 --- a/project/version.properties +++ b/project/version.properties @@ -1,3 +1,3 @@ #Mon Apr 11 15:13:49 CEST 2005 -module.version=1.19.0 +module.version=1.17.0 module.age=0 diff --git a/src/BNotifier.c b/src/BNotifier.c index 8ffc498c..7e483de2 100644 --- a/src/BNotifier.c +++ b/src/BNotifier.c @@ -42,8 +42,6 @@ char *debuglogname; int c_sock; -config_entry *remupd_conf; - /* moved to per-thread structure int startnotify=FALSE; int startnotifyjob=FALSE; @@ -228,11 +226,6 @@ main(int argc, char *argv[]) } } - remupd_conf = 
config_get("job_registry_add_remote",cha); - if (remupd_conf == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0); - } - /* create listening socket for Cream */ if ( !async_notif_port ) { @@ -324,6 +317,7 @@ PollDB() job_registry_handle *rha; job_registry_handle *rhc; char *buffer=NULL; + char *finalbuffer=NULL; char *cdate=NULL; time_t now; int maxtok,i,maxtokl,j; @@ -336,8 +330,7 @@ PollDB() char *cp=NULL; int to_sleep=FALSE; int skip_reg_open=FALSE; - int ret; - + rha=job_registry_init(registry_file, BY_BATCH_ID); if (rha == NULL){ do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); @@ -372,37 +365,19 @@ PollDB() if ((en=job_registry_get(rhc, tbuf[j])) != NULL){ buffer=ComposeClassad(en); }else{ - if(remupd_conf == NULL){ - cdate=iepoch2str(now); - maxtokl=strtoken(tbuf[j],'_',&lbuf); - if(lbuf[1]){ - if ((cp = strrchr (lbuf[1], '\n')) != NULL){ - *cp = '\0'; - } - if ((cp = strrchr (lbuf[1], '\r')) != NULL){ - *cp = '\0'; - } - buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate); + cdate=iepoch2str(now); + maxtokl=strtoken(tbuf[j],'_',&lbuf); + if(lbuf[1]){ + if ((cp = strrchr (lbuf[1], '\n')) != NULL){ + *cp = '\0'; } - freetoken(&lbuf,maxtokl); - free(cdate); - }else{ - maxtokl=strtoken(tbuf[j],':',&lbuf); - JOB_REGISTRY_ASSIGN_ENTRY(en->batch_id,lbuf[0]); - JOB_REGISTRY_ASSIGN_ENTRY(en->blah_id,lbuf[1]); - freetoken(&lbuf,maxtokl); - en->status = 0; - if ((ret=job_registry_append(rhc, en))<0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - }else{ - if(ret==JOB_REGISTRY_SUCCESS){ - do_log(debuglogfile, debug, 2, "%s: registry append in PollDB for: jobid=%s blahjobid=%s\n",argv0,en->batch_id,en->blah_id); - } - 
} - } + if ((cp = strrchr (lbuf[1], '\r')) != NULL){ + *cp = '\0'; + } + buffer=make_message("[BlahJobName=\"%s\"; ClientJobId=\"%s\"; JobStatus=4; JwExitCode=999; ExitReason=\"BUpdater is not able to find the job anymore\"; Reason=\"BUpdater is not able to find the job anymore\"; ChangeTime=\"%s\"; ]\n",tbuf[j],lbuf[1],cdate); + } + freetoken(&lbuf,maxtokl); + free(cdate); } free(en); len=strlen(buffer); @@ -643,10 +618,6 @@ STARTNOTIFYJOBEND GetJobList(buffer, &(connection->joblist_string)); connection->startnotifyjob = TRUE; connection->startnotify = FALSE; - } else if (strstr(buffer,"STARTNETWORKSYNC/") != NULL) { - GetJobList(buffer, &(connection->joblist_string)); - connection->startnotifyjob = TRUE; - connection->startnotify = FALSE; } else if (strstr(buffer,"STARTNOTIFYJOBEND/") != NULL) { connection->firstnotify=TRUE; connection->lastnotiftime = time(NULL); diff --git a/src/BUpdaterCondor.c b/src/BUpdaterCondor.c index ec458c8b..1ecbd0c3 100644 --- a/src/BUpdaterCondor.c +++ b/src/BUpdaterCondor.c @@ -38,6 +38,9 @@ int main(int argc, char *argv[]){ char *pidfile=NULL; char *first_duplicate=NULL; + struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int qlen=0; int first=TRUE; @@ -329,18 +332,20 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } IntStateQuery(); @@ -447,24 +452,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); + while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -598,7 +600,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in 
IntStateQuery\n",argv0); } } @@ -701,7 +703,7 @@ FinalStateQuery(char *query) job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -744,7 +746,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git a/src/BUpdaterLSF.c b/src/BUpdaterLSF.c index ba6166dc..5b99d631 100644 --- a/src/BUpdaterLSF.c +++ b/src/BUpdaterLSF.c @@ -24,8 +24,6 @@ #include "BUpdaterLSF.h" -time_t last_network_update; - int main(int argc, char *argv[]){ FILE *fd; @@ -36,6 +34,9 @@ int main(int argc, char *argv[]){ char *pidfile=NULL; char *first_duplicate=NULL; + struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int first=TRUE; int tmptim; @@ -44,6 +45,7 @@ int main(int argc, char *argv[]){ int rc; int c; + int status; pthread_t RecUpdNetThd; @@ -306,16 +308,6 @@ int main(int argc, char *argv[]){ free(s); } - ret = config_get("bupdater_use_bhist_for_idle",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_use_bhist_for_idle not found - using the default:%s\n",argv0,use_bhist_for_idle); - } else { - use_bhist_for_idle=strdup(ret->value); - if(use_bhist_for_idle == NULL){ - sysfatal("strdup failed for use_bhist_for_idle in main: %r"); - } - } - ret = config_get("bupdater_use_bhist_for_killed",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, 
"%s: key bupdater_use_bhist_for_killed not found - using the default:%s\n",argv0,use_bhist_for_killed); @@ -326,6 +318,16 @@ int main(int argc, char *argv[]){ } } + ret = config_get("bupdater_use_bhist_for_idle",cha); + if (ret == NULL){ + do_log(debuglogfile, debug, 1, "%s: key bupdater_use_bhist_for_idle not found - using the default:%s\n",argv0,use_bhist_for_idle); + } else { + use_bhist_for_idle=strdup(ret->value); + if(use_bhist_for_idle == NULL){ + sysfatal("strdup failed for use_bhist_for_idle in main: %r"); + } + } + ret = config_get("lsf_batch_caching_enabled",cha); if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key lsf_batch_caching_enabled not found using default\n",argv0,lsf_batch_caching_enabled); @@ -419,26 +421,22 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } - - if (now - last_network_update < loop_interval) { - do_log(debuglogfile, debug, 2, "%s: skipping iteration as registry was updated %d seconds ago via network\n", argv0, now - last_network_update); - sleep(loop_interval); - continue; - } - + if(use_btools && strcmp(use_btools,"yes")==0){ IntStateQueryCustom(); @@ -447,7 +445,7 @@ int main(int argc, char *argv[]){ }else{ IntStateQueryShort(); } - + fd = job_registry_open(rha, "r"); if (fd == NULL){ do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file); @@ -486,12 +484,13 @@ int main(int argc, char *argv[]){ } /* Try to run FinalStateQuery reading older log files*/ - if(now-confirm_time>bhist_finalstate_interval){ - runfinal_oldlogs=TRUE; - free(en); - continue; - } - + if(now-confirm_time>bhist_finalstate_interval && use_bhist_for_idle && strcmp(use_bhist_for_idle,"yes")==0){ + do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d from old logs\n",argv0,en->batch_id,en->status); + runfinal_oldlogs=TRUE; + free(en); + continue; + } + if(en->status==IDLE && strlen(en->updater_info)>0 && use_bhist_for_idle && strcmp(use_bhist_for_idle,"yes")==0){ if (en->mdate < finalquery_start_date){ finalquery_start_date=en->mdate; @@ -533,24 +532,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds, timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); - + 
while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ + JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -587,17 +583,14 @@ ReceiveUpdateFromNetwork() } if(job_registry_need_update(ren,nen,JOB_REGISTRY_UPDATE_ALL)){ if ((ret=job_registry_update(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_update returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_update returns %d: ",argv0,ret); perror(""); - } else { - last_network_update = time(0); } } } free(nen); } - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread exiting\n", argv0); + return 0; } @@ -616,6 +609,8 @@ IntStateQueryCustom() int maxtok_l=0; job_registry_entry en; int ret; + char *timestamp; + time_t tmstampepoch; char *tmp=NULL; char *cp=NULL; char *command_string=NULL; @@ -626,7 +621,7 @@ IntStateQueryCustom() int wexitcode=0; int wexitinfo=0; - command_string=make_message("%s%s/bjobsinfo",batch_command,btools_path); + command_string=make_message("%s%s/bjobsinfo -a",batch_command,btools_path); fp = popen(command_string,"r"); do_log(debuglogfile, debug, 3, "%s: command in IntStateQueryCustom is:%s\n",argv0,command_string); @@ -679,7 +674,7 @@ IntStateQueryCustom() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryCustom for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error 
creating endpoint in IntStateQueryCustom\n",argv0); } } @@ -783,7 +778,7 @@ IntStateQueryCustom() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryCustom for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryCustom\n",argv0); } } @@ -882,7 +877,7 @@ IntStateQueryShort() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryShort for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryShort\n",argv0); } } @@ -968,7 +963,7 @@ IntStateQueryShort() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQueryShort for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQueryShort\n",argv0); } } @@ -1066,7 +1061,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } 
} @@ -1177,7 +1172,7 @@ IntStateQuery() free(ex_str); freetoken(&token,maxtok_t); - if(wexitcode==255 || wexitcode==130 || wexitcode==143){ + if(wexitcode==255 || wexitcode==130){ en.status=REMOVED; en.exitcode=-999; }else{ @@ -1250,7 +1245,7 @@ IntStateQuery() do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -1372,7 +1367,7 @@ exitcode (=0 if Done successfully) or (from Exited with exit code 2) } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -1477,7 +1472,7 @@ exitcode (=0 if Done successfully) or (from Exited with exit code 2) job_registry_unlink_proxy(rha, &en); } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -1652,7 +1647,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git 
a/src/BUpdaterPBS.c b/src/BUpdaterPBS.c index 3f2b81f8..f6f7ce38 100644 --- a/src/BUpdaterPBS.c +++ b/src/BUpdaterPBS.c @@ -39,6 +39,9 @@ int main(int argc, char *argv[]){ char *first_duplicate=NULL; + struct pollfd *remupd_pollset = NULL; + int remupd_nfds; + int version=0; int first=TRUE; int tmptim; @@ -293,7 +296,7 @@ int main(int argc, char *argv[]){ if (ret == NULL){ do_log(debuglogfile, debug, 1, "%s: key tracejob_max_output not found using default\n",argv0,tracejob_max_output); } else { - tracejob_max_output=atoi(ret->value); + tracejob_max_output==atoi(ret->value); } remupd_conf = config_get("job_registry_add_remote",cha); @@ -360,18 +363,20 @@ int main(int argc, char *argv[]){ fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); perror(""); + }else{ + purge_time=time(0); } - purge_time=time(0); } now=time(0); if(now - last_consistency_check > bupdater_consistency_check_interval){ if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n.Jobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. The first one is:%s\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); + do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n",argv0,first_duplicate); + fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s",argv0,first_duplicate); + }else{ + last_consistency_check=time(0); } - last_consistency_check=time(0); } IntStateQuery(); @@ -460,24 +465,21 @@ int ReceiveUpdateFromNetwork() { char *proxy_path, *proxy_subject; - int timeout_ms = -1; - int ret, prret, rhret; + int timeout_ms = 0; + int ent, ret, prret, rhret; job_registry_entry *nen; job_registry_entry *ren; proxy_path = NULL; proxy_subject = NULL; - do_log(debuglogfile, debug, 1, "%s: ReceiveUpdateFromNetwork() thread started\n", argv0); - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - do_log(debuglogfile, debug, 2, "%s: ReceiveUpdateFromNetwork() received an update for job %s\n", argv0, nen->batch_id); + while (nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path)){ JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ if ((ret=job_registry_append(rha, nen)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Warning: job_registry_append returns %d: ",argv0,ret); fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); perror(""); } @@ -619,7 +621,7 @@ Job Id: 11.cream-12.pd.infn.it } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -734,7 +736,7 @@ Job Id: 11.cream-12.pd.infn.it } } if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); } } @@ -912,7 +914,7 @@ Job: 13.cream-12.pd.infn.it job_registry_unlink_proxy(rha, &en); } if (remupd_conf != 
NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); } } @@ -959,7 +961,7 @@ int AssignFinalState(char *batchid){ do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); job_registry_unlink_proxy(rha, &en); if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ + if (ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL)<=0){ do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); } } diff --git a/src/BUpdaterSLURM.c b/src/BUpdaterSLURM.c deleted file mode 100644 index 1eaf2e79..00000000 --- a/src/BUpdaterSLURM.c +++ /dev/null @@ -1,855 +0,0 @@ -/* -# File: BUpdaterSLURM.c -# -# Author: Massimo Mezzadri -# e-mail: Massimo.Mezzadri@mi.infn.it -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -*/ - -#include "BUpdaterSLURM.h" - -int main(int argc, char *argv[]){ - - FILE *fd; - job_registry_entry *en; - time_t now; - time_t purge_time=0; - time_t last_consistency_check=0; - char *pidfile=NULL; - char *first_duplicate=NULL; - - struct pollfd *remupd_pollset = NULL; - int remupd_nfds; - - int version=0; - int first=TRUE; - int tmptim; - time_t finalquery_start_date; - int loop_interval=DEFAULT_LOOP_INTERVAL; - - int rc; - int c; - - pthread_t RecUpdNetThd; - - int confirm_time=0; - - static int help; - static int short_help; - - bact.njobs = 0; - bact.jobs = NULL; - - while (1) { - static struct option long_options[] = - { - {"help", no_argument, &help, 1}, - {"usage", no_argument, &short_help, 1}, - {"nodaemon", no_argument, 0, 'o'}, - {"version", no_argument, 0, 'v'}, - {"prefix", required_argument, 0, 'p'}, - {0, 0, 0, 0} - }; - - int option_index = 0; - - c = getopt_long (argc, argv, "vop:",long_options, &option_index); - - if (c == -1){ - break; - } - - switch (c) - { - - case 0: - if (long_options[option_index].flag != 0){ - break; - } - - case 'v': - version=1; - break; - - case 'o': - nodmn=1; - break; - - case 'p': - break; - - case '?': - break; - - default: - abort (); - } - } - - if(help){ - usage(); - } - - if(short_help){ - short_usage(); - } - - argv0 = argv[0]; - - signal(SIGHUP,sighup); - - if(version) { - printf("%s Version: %s\n",progname,VERSION); - exit(EXIT_SUCCESS); - } - - /* Checking configuration */ - check_config_file("UPDATER"); - - cha = config_read(NULL); - if (cha == NULL) - { - fprintf(stderr,"Error reading config: "); - perror(""); - return -1; - } - - ret = config_get("bupdater_child_poll_timeout",cha); - if (ret != NULL){ - tmptim=atoi(ret->value); - if (tmptim > 0) bfunctions_poll_timeout = tmptim*1000; - } - - ret = config_get("bupdater_debug_level",cha); - if (ret != NULL){ - debug=atoi(ret->value); - } - - ret = config_get("bupdater_debug_logfile",cha); - if (ret != NULL){ - debuglogname=strdup(ret->value); - 
if(debuglogname == NULL){ - sysfatal("strdup failed for debuglogname in main: %r"); - } - } - if(debug <=0){ - debug=0; - } - - if(debuglogname){ - if((debuglogfile = fopen(debuglogname, "a+"))==0){ - debug = 0; - } - } else { - debug = 0; - } - - ret = config_get("slurm_binpath",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key slurm_binpath not found\n",argv0); - } else { - slurm_binpath=strdup(ret->value); - if(slurm_binpath == NULL){ - sysfatal("strdup failed for slurm_binpath in main: %r"); - } - } - - ret = config_get("job_registry",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry not found\n",argv0); - sysfatal("job_registry not defined. Exiting"); - } else { - registry_file=strdup(ret->value); - if(registry_file == NULL){ - sysfatal("strdup failed for registry_file in main: %r"); - } - } - - ret = config_get("purge_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key purge_interval not found using the default:%d\n",argv0,purge_interval); - } else { - purge_interval=atoi(ret->value); - } - - ret = config_get("finalstate_query_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key finalstate_query_interval not found using the default:%d\n",argv0,finalstate_query_interval); - } else { - finalstate_query_interval=atoi(ret->value); - } - - ret = config_get("alldone_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key alldone_interval not found using the default:%d\n",argv0,alldone_interval); - } else { - alldone_interval=atoi(ret->value); - } - - ret = config_get("bupdater_consistency_check_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_consistency_check_interval not found using the default:%d\n",argv0,bupdater_consistency_check_interval); - } else { - bupdater_consistency_check_interval=atoi(ret->value); - } - - ret = config_get("bupdater_pidfile",cha); - if (ret == NULL){ - do_log(debuglogfile, 
debug, 1, "%s: key bupdater_pidfile not found\n",argv0); - } else { - pidfile=strdup(ret->value); - if(pidfile == NULL){ - sysfatal("strdup failed for pidfile in main: %r"); - } - } - - ret = config_get("bupdater_loop_interval",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key bupdater_loop_interval not found - using the default:%d\n",argv0,loop_interval); - } else { - loop_interval=atoi(ret->value); - } - - ret = config_get("job_registry_use_mmap",cha); - if (ret == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap not found. Default is NO\n",argv0); - } else { - do_log(debuglogfile, debug, 1, "%s: key job_registry_use_mmap is set to %s\n",argv0,ret->value); - } - - remupd_conf = config_get("job_registry_add_remote",cha); - if (remupd_conf == NULL){ - do_log(debuglogfile, debug, 1, "%s: key job_registry_add_remote not found\n",argv0); - }else{ - if (job_registry_updater_setup_receiver(remupd_conf->values,remupd_conf->n_values,&remupd_head) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot set network receiver(s) up for remote update\n",argv0); - fprintf(stderr,"%s: Cannot set network receiver(s) up for remote update \n",argv0); - } - - if (remupd_head == NULL){ - do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0); - fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0); - } - - if ((remupd_nfds = job_registry_updater_get_pollfd(remupd_head, &remupd_pollset)) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot setup poll set for receiving data.\n",argv0); - fprintf(stderr,"%s: Cannot setup poll set for receiving data.\n", argv0); - } - if (remupd_pollset == NULL || remupd_nfds == 0){ - do_log(debuglogfile, debug, 1, "%s: No poll set available for receiving data.\n",argv0); - fprintf(stderr,"%s: No poll set available for receiving data.\n",argv0); - } - - 
} - - if( !nodmn ) daemonize(); - - - if( pidfile ){ - writepid(pidfile); - free(pidfile); - } - - rha=job_registry_init(registry_file, BY_BATCH_ID); - if (rha == NULL){ - do_log(debuglogfile, debug, 1, "%s: Error initialising job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error initialising job registry %s :",argv0,registry_file); - perror(""); - } - - if (remupd_conf != NULL){ - pthread_create(&RecUpdNetThd, NULL, (void *(*)(void *))ReceiveUpdateFromNetwork, (void *)NULL); - - if (job_registry_updater_setup_sender(remupd_conf->values,remupd_conf->n_values,0,&remupd_head_send) < 0){ - do_log(debuglogfile, debug, 1, "%s: Cannot set network sender(s) up for remote update\n",argv0); - fprintf(stderr,"%s: Cannot set network sender(s) up for remote update \n",argv0); - } - if (remupd_head_send == NULL){ - do_log(debuglogfile, debug, 1, "%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n",argv0); - fprintf(stderr,"%s: Cannot find values for network endpoints in configuration file (attribute 'job_registry_add_remote').\n", argv0); - } - } - - config_free(cha); - - for(;;){ - /* Purge old entries from registry */ - now=time(0); - if(now - purge_time > 86400){ - if((rc=job_registry_purge(registry_file, now-purge_interval,0))<0){ - do_log(debuglogfile, debug, 1, "%s: Error purging job registry %s:%d\n",argv0,registry_file,rc); - fprintf(stderr,"%s: Error purging job registry %s :",argv0,registry_file); - perror(""); - - } - purge_time=time(0); - } - - now=time(0); - if(now - last_consistency_check > bupdater_consistency_check_interval){ - if(job_registry_check_index_key_uniqueness(rha,&first_duplicate)==JOB_REGISTRY_FAIL){ - do_log(debuglogfile, debug, 1, "%s: Found job registry duplicate entry. The first one is:%s\n.Jobid should be removed or registry directory should be removed.\n",argv0,first_duplicate); - fprintf(stderr,"%s: Found job registry duplicate entry. 
The first one is:%s.\nJobid should be removed or registry directory should be removed.",argv0,first_duplicate); - - } - last_consistency_check=time(0); - } - - IntStateQuery(); - - fd = job_registry_open(rha, "r"); - if (fd == NULL){ - do_log(debuglogfile, debug, 1, "%s: Error opening job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error opening job registry %s :",argv0,registry_file); - perror(""); - sleep(loop_interval); - continue; - } - if (job_registry_rdlock(rha, fd) < 0){ - do_log(debuglogfile, debug, 1, "%s: Error read locking job registry %s\n",argv0,registry_file); - fprintf(stderr,"%s: Error read locking job registry %s :",argv0,registry_file); - perror(""); - sleep(loop_interval); - continue; - } - job_registry_firstrec(rha,fd); - fseek(fd,0L,SEEK_SET); - - first=TRUE; - finalquery_start_date = time(0); - - while ((en = job_registry_get_next(rha, fd)) != NULL){ - - if((bupdater_lookup_active_jobs(&bact,en->batch_id) != BUPDATER_ACTIVE_JOBS_SUCCESS) && en->status!=REMOVED && en->status!=COMPLETED){ - - confirm_time=atoi(en->updater_info); - if(confirm_time==0){ - confirm_time=en->mdate; - } - - /* Assign Status=4 and ExitStatus=999 to all entries that after alldone_interval are still not in a final state(3 or 4)*/ - if(now-confirm_time>alldone_interval){ - AssignFinalState(en->batch_id); - free(en); - continue; - } - - if(en->status==IDLE && strlen(en->updater_info)>0){ - if (en->mdate < finalquery_start_date){ - finalquery_start_date=en->mdate; - } - do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status); - runfinal=TRUE; - }else if((now-confirm_time>finalstate_query_interval) && (now > next_finalstatequery)){ - if (en->mdate < finalquery_start_date){ - finalquery_start_date=en->mdate; - } - do_log(debuglogfile, debug, 2, "%s: FinalStateQuery needed for jobid=%s with status=%d\n",argv0,en->batch_id,en->status); - runfinal=TRUE; - } - - - } - free(en); - } - - 
if(runfinal_oldlogs){ - FinalStateQuery(0,1); - runfinal_oldlogs=FALSE; - runfinal=FALSE; - }else if(runfinal){ - FinalStateQuery(finalquery_start_date,1); - runfinal=FALSE; - } - fclose(fd); - sleep(loop_interval); - } - - job_registry_destroy(rha); - - return 0; - -} - -int -ReceiveUpdateFromNetwork() -{ - char *proxy_path, *proxy_subject; - int timeout_ms = 0; - int ret, prret, rhret; - job_registry_entry *nen; - job_registry_entry *ren; - - proxy_path = NULL; - proxy_subject = NULL; - - while ((nen = job_registry_receive_update(remupd_pollset, remupd_nfds,timeout_ms, &proxy_subject, &proxy_path))){ - - JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,"\0"); - - if ((ren=job_registry_get(rha, nen->batch_id)) == NULL){ - if ((ret=job_registry_append(rha, nen)) < 0){ - fprintf(stderr,"%s: Warning: job_registry_append returns %d: ",argv0,ret); - perror(""); - } - }else{ - - if(ren->subject_hash!=NULL && strlen(ren->subject_hash) && ren->proxy_link!=NULL && strlen(ren->proxy_link)){ - JOB_REGISTRY_ASSIGN_ENTRY(nen->subject_hash,ren->subject_hash); - JOB_REGISTRY_ASSIGN_ENTRY(nen->proxy_link,ren->proxy_link); - }else{ - if (proxy_path != NULL && strlen(proxy_path) > 0){ - prret = job_registry_set_proxy(rha, nen, proxy_path); - if (prret < 0){ - do_log(debuglogfile, debug, 1, "%s: warning: setting proxy to %s\n",argv0,proxy_path); - fprintf(stderr,"%s: warning: setting proxy to %s: ",argv0,proxy_path); - perror(""); - /* Make sure we don't renew non-existing proxies */ - nen->renew_proxy = 0; - } - free(proxy_path); - - nen->subject_hash[0] = '\000'; - if (proxy_subject != NULL && strlen(proxy_subject) > 0){ - job_registry_compute_subject_hash(nen, proxy_subject); - rhret = job_registry_record_subject_hash(rha, nen->subject_hash, proxy_subject, TRUE); - if (rhret < 0){ - do_log(debuglogfile, debug, 1, "%s: warning: recording proxy subject %s (hash %s)\n",argv0, proxy_subject, nen->subject_hash); - fprintf(stderr,"%s: 
warning: recording proxy subject %s (hash %s): ",argv0, proxy_subject, nen->subject_hash); - perror(""); - } - } - free(proxy_subject); - - } - } - if(job_registry_need_update(ren,nen,JOB_REGISTRY_UPDATE_ALL)){ - if ((ret=job_registry_update(rha, nen)) < 0){ - fprintf(stderr,"%s: Warning: job_registry_update returns %d: ",argv0,ret); - perror(""); - } - } - } - free(nen); - } - - return 0; -} - -int -IntStateQuery() -{ - - FILE *fp; - char *line=NULL; - char **token; - char **token_l; - char **token_e; - int maxtok_t=0; - int maxtok_l=0; - int maxtok_e=0; - job_registry_entry en; - int ret; - time_t tmstampepoch; - char *cp=NULL; - char *batch_str=NULL; - char *command_string=NULL; - job_registry_entry *ren=NULL; - int isresumed=FALSE; - int first=TRUE; - time_t now; - char *string_now=NULL; - - command_string=make_message("%s/scontrol -a show jobid",slurm_binpath); - fp = popen(command_string,"r"); - - en.status=UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); - en.exitcode=-1; - bupdater_free_active_jobs(&bact); - - if(fp!=NULL){ - while(!feof(fp) && (line=get_line(fp))){ - if(line && strlen(line)==0){ - free(line); - continue; - } - if ((cp = strrchr (line, '\n')) != NULL){ - *cp = '\0'; - } - do_log(debuglogfile, debug, 3, "%s: line in IntStateQuery is:%s\n",argv0,line); - now=time(0); - string_now=make_message("%d",now); - maxtok_t = strtoken(line, ' ', &token); - if(line && strstr(line,"JobId=")){ - isresumed=FALSE; - if(!first && en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, - JOB_REGISTRY_UPDATE_WN_ADDR| - JOB_REGISTRY_UPDATE_STATUS| - JOB_REGISTRY_UPDATE_UDATE| - JOB_REGISTRY_UPDATE_UPDATER_INFO| - JOB_REGISTRY_UPDATE_EXITCODE| - JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: 
",ret); - perror(""); - } - } else { - if(ret==JOB_REGISTRY_SUCCESS){ - if (en.status == REMOVED || en.status == COMPLETED) { - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); - job_registry_unlink_proxy(rha, &en); - }else{ - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); - } - } - } - } - en.status = UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - en.exitcode=-1; - } - en.status = UNDEFINED; - maxtok_l = strtoken(token[0], '=', &token_l); - batch_str=strdup(token_l[1]); - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batch_str); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - en.exitcode=-1; - bupdater_push_active_job(&bact, en.batch_id); - free(batch_str); - freetoken(&token_l,maxtok_l); - if(!first) free(ren); - if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ - fprintf(stderr,"Get of record returns error "); - perror(""); - } - if(ren){ - if(strlen(ren->updater_info)>0){ - en.udate=ren->udate; - }else{ - en.udate=time(0); - } - } - first=FALSE; - - }else if(line && strstr(line," JobState=")){ - if(token[0] && strstr(line,"JobState=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - if(token_l[1] && strstr(token_l[1],"PENDING")){ - en.status=IDLE; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"RUNNING")){ - en.status=RUNNING; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else 
if(token_l[1] && strstr(token_l[1],"COMPLETED")){ - en.status=COMPLETED; - en.exitcode=0; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"CANCELLED")){ - en.status=REMOVED; - en.exitcode=-999; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"FAILED")){ - en.status=COMPLETED; - en.exitcode=0; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"SUSPENDED")){ - en.status=HELD; - en.exitcode=-1; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token_l[1] && strstr(token_l[1],"COMPLETING")){ - bupdater_remove_active_job(&bact, en.batch_id); - } - freetoken(&token_l,maxtok_l); - } - }else if(line && strstr(line," BatchHost=")){ - if(token[0] && strstr(line,"BatchHost=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - if(en.status!=IDLE){ - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,token_l[1]); - } - freetoken(&token_l,maxtok_l); - } - }else if(line && strstr(line," ExitCode=")){ - if(token[3] && strstr(line,"ExitCode=")){ - maxtok_l = strtoken(token[3], '=', &token_l); - maxtok_e = strtoken(token_l[1], ':', &token_e); - if(en.status==COMPLETED){ - en.exitcode=atoi(token_e[0]); - } - freetoken(&token_l,maxtok_l); - freetoken(&token_e,maxtok_e); - } - }else if(line && strstr(line," SubmitTime=")){ - if(en.status==IDLE){ - if(token[0] && strstr(line,"SubmitTime=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - }else if(line && strstr(line," StartTime=")){ - if(en.status==RUNNING){ - if(token[0] && strstr(line,"StartTime=")){ - maxtok_l = strtoken(token[0], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - if(en.status==COMPLETED || en.status==REMOVED){ - if(token[1] && strstr(line,"EndTime=")){ - maxtok_l = 
strtoken(token[1], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - }else if(line && strstr(line," SuspendTime=")){ - if(en.status==HELD){ - if(token[1] && strstr(line,"SuspendTime=")){ - maxtok_l = strtoken(token[1], '=', &token_l); - tmstampepoch=str2epoch(token_l[1],"N"); - en.udate=tmstampepoch; - freetoken(&token_l,maxtok_l); - } - } - } - - free(line); - free(string_now); - freetoken(&token,maxtok_t); - } - pclose(fp); - } - - if(en.status!=UNDEFINED && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_recn_select(rha, &en, ren->recnum, - JOB_REGISTRY_UPDATE_WN_ADDR| - JOB_REGISTRY_UPDATE_STATUS| - JOB_REGISTRY_UPDATE_UDATE| - JOB_REGISTRY_UPDATE_UPDATER_INFO| - JOB_REGISTRY_UPDATE_EXITCODE| - JOB_REGISTRY_UPDATE_EXITREASON)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - } else { - if(ret==JOB_REGISTRY_SUCCESS){ - if (en.status == REMOVED || en.status == COMPLETED) { - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d exitcode=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status,en.exitcode); - job_registry_unlink_proxy(rha, &en); - }else{ - do_log(debuglogfile, debug, 2, "%s: registry update in IntStateQuery for: jobid=%s creamjobid=%s wn=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.wn_addr,en.status); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in IntStateQuery\n",argv0); - } - } - } - } - } - - free(ren); - free(command_string); - return 0; -} - -int -FinalStateQuery(time_t start_date, int logs_to_read) -{ - - FILE *fp; - char *line=NULL; - char **token; - char **token_l; - int maxtok_t=0; - int maxtok_l=0; - job_registry_entry en; - int ret; - time_t tmstampepoch; - char 
*cp=NULL; - char *command_string=NULL; - time_t now; - char *string_now=NULL; - job_registry_entry *ren=NULL; - - - command_string=make_message("%s/sacct -nap -o JobID,JobName,State,ExitCode,submit,start,end 2>/dev/null",slurm_binpath); - - fp = popen(command_string,"r"); - - do_log(debuglogfile, debug, 3, "%s: command_string in FinalStateQuery is:%s\n",argv0,command_string); - - en.status=UNDEFINED; - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - - if(fp!=NULL){ - while(!feof(fp) && (line=get_line(fp))){ - if(line && strlen(line)==0){ - free(line); - continue; - } - if ((cp = strrchr (line, '\n')) != NULL){ - *cp = '\0'; - } - do_log(debuglogfile, debug, 3, "%s: line in FinalStateQuery is:%s\n",argv0,line); - now=time(0); - string_now=make_message("%d",now); - maxtok_t = strtoken(line, '|', &token); - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,token[0]); - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - if(token[2] && strstr(token[2],"COMPLETED")){ - en.status=COMPLETED; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token[2] && strstr(token[2],"CANCELLED")){ - en.status=REMOVED; - en.exitcode=-999; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - }else if(token[2] && strstr(token[2],"FAILED")){ - en.status=COMPLETED; - JOB_REGISTRY_ASSIGN_ENTRY(en.updater_info,string_now); - } - - tmstampepoch=str2epoch(token[6],"N"); - en.udate=tmstampepoch; - if(en.status==COMPLETED){ - maxtok_l = strtoken(token[3], ':', &token_l); - en.exitcode=atoi(token_l[0]); - freetoken(&token_l,maxtok_l); - } - - if ((ren=job_registry_get(rha, en.batch_id)) == NULL){ - fprintf(stderr,"Get of record returns error "); - perror(""); - } - if(en.status!=UNDEFINED && en.status!=IDLE && ren && ren->status!=REMOVED && ren->status!=COMPLETED){ - if ((ret=job_registry_update_select(rha, &en, - JOB_REGISTRY_UPDATE_UDATE | - JOB_REGISTRY_UPDATE_STATUS | - JOB_REGISTRY_UPDATE_UPDATER_INFO | - JOB_REGISTRY_UPDATE_EXITCODE | - JOB_REGISTRY_UPDATE_EXITREASON )) < 
0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record returns %d: ",ret); - perror(""); - } - } else { - do_log(debuglogfile, debug, 2, "%s: f registry update in FinalStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); - if (en.status == REMOVED || en.status == COMPLETED){ - job_registry_unlink_proxy(rha, &en); - } - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in FinalStateQuery\n",argv0); - } - } - } - } - free(string_now); - free(line); - freetoken(&token,maxtok_t); - free(ren); - } - pclose(fp); - } - - free(command_string); - return 0; -} - -int AssignFinalState(char *batchid){ - - job_registry_entry en; - int ret,i; - time_t now; - - now=time(0); - - JOB_REGISTRY_ASSIGN_ENTRY(en.batch_id,batchid); - en.status=COMPLETED; - en.exitcode=999; - en.udate=now; - JOB_REGISTRY_ASSIGN_ENTRY(en.wn_addr,"\0"); - JOB_REGISTRY_ASSIGN_ENTRY(en.exitreason,"\0"); - - if ((ret=job_registry_update(rha, &en)) < 0){ - if(ret != JOB_REGISTRY_NOT_FOUND){ - fprintf(stderr,"Update of record %d returns %d: ",i,ret); - perror(""); - } - } else { - do_log(debuglogfile, debug, 2, "%s: registry update in AssignStateQuery for: jobid=%s creamjobid=%s status=%d\n",argv0,en.batch_id,en.user_prefix,en.status); - job_registry_unlink_proxy(rha, &en); - if (remupd_conf != NULL){ - if ((ret=job_registry_send_update(remupd_head_send,&en,NULL,NULL))<=0){ - do_log(debuglogfile, debug, 2, "%s: Error creating endpoint in AssignFinalState\n",argv0); - } - } - } - - - return 0; -} - -void sighup() -{ - if(debug){ - fclose(debuglogfile); - if((debuglogfile = fopen(debuglogname, "a+"))==0){ - debug = 0; - } - } -} - -int -usage() -{ - printf("Usage: BUpdaterSLURM [OPTION...]\n"); - printf(" -o, --nodaemon do not run as daemon\n"); - printf(" -v, --version print version and exit\n"); - printf("\n"); - printf("Help 
options:\n"); - printf(" -?, --help Show this help message\n"); - printf(" --usage Display brief usage message\n"); - exit(EXIT_SUCCESS); -} - -int -short_usage() -{ - printf("Usage: BUpdaterSLURM [-ov?] [-o|--nodaemon] [-v|--version] [-?|--help] [--usage]\n"); - exit(EXIT_SUCCESS); -} - diff --git a/src/BUpdaterSLURM.h b/src/BUpdaterSLURM.h deleted file mode 100644 index d04203ce..00000000 --- a/src/BUpdaterSLURM.h +++ /dev/null @@ -1,72 +0,0 @@ -/* -# File: BUpdaterSLURM.h -# -# Author: Massimo Mezzadri -# e-mail: Massimo.Mezzadri@mi.infn.it -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -*/ - -#include "acconfig.h" - -#include "job_registry.h" -#include "job_registry_updater.h" -#include "Bfunctions.h" -#include "config.h" - -#define DEFAULT_LOOP_INTERVAL 5 - -#ifndef VERSION -#define VERSION "1.8.0" -#endif - -int ReceiveUpdateFromNetwork(); -int IntStateQuery(); -int FinalStateQuery(time_t start_date, int logs_to_read); -int AssignFinalState(char *batchid); -void sighup(); -int usage(); -int short_usage(); - -int runfinal=FALSE; -int runfinal_oldlogs=FALSE; -char *slurm_binpath; -char *registry_file; -int purge_interval=864000; -int finalstate_query_interval=30; -int alldone_interval=36000; -int next_finalstatequery=0; -int bupdater_consistency_check_interval=3600; -int debug=FALSE; -int nodmn=FALSE; - -bupdater_active_jobs bact; - -FILE *debuglogfile; -char *debuglogname=NULL; - -job_registry_handle *rha; -config_handle *cha; -config_entry *ret; -char *progname="BUpdaterSLURM"; - -struct pollfd *remupd_pollset = NULL; -int remupd_nfds; -job_registry_updater_endpoint *remupd_head = NULL; -job_registry_updater_endpoint *remupd_head_send = NULL; -config_entry *remupd_conf; diff --git a/src/Bfunctions.c b/src/Bfunctions.c index 63324e1b..2da92162 100644 --- a/src/Bfunctions.c +++ b/src/Bfunctions.c @@ -316,8 +316,6 @@ str2epoch(char *str, char * f) strptime(str,"%a %b %d %T %Y",&tm); }else if(strcmp(f,"A")==0){ strptime(str,"%m/%d/%Y %T",&tm); - }else if(strcmp(f,"N")==0){ - strptime(str,"%Y-%m-%dT%T",&tm); }else if(strcmp(f,"W")==0){ /* If do not have the year in the date we compare day and month and set the year */ @@ -605,6 +603,7 @@ bupdater_remove_active_job(bupdater_active_jobs *bact, if (cmp == 0) { /* Job ID found. Remove it from list. 
*/ + free(bact->jobs[cur]); for (resize = cur+1; resizenjobs; resize++) { bact->jobs[resize - 1] = bact->jobs[resize]; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..e4bc2ccb --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,158 @@ +# **************** +# * BLAHP daemon * +# **************** +# +# $Id: $ +# +# File: CMakeLists.txt +# +# Author(s): Francesco Prelz ($Author: $) +# e-mail: "Francesco.Prelz@mi.infn.it" +# +# Revision history: +# +# 26-Oct-2012 Created + +cmake_minimum_required(VERSION 2.6) + +include(FindClassAd.cmake) +include_directories(${ClassAd_INCLUDE_DIR}) +include(FindPkgConfig) + +pkg_check_modules(GLOBUS_COMMON globus-common) +include_directories(${GLOBUS_COMMON_INCLUDE_DIRS}) + +pkg_check_modules(GLOBUS_IO globus-io) +pkg_check_modules(GLOBUS_GSSAPI_GSI globus-gssapi-gsi) +pkg_check_modules(GLOBUS_GSS_ASSIST globus-gss-assist) +pkg_check_modules(GLOBUS_GSI_CREDENTIAL globus-gsi-credential) +pkg_check_modules(GLOBUS_GSI_PROXY_CORE globus-gsi-proxy-core) + +include_directories(.) 
+ +set (main_common_sources + console.c job_status.c resbuffer.c server.c commands.c + classad_binary_op_unwind.C classad_c_helper.C proxy_hashcontainer.c + config.c job_registry.c blah_utils.c env_helper.c mapped_exec.c md5.c + cmdbuffer.c) + +set (bupdater_common_sources + Bfunctions.c job_registry.c md5.c config.c blah_utils.c + job_registry_updater.c) + +# programs for 'sbin' +add_executable(blahpd_daemon main_daemon.c ${main_common_sources}) +set_target_properties(blahpd_daemon PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blahpd_daemon -lpthread ${ClassAd_LIBRARY}) +add_executable(blah_job_registry_add + blah_job_registry_add.c job_registry.c + job_registry_updater.c md5.c config.c) +add_executable(blah_job_registry_lkup + blah_job_registry_lkup.c job_registry.c md5.c config.c) +add_executable(blah_job_registry_scan_by_subject + blah_job_registry_scan_by_subject.c classad_c_helper.C + classad_binary_op_unwind.C job_registry.c md5.c config.c) +set_target_properties(blah_job_registry_scan_by_subject PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blah_job_registry_scan_by_subject ${ClassAd_LIBRARY}) +add_executable(blah_check_config + blah_check_config.c Bfunctions.c config.c blah_utils.c) +add_executable(blah_job_registry_dump + blah_job_registry_dump.c job_registry.c md5.c config.c) +add_executable(blah_job_registry_purge + blah_job_registry_purge.c job_registry.c md5.c) + +# programs for 'bin' +add_executable(blahpd main.c ${main_common_sources}) +set_target_properties(blahpd PROPERTIES COMPILE_FLAGS ${ClassAd_CXX_FLAGS}) +target_link_libraries(blahpd -lpthread ${ClassAd_LIBRARY}) + +# programs for 'libexec' +add_executable(BLClient BLClient.c blah_utils.c BLfunctions.c) +add_executable(BLParserLSF BLParserLSF.c blah_utils.c BLfunctions.c) +target_link_libraries(BLParserLSF -lpthread) +add_executable(BLParserPBS BLParserPBS.c blah_utils.c BLfunctions.c) +target_link_libraries(BLParserPBS -lpthread) 
+add_executable(BUpdaterCondor BUpdaterCondor.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterCondor -lpthread) +add_executable(BNotifier + BNotifier.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c) +target_link_libraries(BNotifier -lpthread) +add_executable(BUpdaterLSF BUpdaterLSF.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterLSF -lpthread -lm) +add_executable(BUpdaterPBS BUpdaterPBS.c ${bupdater_common_sources}) +target_link_libraries(BUpdaterPBS -lpthread -lm) +add_executable(BUpdaterSGE + BUpdaterSGE.c Bfunctions.c job_registry.c md5.c config.c + blah_utils.c) +add_executable(blparser_master blparser_master.c config.c blah_utils.c) + +if (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) +add_executable(BPRclient BPRclient.c BPRcomm.c tokens.c) +target_link_libraries(BPRclient + ${GLOBUS_GSI_PROXY_CORE_LDFLAGS} + ${GLOBUS_GSI_CREDENTIALS_LDFLAGS} + ${GLOBUS_GSS_ASSIST_LDFLAGS}) +add_executable(BPRserver BPRserver.c BPRcomm.c tokens.c) +target_link_libraries(BPRserver + ${GLOBUS_GSI_PROXY_CORE_LDFLAGS} + ${GLOBUS_GSI_CREDENTIALS_LDFLAGS} + ${GLOBUS_GSS_ASSIST_LDFLAGS}) +set_target_properties(BPRserver PROPERTIES COMPILE_FLAGS "-static") +endif (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) + +# test programs +add_executable(test_job_registry_create test_job_registry_create.c job_registry.c md5.c) +add_executable(test_job_registry_purge test_job_registry_purge.c job_registry.c md5.c) +add_executable(test_job_registry_update test_job_registry_update.c job_registry.c md5.c) +add_executable(test_job_registry_access test_job_registry_access.c job_registry.c md5.c) +add_executable(test_job_registry_update_from_network + test_job_registry_update_from_network.c job_registry.c + job_registry_updater.c md5.c config.c) +add_executable(test_cmdbuffer cmdbuffer.c) +set_target_properties(test_cmdbuffer PROPERTIES COMPILE_FLAGS "-DCMDBUF_DEBUG") + +# CPack info + +install(TARGETS blahpd RUNTIME DESTINATION bin) +install(TARGETS + 
blahpd_daemon blah_job_registry_add blah_job_registry_lkup + blah_job_registry_scan_by_subject blah_check_config + blah_job_registry_dump blah_job_registry_purge + RUNTIME DESTINATION sbin) +install(TARGETS + BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier + BUpdaterLSF BUpdaterPBS BUpdaterSGE + blparser_master + RUNTIME DESTINATION libexec) + +set(blah_scripts + scripts/blah_load_config.sh scripts/blah_common_submit_functions.sh + scripts/pbs_cancel.sh scripts/pbs_status.sh scripts/pbs_submit.sh + scripts/pbs_hold.sh scripts/pbs_resume.sh scripts/lsf_cancel.sh + scripts/lsf_status.sh scripts/lsf_submit.sh scripts/lsf_hold.sh + scripts/lsf_resume.sh scripts/condor_cancel.sh scripts/condor_status.sh + scripts/condor_submit.sh scripts/condor_hold.sh scripts/condor_resume.sh + scripts/sge_cancel.sh scripts/sge_helper scripts/sge_resume.sh + scripts/sge_submit.sh scripts/sge_filestaging scripts/sge_hold.sh + scripts/sge_status.sh scripts/runcmd.pl.template + scripts/sge_local_submit_attributes.sh + scripts/slurm_cancel.sh scripts/slurm_resume.sh scripts/slurm_status.sh + scripts/slurm_hold.sh scripts/slurm_submit.sh + scripts/slurm_local_submit_attributes.sh + scripts/blah.py scripts/__init__.py + scripts/pbs_status.py + scripts/slurm_status.py + ) + +install(FILES + ${blah_scripts} + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION libexec) + +if (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) +install(TARGETS BPRclient BPRserver RUNTIME DESTINATION libexec) +endif (${GLOBUS_COMMON_FOUND} AND ${GLOBUS_IO_FOUND}) + +include(CPack) diff --git a/src/FindClassAd.cmake b/src/FindClassAd.cmake new file mode 100644 index 00000000..a5a3e4b5 --- /dev/null +++ b/src/FindClassAd.cmake @@ -0,0 +1,112 @@ +# - Finds Condor Classified Ad (Classad) binary distribution. 
+# The following variables are set: +# ClassAd_CXX_FLAGS - flags to add to the CXX compiler for Classad support +# CLASSAD_FOUND - true if the Classad distribution is detected +# +# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/ + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) +include(FindPackageHandleStandardArgs) + +set(ClassAd_INCLUDE_PATH_DESCRIPTION "top-level directory containing the Condor ClassAd include directories. E.g /opt/classad/include") +set(ClassAd_INCLUDE_DIR_MESSAGE "Set the ClassAd_INCLUDE_DIR cmake cache entry to the ${ClassAd_INCLUDE_PATH_DESCRIPTION}") +set(ClassAd_LIBRARY_PATH_DESCRIPTION "top-level directory containing the Condor ClassAd libraries.") +set(ClassAd_LIBRARY_DIR_MESSAGE "Set the ClassAd_LIBRARY_DIR cmake cache entry to the ${ClassAd_LIBRARY_PATH_DESCRIPTION}") + +find_path(ClassAd_INCLUDE_DIR + NAMES classad_distribution.h + PATHS + # Look in other places. + ${ClassAd_ROOT_DIRECTORIES} + PATH_SUFFIXES + classad + include + # Help the user find it if we cannot. + DOC "The ${ClassAd_INCLUDE_DIR_MESSAGE}" +) + +message(STATUS "ClassAd_INCLUDE_DIR == " ${ClassAd_INCLUDE_DIR}) + +# The ClassAd library (should have namespaces enabled). 
+set (ClassAd_LIBRARY_TO_FIND classad_ns) + +# Setting some more prefixes for the library +set (ClassAd_LIB_PREFIX "") +if ( WIN32 ) + set (ClassAd_LIB_PREFIX ${ClassAd_LIB_PREFIX} "lib") + set ( ClassAd_LIBRARY_TO_FIND ${ClassAd_LIB_PREFIX}${ClassAd_LIBRARY_TO_FIND}) +endif() + +find_library( ClassAd_LIBRARY + NAMES ${ClassAd_LIBRARY_TO_FIND} + PATHS + ${ClassAd_LIBRARY_DIR} + PATH_SUFFIXES + lib +) + +get_filename_component(ClassAd_LIBRARY_DIR ${ClassAd_LIBRARY} PATH) +message(STATUS "ClassAd_LIBRARY == " ${ClassAd_LIBRARY}) + +# sample Classad source code to test +set(ClassAd_CXX_TEST_SOURCE +" +#include +classad::ClassAd ad; +classad::ClassAdParser parser; + +int +main(int argc, char *argv[]) +{ +} +") + +set(ClassAd_CXX_FLAG_CANDIDATES + "-DWANT_NAMESPACES" + "-DWANT_NAMESPACES -DWANT_CLASSAD_NAMESPACE" +) + +# check cxx compiler +foreach(FLAG ${ClassAd_CXX_FLAG_CANDIDATES}) + set(SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}") + set(CMAKE_REQUIRED_FLAGS "${FLAG}") + set(CMAKE_REQUIRED_LIBRARIES "${ClassAd_LIBRARY}") + set(CMAKE_REQUIRED_INCLUDES "${ClassAd_INCLUDE_DIR}") + unset(ClassAd_FLAG_DETECTED CACHE) + message(STATUS "Try Classad CXX flag = [${FLAG}] (library = [${ClassAd_LIBRARY}])") + check_cxx_source_compiles("${ClassAd_CXX_TEST_SOURCE}" ClassAd_FLAG_DETECTED) + set(CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") + set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}") + if(ClassAd_FLAG_DETECTED) + set(ClassAd_CXX_FLAGS_INTERNAL "${FLAG}") + break() + endif(ClassAd_FLAG_DETECTED) +endforeach(FLAG ${ClassAd_CXX_FLAG_CANDIDATES}) + +set(ClassAd_CXX_FLAGS "${ClassAd_CXX_FLAGS_INTERNAL}" + CACHE STRING "C++ compiler flags for use of the Condor Classad library") +message(STATUS "ClassAd_CXX_FLAGS == " ${ClassAd_CXX_FLAGS}) +# handle the 
standard arguments for find_package +find_package_handle_standard_args(ClassAd DEFAULT_MSG + ClassAd_LIBRARY ClassAd_INCLUDE_DIR) + +mark_as_advanced( + ClassAd_CXX_FLAGS + ClassAd_LIBRARY +) diff --git a/src/Makefile.am b/src/Makefile.am index 6283bde3..4b3cd62e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,7 +2,7 @@ # * BLAHP daemon * # **************** # -# $Id: Makefile.am,v 1.60 2012/05/16 09:41:23 mezzadri Exp $ +# $Id: Makefile.am,v 1.57.2.3 2012/03/20 13:38:43 mezzadri Exp $ # # File: Makefile.am # @@ -48,7 +48,7 @@ endif sbin_PROGRAMS = blahpd_daemon blah_job_registry_add blah_job_registry_lkup blah_job_registry_scan_by_subject blah_check_config blah_job_registry_dump blah_job_registry_purge bin_PROGRAMS = blahpd -libexec_PROGRAMS = BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier BUpdaterLSF BUpdaterPBS BUpdaterSGE BUpdaterSLURM $(GLOBUS_EXECS) blparser_master +libexec_PROGRAMS = BLClient BLParserLSF BLParserPBS BUpdaterCondor BNotifier BUpdaterLSF BUpdaterPBS BUpdaterSGE $(GLOBUS_EXECS) blparser_master noinst_PROGRAMS = test_job_registry_create test_job_registry_purge test_job_registry_update test_job_registry_access test_job_registry_update_from_network test_cmdbuffer common_sources = console.c job_status.c resbuffer.c server.c commands.c classad_binary_op_unwind.C classad_c_helper.C proxy_hashcontainer.c config.c job_registry.c blah_utils.c env_helper.c mapped_exec.c md5.c cmdbuffer.c @@ -57,7 +57,7 @@ blahpd_SOURCES = main.c $(common_sources) blahpd_daemon_SOURCES = main_daemon.c $(common_sources) -blahpd_LDADD = $(CLASSAD_LIBS) +blahpd_LDADD = $(CLASSAD_LIBS) $(GLOBUS_GSSSAPI_GSI_LIBS) $(GLOBUS_GSS_ASSIST_LIBS) blahpd_daemon_LDADD = $(blahpd_LDADD) @@ -136,9 +136,6 @@ BUpdaterPBS_LDADD = -lpthread -lm BUpdaterSGE_SOURCES = BUpdaterSGE.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c BUpdaterSGE_LDADD = -lpthread -BUpdaterSLURM_SOURCES = BUpdaterSLURM.c Bfunctions.c job_registry.c md5.c config.c blah_utils.c 
job_registry_updater.c -BUpdaterSLURM_LDADD = -lpthread -lm - blparser_master_SOURCES = blparser_master.c config.c blah_utils.c blparser_master_LDADD = @@ -151,5 +148,5 @@ blah_job_registry_dump_CFLAGS = $(AM_CFLAGS) test_cmdbuffer_SOURCES = cmdbuffer.c test_cmdbuffer_CFLAGS = $(AM_CFLAGS) -DCMDBUF_DEBUG -noinst_HEADERS = blahpd.h classad_binary_op_unwind.h classad_c_helper.h commands.h job_status.h resbuffer.h server.h console.h BPRcomm.h tokens.h BLParserPBS.h BLParserLSF.h proxy_hashcontainer.h job_registry.h md5.h config.h BUpdaterCondor.h Bfunctions.h BNotifier.h BUpdaterLSF.h BUpdaterPBS.h BUpdaterSGE.h BUpdaterSLURM.h blah_utils.h env_helper.h mapped_exec.h blah_check_config.h BLfunctions.h cmdbuffer.h job_registry_updater.h +noinst_HEADERS = blahpd.h classad_binary_op_unwind.h classad_c_helper.h commands.h job_status.h resbuffer.h server.h console.h BPRcomm.h tokens.h BLParserPBS.h BLParserLSF.h proxy_hashcontainer.h job_registry.h md5.h config.h BUpdaterCondor.h Bfunctions.h BNotifier.h BUpdaterLSF.h BUpdaterPBS.h BUpdaterSGE.h blah_utils.h env_helper.h mapped_exec.h blah_check_config.h BLfunctions.h cmdbuffer.h job_registry_updater.h diff --git a/src/acconfig.h b/src/acconfig.h new file mode 100644 index 00000000..e69de29b diff --git a/src/blah_job_registry_scan_by_subject.c b/src/blah_job_registry_scan_by_subject.c index daf11d00..f12a8d60 100644 --- a/src/blah_job_registry_scan_by_subject.c +++ b/src/blah_job_registry_scan_by_subject.c @@ -1,13 +1,11 @@ /* * File : blah_job_registry_scan_by_subject.c * - * Author : Francesco Prelz ($Author: fprelz $) + * Author : Francesco Prelz ($Author: mezzadri $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : * 5-May-2009 Original release - * 16-Jul-2012 Added statistics count of jobs. Allow empty hash. - * 16-Jul-2012 Added job user prefix filter. 
* * Description: * Executable to look up for entries in the BLAH job registry @@ -290,7 +288,7 @@ get_format_type(char *fmt, int which, int *totfmts) return result; } -#define USAGE_STRING "ERROR Usage: %s [-total] [<-s (proxy subject)>|<-h (proxy subject hash>] [-p (user prefix)] [-j job_status[\\|job_status]] \"Optional arg1 format\" arg1 \"Optional arg2 format\" arg2, etc.\n" +#define USAGE_STRING "ERROR Usage: %s (<-s (proxy subject)>|<-h (proxy subject hash>) [-j job_status[\\|job_status]] \"Optional arg1 format\" arg1 \"Optional arg2 format\" arg2, etc.\n" static void print_usage(char *name) @@ -325,86 +323,85 @@ main(int argc, char *argv[]) char *lookup_subject = NULL; char *lookup_hash = NULL; int cur_arg; - int format_args = -1; + int format_args; int select_by_job_status = 0; int ifr; int njobs = 0; format_type first_fmt; int nfmts; - int total_only = 0; - char *test_user_prefix = NULL; - int test_user_prefix_len = 0; - if (argc > 1) + if (argc < 2) { - cur_arg = 1; - - while (argv[cur_arg][0] == '-') + print_usage(argv[0]); + return 1; + } + + cur_arg = 1; + + while (argv[cur_arg][0] == '-') + { + format_args = -1; + /* Look up for command line switches */ + if (strlen(argv[cur_arg]) > 2) { - format_args = -1; - /* Look up for command line switches */ - if (strlen(argv[cur_arg]) > 2) + arg = argv[cur_arg] + 2; + if (argc > (cur_arg+1)) { - arg = argv[cur_arg] + 2; - if (argc > (cur_arg+1)) + format_args = cur_arg+1; + } + } + else if (argc > (cur_arg+1)) + { + arg = argv[cur_arg+1]; + if (argc > (cur_arg+2)) + { + format_args = cur_arg+2; + } + } + + if (strlen(arg) <= 0) + { + print_usage(argv[0]); + return 1; + } + + switch (argv[cur_arg][1]) + { + case 'h': + if (lookup_hash != NULL) { - format_args = cur_arg+1; + print_usage(argv[0]); + return 1; } - } - else if (argc > (cur_arg+1)) - { - arg = argv[cur_arg+1]; - if (argc > (cur_arg+2)) - { - format_args = cur_arg+2; - } - } - - if (strlen(arg) <= 0) - { - print_usage(argv[0]); - return 1; - } - - 
switch (argv[cur_arg][1]) - { - case 'h': - if (lookup_hash != NULL) - { - print_usage(argv[0]); - return 1; - } - lookup_hash = arg; - break; - case 's': - if (lookup_hash != NULL) - { - print_usage(argv[0]); - return 1; - } - job_registry_compute_subject_hash(&hen, arg); - lookup_subject = arg; - lookup_hash = hen.subject_hash; - break; - case 'j': - select_by_job_status = parse_job_state_condition(arg); - break; - case 't': - total_only = 1; - break; - case 'p': - test_user_prefix = arg; - test_user_prefix_len = strlen(arg); - break; - default: + lookup_hash = arg; + break; + case 's': + if (lookup_hash != NULL) + { print_usage(argv[0]); return 1; - } - if ((format_args > 0) && (format_args < argc)) cur_arg = format_args; - else break; + } + job_registry_compute_subject_hash(&hen, arg); + lookup_subject = arg; + lookup_hash = hen.subject_hash; + break; + case 'j': + select_by_job_status = parse_job_state_condition(arg); + break; + default: + print_usage(argv[0]); + return 1; } + if ((format_args > 0) && (format_args < argc)) cur_arg = format_args; + else break; } + if (lookup_hash == NULL) + { + print_usage(argv[0]); + return 1; + } + cha = config_read(NULL); /* Read config from default locations. 
*/ if (cha != NULL) { @@ -449,23 +446,20 @@ main(int argc, char *argv[]) if (cha != NULL) config_free(cha); if (need_to_free_registry_file) free(registry_file); - if (lookup_hash != NULL) + looked_up_subject = job_registry_lookup_subject_hash(rha, lookup_hash); + if (looked_up_subject == NULL) { - looked_up_subject = job_registry_lookup_subject_hash(rha, lookup_hash); - if (looked_up_subject == NULL) + fprintf(stderr,"%s: Hash %s is not found in registry %s.\n",argv[0], + lookup_hash, rha->path); + job_registry_destroy(rha); + return 5; + } else { + if ((lookup_subject != NULL) && + (strcmp(looked_up_subject, lookup_subject) != 0)) { - fprintf(stderr,"%s: Hash %s is not found in registry %s.\n",argv[0], - lookup_hash, rha->path); - job_registry_destroy(rha); - return 5; - } else { - if ((lookup_subject != NULL) && - (strcmp(looked_up_subject, lookup_subject) != 0)) - { - fprintf(stderr, "%s: Warning: cached subject (%s) differs from the requested subject (%s)\n", argv[0], looked_up_subject, lookup_subject); - } - free(looked_up_subject); + fprintf(stderr, "%s: Warning: cached subject (%s) differs from the requested subject (%s)\n", argv[0], looked_up_subject, lookup_subject); } + free(looked_up_subject); } fd = job_registry_open(rha, "r"); @@ -491,31 +485,14 @@ main(int argc, char *argv[]) for (ifr = format_args; ifr < argc; ifr+=2) undo_escapes(argv[ifr]); } - if (lookup_hash == NULL) lookup_hash = ""; - while ((ren = job_registry_get_next_hash_match(rha, fd, lookup_hash)) != NULL) { /* Is the current entry in the requested job status ? 
*/ if ((select_by_job_status != 0) && (!check_job_state_condition(select_by_job_status, ren->status))) - { - free(ren); - continue; - } - - if ((test_user_prefix != NULL) && - (strncmp(ren->user_prefix, test_user_prefix, test_user_prefix_len) != 0)) - { - free(ren); continue; - } njobs++; - if (total_only != 0) - { - free(ren); - continue; - } cad = job_registry_entry_as_classad(rha, ren); if (cad != NULL) @@ -583,8 +560,5 @@ main(int argc, char *argv[]) fclose(fd); job_registry_destroy(rha); - if (total_only != 0) printf("%s: Matched entries: %d\n", argv[0], njobs); - - if (total_only && (njobs>0)) return ((njobs%127)+1); return 0; } diff --git a/src/blparser_master.c b/src/blparser_master.c index 1c73baee..41b45c0b 100644 --- a/src/blparser_master.c +++ b/src/blparser_master.c @@ -34,7 +34,9 @@ #include #include #include +#ifdef MTRACE_ON #include +#endif #include #include #include @@ -144,18 +146,18 @@ check_on_children_args(const struct blah_managed_child *children, const int coun fret = fork(); if (fret == 0) { - if((j = wordexp(children[i].exefile, &args, 0))) + if((j = wordexp(children[i].exefile, &args, WRDE_NOCMD))) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", children[i].exefile, j); - return; - } + _exit(1); + } /* Child process. Exec exe file. 
*/ if (execv(args.we_wordv[0], args.we_wordv) < 0) { fprintf(stderr,"Cannot exec %s: %s\n", children[i].exefile, strerror(errno)); - exit(1); + _exit(1); } /* Free the wordexp'd args */ wordfree(&args); diff --git a/src/classad_binary_op_unwind.C b/src/classad_binary_op_unwind.C index 4eb7f0d7..bd352698 100644 --- a/src/classad_binary_op_unwind.C +++ b/src/classad_binary_op_unwind.C @@ -39,10 +39,10 @@ #include // strcasecmp -#include "classad_distribution.h" +#include "classad/classad_distribution.h" #include "classad_binary_op_unwind.h" -#ifdef WANT_NAMESPACES +#if 1 namespace classad { #endif @@ -221,6 +221,6 @@ UnparseAux( std::string &buffer, std::string &fnName, std::vector& ar return; } -#ifdef WANT_NAMESPACES +#if 1 } // end of classad namespace #endif diff --git a/src/classad_binary_op_unwind.h b/src/classad_binary_op_unwind.h index 227892f3..ec4a492b 100644 --- a/src/classad_binary_op_unwind.h +++ b/src/classad_binary_op_unwind.h @@ -33,16 +33,14 @@ # */ -#include "classad_distribution.h" +#include "classad/classad_distribution.h" #ifndef __CLASSAD_BINARY_OP_UNWIND_H__ #define __CLASSAD_BINARY_OP_UNWIND_H__ -#ifdef WANT_NAMESPACES using namespace classad; namespace classad { -#endif class BinaryOpUnwind : public ClassAdUnParser { @@ -64,8 +62,6 @@ class BinaryOpUnwind : public ClassAdUnParser std::vector m_unwind_output; }; -#ifdef WANT_NAMESPACES } // end of classad namespace -#endif #endif // defined __CLASSAD_BINARY_OP_UNWIND_H__ diff --git a/src/classad_c_helper.C b/src/classad_c_helper.C index 5e98d750..560a2ab7 100644 --- a/src/classad_c_helper.C +++ b/src/classad_c_helper.C @@ -40,7 +40,7 @@ */ #include -#include "classad_distribution.h" +#include "classad/classad_distribution.h" #include "classad_binary_op_unwind.h" #ifdef WANT_NAMESPACES @@ -289,7 +289,7 @@ extern "C" ExprList *et_value; et_value = ExprList::MakeExprList(et_ads); - if (ad->Insert (name, et_value)) return C_CLASSAD_NO_ERROR; + if (ad->Insert (name, (ExprTree* &)et_value)) return 
C_CLASSAD_NO_ERROR; else return C_CLASSAD_INSERT_FAILED; } diff --git a/src/config.c b/src/config.c index 2000d8f1..0244bb7f 100644 --- a/src/config.c +++ b/src/config.c @@ -9,6 +9,8 @@ * 23-Nov-2007 Original release * 24-Apr-2009 Added parsing of shell arrays. * 13-Jan-2012 Added sbin and libexec install dirs. + * 30-Nov-2012 Added ability to locally setenv the env variables + * that are exported in the config file. * * Description: * Small library for access to the BLAH configuration file. @@ -77,8 +79,47 @@ config_parse_array_values(config_entry *en) } } +int +config_setenv(const char *ipath) + { + const char *printenv_command_before = "printenv"; + const char *printenv_command_after = ". %s;printenv"; + config_handle *envs_before; + config_handle *envs_after; + config_entry *cur; + int n_added = 0; + + envs_before = config_read_cmd(ipath, printenv_command_before); + envs_after = config_read_cmd(ipath, printenv_command_after); + + + /* Set in the local environment all env variables that were exported in */ + /* the config file. */ + + for (cur = envs_after->list; cur != NULL; cur=cur->next) + { + if (config_get(cur->key, envs_before) == NULL) + { + setenv(cur->key, cur->value, 1); + n_added++; + } + } + + config_free(envs_before); + config_free(envs_after); + + return n_added; + } + config_handle * config_read(const char *ipath) + { + const char *set_command_format = ". %s; set"; + return config_read_cmd(ipath, set_command_format); + } + +config_handle * +config_read_cmd(const char *ipath, const char *set_command_format) { char *path; char *install_location=NULL; @@ -91,7 +132,6 @@ config_read(const char *ipath) config_entry *c_tail = NULL; config_entry *found,*new_entry=NULL; char *set_command=NULL; - const char *set_command_format = ". 
%s; set"; int set_command_size; int line_len = 0; int line_alloc = 0; @@ -434,6 +474,8 @@ int main(int argc, char *argv[]) { int tcf; + int n_env; + char *test_env; char *path; const char *test_config = "\n" @@ -451,6 +493,7 @@ main(int argc, char *argv[]) "b4=0\n" "b4=\" Junk\"\n" "b5=\" False\"\n" + "export e1=\" My Env Variable \"\n" "file=/tmp/test_`whoami`.bjr\n" "arr[0]=value_0\n" "arr[3]=value_3\n" @@ -488,13 +531,27 @@ main(int argc, char *argv[]) setenv("BLAHPD_CONFIG_LOCATION",path,1); cha = config_read(NULL); + n_env = config_setenv(NULL); unlink(path); if (cha == NULL) { fprintf(stderr,"%s: Error reading config from %s: ",argv[0],path); + perror(""); return 4; } + if (n_env <= 0) + { + fprintf(stderr,"%s: No new env variables found in %s.\n",argv[0],path); + r=30; + } + if ((test_env = getenv("e1")) == NULL) + { + fprintf(stderr,"%s: Env variable e1 not found in %s.\n",argv[0],path); + r=31; + } + else printf("e1 env == <%s>\n", test_env); + ret = config_get("a",cha); if (ret == NULL) fprintf(stderr,"%s: key a not found\n",argv[0]),r=5; else if (atoi(ret->value) != 123) fprintf(stderr,"%s: key a != 123\n",argv[0]),r=6; diff --git a/src/config.h b/src/config.h index c96b9d28..f13af836 100644 --- a/src/config.h +++ b/src/config.h @@ -8,6 +8,8 @@ * Revision history : * 23-Nov-2007 Original release * 13-Jan-2012 Added sbin and libexec install dirs. + * 30-Nov-2012 Added ability to locally setenv the env variables + * that are exported in the config file. 
* * Description: * Prototypes of functions defined in config.c @@ -55,6 +57,8 @@ typedef struct config_handle_s } config_handle; config_handle *config_read(const char *path); +config_handle *config_read_cmd(const char *path, const char *cmd); +int config_setenv(const char *ipath); config_entry *config_get(const char *key, config_handle *handle); int config_test_boolean(const config_entry *entry); void config_free(config_handle *handle); diff --git a/src/job_registry.c b/src/job_registry.c index 27d137af..a5aeff0b 100644 --- a/src/job_registry.c +++ b/src/job_registry.c @@ -2,7 +2,7 @@ * File : job_registry.c * * - * Author : Francesco Prelz ($Author: mezzadri $) + * Author : Francesco Prelz ($Author: fprelz $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : @@ -29,6 +29,7 @@ * case is guaranteed by the invoking service). * Added job_registry_check_index_key_uniqueness. * 21-Jul-2011 Added job_registry_need_update function. + * 11-Sep-2015 Always return most recent job in job_registry_get_recnum. 
* * Description: * File-based container to cache job IDs and statuses to implement @@ -359,7 +360,7 @@ job_registry_probe_next_record(FILE *fd, job_registry_entry *en) { size_t rsize = 0; size_t act_size; - long start_pos, end_pos; + long start_pos, end_pos, offset; int sret, eret, cret; int ic; size_t allowed_size_incs[N_JOB_REGISTRY_ALLOWED_ENTRY_SIZE_INCS] = @@ -400,7 +401,7 @@ job_registry_probe_next_record(FILE *fd, job_registry_entry *en) } else { - if (feof(fd)) en->magic_start = JOB_REGISTRY_MAGIC_START; + if (feof(fd)) en->magic_start == JOB_REGISTRY_MAGIC_START; break; } } @@ -452,8 +453,10 @@ job_registry_update_reg(const job_registry_handle *rha, { FILE *of, *nf; job_registry_entry en; + unsigned char *enendp; int encount; int rret, wret; + int i; of = fopen(old_path,"r"); if (of == NULL) return -1; @@ -522,6 +525,7 @@ job_registry_init(const char *path, char real_file_name[FILENAME_MAX]; int rlnk_status; mode_t old_umask; + const char *npu_tail="/npu"; int cfd; FILE *fd; char *old_lockfile, *old_npudir, *old_path=NULL; @@ -1494,7 +1498,7 @@ int job_registry_append_op(job_registry_handle *rha, job_registry_entry *entry, FILE *fd, time_t now) { - job_registry_recnum_t found; + job_registry_recnum_t found,curr_recn; job_registry_entry last; long curr_pos; int need_to_fclose = FALSE; @@ -1598,7 +1602,9 @@ job_registry_get_new_npufd(job_registry_handle *rha) FILE *rfd = NULL; int lfd; char *tp; + struct stat fst; const char *npu_tail="/npu_XXXXXX"; + int i; /* Append a filename to rha->npudir, so it can be passed back to */ /* jobregistry_construct_path */ @@ -1690,6 +1696,7 @@ int job_registry_merge_pending_nonpriv_updates(job_registry_handle *rha, FILE *fd) { + int i; int nadd = 0; int rapp; int frret; @@ -2135,7 +2142,8 @@ job_registry_need_update(const job_registry_entry *olde, * Binary search for an entry in the indexed, sorted job registry pointed to by * rha. The record number in the current JR cache is returned. * No file access is required. 
- * In case multiple entries are found, the lowest recnum is returned. + * In case multiple entries are found, the highest (most recent) recnum + * is returned. * * @param rha Pointer to a job registry handle returned by job_registry_init. * @param id Job id key to be looked up @@ -2165,12 +2173,12 @@ job_registry_get_recnum(const job_registry_handle *rha, /* Check for duplicates. */ for (tcur=cur-1; tcur >=0 && strcmp(rha->entries[tcur].id,id)==0; tcur--) { - if (rha->entries[tcur].recnum < found) found = rha->entries[tcur].recnum; + if (rha->entries[tcur].recnum > found) found = rha->entries[tcur].recnum; } for (tcur=cur+1;tcur < rha->n_entries && strcmp(rha->entries[tcur].id,id)==0; tcur++) { - if (rha->entries[tcur].recnum < found) found = rha->entries[tcur].recnum; + if (rha->entries[tcur].recnum > found) found = rha->entries[tcur].recnum; } break; } @@ -2305,6 +2313,33 @@ job_registry_get(job_registry_handle *rha, } firstrec = job_registry_firstrec(rha,fd); + + /* Determine if the job registry index must be resync'd. + * The record numbers are monotonically increasing through the lifetime + * of the registry; the firstrec we read from the data file above must + * match the firstrec in our in-memory index. The firstrec on the index + * is guaranteed to change if a purge operation occurred. + */ + if (firstrec != rha->firstrec) + { + int retval = job_registry_resync(rha, fd); + if (retval < 0) // Registry failed to update. + { + fclose(fd); + return NULL; + } + if (retval > 0) // Registry has been updated; our lookup was invalid. + { + found = job_registry_lookup(rha, id); + if (found == 0) + { + errno = ENOENT; + fclose(fd); + return NULL; + } + } + } + /* Was this record just purged ? 
*/ if ((firstrec > rha->firstrec) && (found >= rha->firstrec) && (found < firstrec)) { @@ -2364,7 +2399,7 @@ job_registry_open(job_registry_handle *rha, const char *mode) int job_registry_unlock(FILE *sfd) { - int fd; + int fd, lfd; struct flock ulock; int ret; @@ -2647,6 +2682,7 @@ job_registry_entry_as_classad(const job_registry_handle *rha, "CreateTime=%u; ModifiedTime=%u; UserTime=%u; " "SubmitterUid=%d; %s]"; char *result, *fmt_extra, *extra_attrs=NULL, *new_extra_attrs; + char *extra_attrs_append; int extra_attrs_size = 0; int need_to_free_extra_attrs = FALSE; int esiz,fsiz; @@ -3113,6 +3149,7 @@ job_registry_lookup_subject_hash(const job_registry_handle *rha, { FILE *fd; char subline[JOB_REGISTRY_MAX_SUBJECTLIST_LINE]; + int retcod; char *en; if (rha == NULL || hash == NULL) return NULL; diff --git a/src/job_registry_updater.c b/src/job_registry_updater.c index 57d8ecdf..9e66e8fe 100644 --- a/src/job_registry_updater.c +++ b/src/job_registry_updater.c @@ -2,13 +2,12 @@ * File : job_registry_updater.c * * - * Author : Francesco Prelz ($Author: drebatto $) + * Author : Francesco Prelz ($Author: fprelz $) * e-mail : "francesco.prelz@mi.infn.it" * * Revision history : * 13-Jul-2011 Original release * 19-Jul-2011 Added transfer of full proxy subject and path. - * 19-Jul-2012 Exclude local addresses from possible destinations. * * Description: * Protocol to distribute network updates to the BLAH registry. 
@@ -42,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -191,12 +189,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, int n_added = 0; int is_multicast; struct ip_mreqn mreq4; - struct ifconf ifc; /* holds IOCTL return value for SIOCGIFCONF */ - int iofd = -1; - int ifconf_ret, numreqs = 30, n; - struct ifreq *ifr; /* points to one interface returned from ioctl */ - struct sockaddr_in *sin, *lin; - struct sockaddr_in6 *sin6, *lin6; if (endpoints == NULL) { @@ -211,34 +203,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, last = cur; } - /* Get local interface data via SIOCGIFCONF ioctl */ - iofd = socket (PF_INET, SOCK_DGRAM, 0); - if (iofd >= 0) - { - memset (&ifc, 0, sizeof(ifc)); - - ifc.ifc_buf = NULL; - - for (;;) - { - ifc.ifc_len = sizeof(struct ifreq) * numreqs; - ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len); - - if ((ifconf_ret = ioctl(iofd, SIOCGIFCONF, &ifc)) < 0) - { - break; - } - if (ifc.ifc_len == sizeof(struct ifreq) * numreqs) - { - /* assume it overflowed and try again */ - numreqs += 10; - continue; - } - break; - } - } - - for (i=0; i= 0) @@ -246,54 +210,12 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, /* Look for a workable address */ for (cur_ans = ai_ans; cur_ans != NULL; cur_ans = cur_ans->ai_next) { - /* Exclude local addresses */ - if ((iofd >= 0) && (ifconf_ret >= 0)) - { - /* loop through interfaces returned from SIOCGIFCONF */ - ifr = ifc.ifc_req; - for (n=0; n < ifc.ifc_len; n+=sizeof(struct ifreq)) - { - /* Get the interface address */ - if (ioctl(iofd,SIOCGIFADDR, ifr) == 0 ) - { - if (ifr->ifr_ifru.ifru_addr.sa_family == cur_ans->ai_family) - { - switch(cur_ans->ai_family) - { - case AF_INET: - /* IPV4 case */ - lin = (struct sockaddr_in *)&ifr->ifr_ifru.ifru_addr; - sin = (struct sockaddr_in *)(cur_ans->ai_addr); - if ((lin->sin_addr.s_addr) == (sin->sin_addr.s_addr)) - continue; - break; - case AF_INET6: - 
/* IPV6 case */ - lin6 = (struct sockaddr_in6 *)&ifr->ifr_ifru.ifru_addr; - sin6 = (struct sockaddr_in6 *)(cur_ans->ai_addr); - if (memcmp(lin6->sin6_addr.s6_addr, sin6->sin6_addr.s6_addr, 16) == 0) - continue; - break; - default: - /* Unknown family */ - break; - } - } - } - } - } - tfd = socket(cur_ans->ai_family, cur_ans->ai_socktype, cur_ans->ai_protocol); if (tfd < 0) { if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_SOCKET_FAIL; } else continue; @@ -319,11 +241,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MCAST_FAIL; } else continue; @@ -339,11 +256,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MCAST_FAIL; } else continue; @@ -362,11 +274,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_MALLOC_FAIL; } else continue; @@ -382,11 +289,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_TTL_FAIL; } else continue; @@ -398,11 +300,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, if (n_destinations == 1) { freeaddrinfo(ai_ans); - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return JOB_REGISTRY_CONNECT_FAIL; } else continue; @@ -424,11 +321,6 @@ job_registry_updater_setup_sender(char **destinations, int n_destinations, return pretcod; } } - if (iofd >= 0) - { - free(ifc.ifc_buf); - close(iofd); - } return 
n_added; } diff --git a/src/job_status.c b/src/job_status.c index 411da3e2..ac26f22e 100644 --- a/src/job_status.c +++ b/src/job_status.c @@ -126,8 +126,15 @@ get_status(const char *jobDesc, classad_context *cad, char **deleg_parameters, c return(255); } - exec_command.command = make_message("%s/%s_status.sh %s %s", blah_script_location, - spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + if (strcmp(spid->lrms, "pbs") == 0) { + exec_command.command = make_message("%s/%s_status.py %s %s", blah_script_location, + spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + } + else + { + exec_command.command = make_message("%s/%s_status.sh %s %s", blah_script_location, + spid->lrms, (get_workernode ? "-w" : ""), jobDesc); + } if (exec_command.command == NULL) { fprintf(stderr, "blahpd: out of memory"); diff --git a/src/main.c b/src/main.c index 0c7f842d..a5a7c485 100644 --- a/src/main.c +++ b/src/main.c @@ -43,8 +43,9 @@ #include #include #include +#ifdef MTRACE_ON #include - +#endif #include "blahpd.h" #include "server.h" #include "console.h" diff --git a/src/main_daemon.c b/src/main_daemon.c index dbaebaec..24fd5dce 100644 --- a/src/main_daemon.c +++ b/src/main_daemon.c @@ -43,8 +43,9 @@ #include #include #include +#ifdef MTRACE_ON #include - +#endif #include "blahpd.h" #include "server.h" #include "console.h" diff --git a/src/mapped_exec.c b/src/mapped_exec.c index 1f74502e..82bb72cf 100644 --- a/src/mapped_exec.c +++ b/src/mapped_exec.c @@ -430,7 +430,7 @@ execute_cmd(exec_cmd_t *cmd) } /* Do the shell expansion */ - if(wordexp_err = wordexp(command, &args, 0)) + if(wordexp_err = wordexp(command, &args, WRDE_NOCMD)) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", command, wordexp_err); return(1); diff --git a/src/mtsafe_popen.c b/src/mtsafe_popen.c index aab48bc8..b5e0a50d 100644 --- a/src/mtsafe_popen.c +++ b/src/mtsafe_popen.c @@ -308,7 +308,7 @@ exe_getouterr(char *const command, char *const environment[], char **cmd_output, 
envcopy[envcopy_size + i] = (char *)NULL; /* Do the shell expansion */ - if(i = wordexp(command, &args, 0)) + if(i = wordexp(command, &args, WRDE_NOCMD)) { fprintf(stderr,"wordexp: unable to parse the command line \"%s\" (error %d)\n", command, i); return(1); diff --git a/src/resbuffer.c b/src/resbuffer.c index 1a1782ed..cb9b6849 100644 --- a/src/resbuffer.c +++ b/src/resbuffer.c @@ -149,6 +149,7 @@ get_lines(void) * */ { char *res_lines = NULL; + char *reallocated; /* Acquire lock */ pthread_mutex_lock(&resbuffer_lock); diff --git a/src/scripts/Makefile.am b/src/scripts/Makefile.am index 34f241b5..6f7db77b 100644 --- a/src/scripts/Makefile.am +++ b/src/scripts/Makefile.am @@ -2,11 +2,11 @@ # * BLAHP scripts * # ***************** # -# $Id: Makefile.am,v 1.11 2012/07/04 11:03:06 drebatto Exp $ +# $Id: Makefile.am,v 1.10 2012/01/13 11:23:44 mezzadri Exp $ # # File: Makefile.am # -# Author(s): Francesco Prelz ($Author: drebatto $) +# Author(s): Francesco Prelz ($Author: mezzadri $) # e-mail: "Francesco.Prelz@mi.infn.it" # # Revision history: @@ -37,6 +37,10 @@ libexec_SCRIPTS = blah_load_config.sh blah_common_submit_functions.sh \ condor_cancel.sh condor_status.sh condor_submit.sh condor_hold.sh condor_resume.sh \ sge_cancel.sh sge_helper sge_resume.sh sge_submit.sh sge_filestaging \ sge_hold.sh sge_status.sh runcmd.pl.template sge_local_submit_attributes.sh \ - slurm_submit.sh slurm_status.sh slurm_cancel.sh + slurm_cancel.sh slurm_hold.sh slurm_resume.sh slurm_status.sh \ + slurm_submit.sh slurm_local_submit_attributes.sh \ + blah.py __init__.py \ + pbs_status.py \ + slurm_status.py EXTRA_DIST = $(bin_SCRIPTS) diff --git a/src/scripts/__init__.py b/src/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scripts/blah.py b/src/scripts/blah.py new file mode 100644 index 00000000..99dda1ab --- /dev/null +++ b/src/scripts/blah.py @@ -0,0 +1,34 @@ +"""Common functions for BLAH python scripts""" + +from ConfigParser import RawConfigParser +# 
TODO: io.StringIO is preferred in Python3 since it handles unicode-encoded files +from StringIO import StringIO + +class BlahConfigParser(RawConfigParser, object): + + def __init__(self, path='/etc/blah.config', defaults=None): + # RawConfigParser requires ini-style [section headers] but since + # blah.config is also used as a shell script we need to fake one + self.header = 'blahp' + with open(path) as f: + config = f.read() + vfile = StringIO('[%s]\n%s' % (self.header, config)) + + super(BlahConfigParser, self).__init__(defaults=defaults) + # TODO: readfp() is replaced by read_file() in Python 3.2+ + self.readfp(vfile) + + def items(self): + return super(BlahConfigParser, self).items(self.header) + + def get(self, option): + # ConfigParser happily includes quotes in value strings, which we + # happily allow in /etc/blah.config. This causes failures when joining + # paths, for example. + return super(BlahConfigParser, self).get(self.header, option).strip('"\'') + + def set(self, option, value): + return super(BlahConfigParser, self).set(self.header, option, value) + + def has_option(self, option): + return super(BlahConfigParser, self).has_option(self.header, option) diff --git a/src/scripts/blah_common_submit_functions.sh b/src/scripts/blah_common_submit_functions.sh index f48166e9..dde85030 100644 --- a/src/scripts/blah_common_submit_functions.sh +++ b/src/scripts/blah_common_submit_functions.sh @@ -1,3 +1,4 @@ +#!/bin/bash # File: blah_common_submit_functions.sh # # Author: Francesco Prelz @@ -129,6 +130,34 @@ function bls_fl_subst_and_accumulate () done } +function bls_fl_test_exists () +{ +# +# Usage: bls_fl_test_exists container_name +# Verfies all container_name "@@F_LOCAL" exists +# First missing file is returned in $bls_fl_test_exists_result. 
+# + local container_name + + container_name=${1:?"Missing container name argument to bls_fl_subst_and_accumulate"} + + local last_argument + + eval "last_argument=\${bls_${container_name}_counter:=0}" + + local ind + bls_fl_test_exists_result= + for (( ind=0 ; ind < $last_argument ; ind++ )) ; do + bls_fl_subst $container_name $ind "@@F_LOCAL" + if [ ! -z "$bls_fl_subst_result" -a ! -f "$bls_fl_subst_result" ] ; then + bls_fl_test_exists_result="${bls_fl_subst_result}" + return 1 + fi + done + return 0 +} + + function bls_fl_subst_and_dump () { # @@ -267,7 +296,7 @@ function bls_parse_submit_options () ############################################################### # Parse parameters ############################################################### - while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:" arg + while getopts "a:i:o:e:c:s:v:V:dw:q:n:N:z:h:S:r:p:l:x:u:j:T:I:O:R:C:D:m:" arg do case "$arg" in a) bls_opt_xtra_args="$OPTARG" ;; @@ -297,6 +326,8 @@ function bls_parse_submit_options () O) bls_opt_outputflstring="$OPTARG" ;; R) bls_opt_outputflstringremap="$OPTARG" ;; C) bls_opt_req_file="$OPTARG";; + D) bls_opt_run_dir="$OPTARG";; + m) bls_opt_req_mem="$OPTARG";; -) break ;; ?) 
echo $usage_string exit 1 ;; @@ -483,16 +514,20 @@ function bls_setup_all_files () bls_proxy_remote_file=${bls_tmp_name}.proxy bls_test_shared_dir "$bls_proxy_local_file" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then - bls_fl_add_value inputcopy "$bls_proxy_local_file" "${bls_proxy_remote_file}" + if [ "x$bls_opt_proxyrenew" == "xyes" ] ; then + bls_fl_add_value inputcopy "$bls_proxy_local_file" "${bls_proxy_remote_file}" + bls_need_to_reset_proxy=yes + fi else - bls_fl_add_value inputsand "$bls_proxy_local_file" "${blah_wn_inputsandbox}${bls_proxy_remote_file}" "$bls_proxy_remote_file" + bls_fl_add_value inputsand "$bls_proxy_local_file" "${blah_wn_inputsandbox}${bls_proxy_remote_file}" "$bls_proxy_remote_file" + bls_need_to_reset_proxy=yes fi - bls_need_to_reset_proxy=yes fi fi # Setup stdin, stdout & stderr if [ ! -z "$bls_opt_stdin" ] ; then + if [ "${bls_opt_stdin:0:1}" != "/" ] ; then bls_opt_stdin=${bls_opt_workdir}/${bls_opt_stdin} ; fi if [ -f "$bls_opt_stdin" ] ; then bls_test_shared_dir "$bls_opt_stdin" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then @@ -507,22 +542,22 @@ function bls_setup_all_files () fi fi if [ ! -z "$bls_opt_stdout" ] ; then + if [ "${bls_opt_stdout:0:1}" != "/" ] ; then bls_opt_stdout=${bls_opt_workdir}/${bls_opt_stdout} ; fi bls_test_shared_dir "$bls_opt_stdout" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_arguments="$bls_arguments > \"$bls_opt_stdout\"" else - if [ "${bls_opt_stdout:0:1}" != "/" ] ; then bls_opt_stdout=${bls_opt_workdir}/${bls_opt_stdout} ; fi bls_unique_stdout_name="${blah_wn_outputsandbox}out_${bls_tmp_name}_`basename $bls_opt_stdout`" bls_arguments="$bls_arguments > \"$bls_unique_stdout_name\"" bls_fl_add_value outputsand "$bls_opt_stdout" "$bls_unique_stdout_name" fi fi if [ ! 
-z "$bls_opt_stderr" ] ; then + if [ "${bls_opt_stderr:0:1}" != "/" ] ; then bls_opt_stderr=${bls_opt_workdir}/${bls_opt_stderr} ; fi bls_test_shared_dir "$bls_opt_stderr" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_arguments="$bls_arguments 2> \"$bls_opt_stderr\"" else - if [ "${bls_opt_stderr:0:1}" != "/" ] ; then bls_opt_stderr=${bls_opt_workdir}/${bls_opt_stderr} ; fi if [ "$bls_opt_stderr" == "$bls_opt_stdout" ]; then bls_arguments="$bls_arguments 2>&1" else @@ -542,6 +577,7 @@ function bls_setup_all_files () exec 4< "$bls_opt_inputflstring" while read xfile <&4 ; do if [ ! -z $xfile ] ; then + if [ "${xfile:0:1}" != "/" ] ; then xfile=${bls_opt_workdir}/${xfile} ; fi bls_test_shared_dir "$xfile" if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then bls_fl_add_value inputcopy "$xfile" "./`basename ${xfile}`" @@ -569,28 +605,19 @@ function bls_setup_all_files () read xfileremap <&6 fi - bls_test_shared_dir "$xfile" + if [ -z $xfileremap ] ; then + xfileremap="$xfile" + fi + if [ "${xfileremap:0:1}" != "/" ] ; then + xfileremap=${bls_opt_workdir}/${xfileremap} + fi + bls_test_shared_dir "$xfileremap" if [ "x$bls_is_in_shared_dir" != "xyes" ] ; then - if [ "${xfile:0:1}" != "/" ] ; then - xfile_base="`basename ${xfile}`" - xfile_transfer="${blah_wn_outputsandbox}${xfile_base}.$uni_ext" - else - xfile_transfer="$xfile" - fi - if [ ! 
-z $xfileremap ] ; then - if [ "${xfileremap:0:1}" != "/" ] ; then - bls_fl_add_value outputsand "${bls_opt_workdir}/${xfileremap}" "$xfile_transfer" "$xfile" - else - bls_test_shared_dir "$xfileremap" - if [ "x$bls_is_in_shared_dir" == "xyes" ] ; then - bls_fl_add_value outputmove "${xfileremap}" "$xfile" - else - bls_fl_add_value outputsand "${xfileremap}" "$xfile_transfer" "$xfile" - fi - fi - else - bls_fl_add_value outputsand "${bls_opt_workdir}/${xfile}" "$xfile_transfer" "$xfile" - fi + xfile_base="`basename ${xfile}`" + xfile_transfer="${blah_wn_outputsandbox}${xfile_base}.$uni_ext" + bls_fl_add_value outputsand "$xfileremap" "$xfile_transfer" "$xfile" + else + bls_fl_add_value outputmove "$xfileremap" "$xfile" fi fi done @@ -617,17 +644,31 @@ function bls_start_job_wrapper () fi fi + JOB_ENV="/var/lib/osg/osg-job-environment.conf" + LOCAL_JOB_ENV="/var/lib/osg/osg-local-job-environment.conf" + for fname in $JOB_ENV $LOCAL_JOB_ENV; do + test -r $fname && echo "`grep -G \"^[^# ]\" $fname`" + done + echo "old_home=\`pwd\`" # Set the temporary home (including cd'ing into it) + if [ "x$bls_opt_run_dir" != "x" ] ; then + run_dir="$bls_opt_run_dir" + else + run_dir="home_$bls_tmp_name" + fi if [ -n "$blah_wn_temporary_home_dir" ] ; then - echo "new_home=${blah_wn_temporary_home_dir}/home_$bls_tmp_name" + echo "new_home=${blah_wn_temporary_home_dir}/$run_dir" else - echo "new_home=\${old_home}/home_$bls_tmp_name" + echo "new_home=\${old_home}/$run_dir" fi - echo "mkdir \$new_home" - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home; exit 255' 1 2 3 15 24" - echo "trap 'wait \$job_pid; cd \$old_home; rm -rf \$new_home' 0" + echo 'mkdir "$new_home"' + echo 'job_wait_cleanup () { wait "$job_pid"; cd "$old_home"; rm -rf "$new_home"; }' + echo 'on_signal () { kill -$1 "$job_pid"; job_wait_cleanup; exit 255; }' + echo 'trap_sigs () { for sig; do trap "on_signal $sig" $sig; done; }' + echo 'trap_sigs HUP INT QUIT TERM XCPU' + echo 'trap job_wait_cleanup EXIT' 
echo "# Copy into new home any shared input sandbox file" bls_fl_subst_and_dump inputcopy "cp \"@@F_LOCAL\" \"\$new_home/@@F_REMOTE\" &> /dev/null" @@ -641,6 +682,8 @@ function bls_start_job_wrapper () if [ "x$bls_need_to_reset_proxy" == "xyes" ] ; then echo "# Resetting proxy to local position" echo "export X509_USER_PROXY=\$new_home/${bls_proxy_remote_file}" + elif [ -r "$bls_proxy_local_file" -a -f "$bls_proxy_local_file" ] ; then + echo "export X509_USER_PROXY=${bls_proxy_local_file}" fi # Add the command (with full path if not staged) @@ -663,7 +706,8 @@ function bls_start_job_wrapper () echo "\$new_home/`basename $bls_opt_the_command` $bls_arguments &" echo "fi" else - echo "$bls_opt_the_command $bls_arguments &" + echo "export NODE_COUNT=$bls_opt_mpinodes" + echo "$blah_job_wrapper $bls_opt_the_command $bls_arguments &" fi echo "job_pid=\$!" @@ -716,7 +760,11 @@ function bls_start_job_wrapper () function bls_finish_job_wrapper () { echo "cd \$old_home" - + if [ "x$bls_opt_proxy_string" != "x" ] + then + echo "rm -f $bls_opt_proxy_string" + fi + echo "" echo "exit \$user_retcode" @@ -728,8 +776,10 @@ function bls_finish_job_wrapper () fi } -function bls_test_working_dir () +function bls_test_input_files () { + # Verify the workdir can be accessed before submitting the job. If a bogus workdir is + # given, the job is hopeless if [ "x$bls_opt_workdir" != "x" ]; then cd $bls_opt_workdir elif [ "x$blah_set_default_workdir_to_home" == "xyes" ]; then @@ -742,13 +792,22 @@ function bls_test_working_dir () rm -f $bls_tmp_file exit 1 fi + + # Ensure local files actually exist. When called before job submission, this prevents + # unnecessary churn on the scheduler if the files don't exist. + if ! 
bls_fl_test_exists inputsand ; then + echo "Input sandbox file doesn't exist: $bls_fl_test_exists_result" >&2 + echo Error # for the sake of waiting fgets in blahpd + rm -f "$bls_tmp_file" + exit 1 + fi } function bls_add_job_wrapper () { + bls_test_input_files bls_start_job_wrapper >> $bls_tmp_file bls_finish_job_wrapper >> $bls_tmp_file - bls_test_working_dir } function bls_set_up_local_and_extra_args () @@ -774,38 +833,39 @@ function bls_set_up_local_and_extra_args () fi } -function bls_wrap_up_submit () -{ - - if [ -d "$blah_debug_save_submit_info" -a -n "$bls_tmp_name" ]; then - # Store files used for this job in a directory - bls_info_dir="$blah_debug_save_submit_info/$bls_tmp_name.debug" - mkdir "$bls_info_dir" - if [ $? -eq 0 ]; then - # Best effort. - if [ -r "$bls_proxy_local_file" ]; then - cp "$bls_proxy_local_file" "$bls_info_dir/submit.proxy" - fi - if [ -r "$bls_opt_stdout" ]; then - ln "$bls_opt_stdout" "$bls_info_dir/job.stdout" - if [ $? -ne 0 ]; then - # If we cannot hardlink, try a soft link. - ln -s "$bls_opt_stdout" "$bls_info_dir/job.stdout" - fi - fi - if [ -r "$bls_opt_stderr" ]; then - ln "$bls_opt_stderr" "$bls_info_dir/job.stderr" - if [ $? -ne 0 ]; then - # If we cannot hardlink, try a soft link. - ln -s "$bls_opt_stderr" "$bls_info_dir/job.stderr" +function bls_save_submit () { + if [ -d "$blah_debug_save_submit_info" -a -n "$bls_tmp_name" ]; then + # Store files used for this job in a directory + bls_info_dir="$blah_debug_save_submit_info/$bls_tmp_name.debug" + mkdir "$bls_info_dir" + if [ $? -eq 0 ]; then + # Best effort. + if [ -r "$bls_proxy_local_file" ]; then + cp "$bls_proxy_local_file" "$bls_info_dir/submit.proxy" + fi + if [ -r "$bls_opt_stdout" ]; then + ln "$bls_opt_stdout" "$bls_info_dir/job.stdout" + if [ $? -ne 0 ]; then + # If we cannot hardlink, try a soft link. + ln -s "$bls_opt_stdout" "$bls_info_dir/job.stdout" + fi + fi + if [ -r "$bls_opt_stderr" ]; then + ln "$bls_opt_stderr" "$bls_info_dir/job.stderr" + if [ $? 
-ne 0 ]; then + # If we cannot hardlink, try a soft link. + ln -s "$bls_opt_stderr" "$bls_info_dir/job.stderr" + fi + fi + if [ -r "$bls_tmp_file" ]; then + cp "$bls_tmp_file" "$bls_info_dir/submit.script" + fi fi - fi - if [ -r "$bls_tmp_file" ]; then - cp "$bls_tmp_file" "$bls_info_dir/submit.script" - fi - fi - fi + fi +} +function bls_wrap_up_submit () +{ bls_fl_clear inputsand bls_fl_clear outputsand bls_fl_clear inputcopy diff --git a/src/scripts/blah_load_config.sh b/src/scripts/blah_load_config.sh index c2b6cc67..1e340144 100755 --- a/src/scripts/blah_load_config.sh +++ b/src/scripts/blah_load_config.sh @@ -1,3 +1,4 @@ +#!/bin/bash # File: blah_load_config.sh # # Author: Francesco Prelz diff --git a/src/scripts/condor_status.sh b/src/scripts/condor_status.sh index e909c34a..94dbd779 100755 --- a/src/scripts/condor_status.sh +++ b/src/scripts/condor_status.sh @@ -252,15 +252,27 @@ for job in $* ; do fi fi + # Caching of condor_q output doesn't appear to work properly in + # HTCondor builds of the blahp. So do an explicit condor_q for + # this job before trying condor_history, which can take a long time. + line=$(echo $FORMAT | xargs $condor_binpath/condor_q $target $id) + if [ -n "$line" ] ; then + echo "0$(make_ad $job "$line")" + exit 0 + fi + ### WARNING: This is troubling because the remote history file ### might just happen to be in the same place as a local history ### file, in which case condor_history is going to be looking at ### the history of an unexpected queue. # We can possibly get the location of the history file and check it. + # NOTE: In Condor 7.7.6-7.8.1, the -f option to condor_history was + # broken. To work around that, we set HISTORY via the environment + # instead of using -f. history_file=$($condor_binpath/condor_config_val $target -schedd history) if [ "$?" 
== "0" ]; then - line=$(echo $FORMAT | xargs $condor_binpath/condor_history -f $history_file -backwards $id) + line=$(echo $FORMAT | _condor_HISTORY="$history_file" xargs $condor_binpath/condor_history -f $history_file -backwards -match 1 $id) if [ ! -z "$line" ] ; then echo "0$(make_ad $job "$line")" exit 0 diff --git a/src/scripts/condor_submit.sh b/src/scripts/condor_submit.sh index 8337fbb2..697286a1 100755 --- a/src/scripts/condor_submit.sh +++ b/src/scripts/condor_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: condor_submit.sh # Author: Giuseppe Fiorentino (giuseppe.fiorentino@mi.infn.it) @@ -47,13 +47,13 @@ original_args="$@" # script debug flag: currently unused debug=no -# number of MPI nodes: currently unused -mpinodes=0 +# number of MPI nodes: interpretted as a core count for vanilla universe +mpinodes=1 # Name of local requirements file: currently unused req_file="" -while getopts "a:i:o:de:j:n:v:V:c:w:x:u:q:r:s:T:I:O:R:C:" arg +while getopts "a:i:o:de:j:n:N:z:h:S:v:V:c:w:x:u:q:r:s:T:I:O:R:C:D:m:" arg do case "$arg" in a) xtra_args="$OPTARG" ;; @@ -66,6 +66,10 @@ do V) environment="$OPTARG";; c) command="$OPTARG" ;; n) mpinodes="$OPTARG" ;; + N) hostsmpsize="$OPTARG";; + z) wholenodes="$OPTARG";; + h) hostnumber="$OPTARG";; + S) smpgranularity="$OPTARG";; w) workdir="$OPTARG";; x) proxy_file="$OPTARG" ;; u) proxy_subject="$OPTARG" ;; @@ -77,6 +81,8 @@ do O) outputflstring="$OPTARG" ;; R) remaps="$OPTARG" ;; C) req_file="$OPTARG" ;; + D) run_dir="$OPTARG" ;; + m) req_mem="$OPTARG" ;; -) break ;; ?) echo $usage_string exit 1 ;; @@ -181,7 +187,7 @@ if [ ${#remap_files[@]} -gt 0 ] ; then if [ ! -z "${remap_files[0]}" ] ; then map=${remap_files[$i]} else - map=${output_files$i]} + map=${output_files[$i]} fi transfer_output_remaps="$transfer_output_remaps;${output_files[$i]}=$map" done @@ -195,19 +201,18 @@ submit_file_environment="#" if [ "x$environment" != "x" ] ; then # Input format is suitable for bourne shell style assignment. 
Convert to -# old condor format (no double quotes in submit file). -# FIXME: probably it's better to convert everything into the 'new' Condor -# environment format. +# new condor format to avoid errors when things like LS_COLORS (which +# has semicolons in it) get captured eval "env_array=($environment)" - submit_file_environment="" - for env_var in "${env_array[@]}"; do - if [ "x$submit_file_environment" == "x" ] ; then - submit_file_environment="environment = " - else - submit_file_environment="$submit_file_environment;" - fi - submit_file_environment="${submit_file_environment}${env_var}" - done + dq='"' + sq="'" + # escape single-quote and double-quote characters (by doubling them) + env_array=("${env_array[@]//$sq/$sq$sq}") + env_array=("${env_array[@]//$dq/$dq$dq}") + # map key=val -> key='val' + env_array=("${env_array[@]/=/=$sq}") + env_array=("${env_array[@]/%/$sq}") + submit_file_environment="environment = \"${env_array[*]}\"" else if [ "x$envir" != "x" ] ; then # Old Condor format (no double quotes in submit file) @@ -239,7 +244,13 @@ then echo -e $xtra_args >> $submit_file fi +if [ "x$req_mem" != "x" ] +then + echo "request_memory = $req_mem" >> $submit_file +fi + cat >> $submit_file << EOF +request_cpus = $mpinodes # We insist on new style quoting in Condor arguments = $arguments input = $stdin diff --git a/src/scripts/lsf_hold.sh b/src/scripts/lsf_hold.sh old mode 100644 new mode 100755 diff --git a/src/scripts/lsf_resume.sh b/src/scripts/lsf_resume.sh old mode 100644 new mode 100755 diff --git a/src/scripts/lsf_status.sh b/src/scripts/lsf_status.sh index d31db430..dada5844 100755 --- a/src/scripts/lsf_status.sh +++ b/src/scripts/lsf_status.sh @@ -225,11 +225,11 @@ END { if [ "$cliretcode" == "1" -o "x$lsf_BLParser" != "xyes" ] ; then result="" usedBLParser="no" - datefile=blahdate_$RANDOM$RANDOM$RANDOM + datefile=/tmp/blahdate_$RANDOM$RANDOM$RANDOM touch $datefile;chmod 600 $datefile if [ $? 
-ne 0 ]; then - echo 'Error creating temporary file' + echo '1ERROR: Could not create temporary file' datefile="" echo "1ERROR: Job not found" break @@ -319,15 +319,15 @@ $0 ~ rex_finished { } $0 ~ rex_uhold { - jobstatus = 5 + jobstatus = 7 } $0 ~ rex_phold { - jobstatus = 5 + jobstatus = 1 } $0 ~ rex_shold { - jobstatus = 5 + jobstatus = 7 } END { diff --git a/src/scripts/lsf_submit.sh b/src/scripts/lsf_submit.sh index 3c4c3dc8..fb95fb4b 100755 --- a/src/scripts/lsf_submit.sh +++ b/src/scripts/lsf_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: lsf_submit.sh # Author: David Rebatto (david.rebatto@mi.infn.it) @@ -51,6 +51,9 @@ conffile=$lsf_confpath/lsf.conf lsf_base_path=`cat $conffile|grep LSB_SHAREDIR| awk -F"=" '{ print $2 }'` +lsf_confdir=`cat $conffile|grep LSF_CONFDIR| awk -F"=" '{ print $2 }'` +[ -f ${lsf_confdir}/profile.lsf ] && . ${lsf_confdir}/profile.lsf + lsf_clustername=`${lsf_binpath}/lsid | grep 'My cluster name is'|awk -F" " '{ print $5 }'` logpath=$lsf_base_path/$lsf_clustername/logdir @@ -158,6 +161,7 @@ echo " cd \$CERN_STARTER_ORIGINAL_CWD" >> $bls_tmp_file echo "fi" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit # Let the wrap script be at least 1 second older than logfile # for subsequent "find -newer" command to work @@ -178,6 +182,9 @@ fi jobID=`echo "$bsub_out" | awk -F" " '{ print $2 }' | sed "s/>//" |sed "s/&1` retcode=$? + # If the job is already completed or no longer in the queue, + # treat it as successfully deleted. 
+ if echo "$cmdout" | grep -q 'Unknown Job' ; then + retcode=0 + elif echo "$cmdout" | grep -q 'Request invalid for state of job MSG=invalid state for job - COMPLETE' ; then + retcode=0 + fi if [ "$retcode" == "0" ] ; then if [ "$jnr" == "1" ]; then echo " 0 No\\ error" diff --git a/src/scripts/pbs_hold.sh b/src/scripts/pbs_hold.sh old mode 100644 new mode 100755 diff --git a/src/scripts/pbs_resume.sh b/src/scripts/pbs_resume.sh old mode 100644 new mode 100755 diff --git a/src/scripts/pbs_status.py b/src/scripts/pbs_status.py new file mode 100755 index 00000000..68cd54c8 --- /dev/null +++ b/src/scripts/pbs_status.py @@ -0,0 +1,594 @@ +#!/usr/bin/python + +# File: pbs_status.py +# +# Author: Brian Bockelman +# e-mail: bbockelm@cse.unl.edu +# +# +# Copyright (c) University of Nebraska-Lincoln. 2012 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Query PBS (or SLURM with the PBS emulation layer) for the status of a given job + +Internally, it creates a cache of the PBS qstat response and will reuse this +for subsequent queries. 
+""" + +import os +import re +import pwd +import sys +import time +import errno +import fcntl +import random +import struct +import subprocess +import signal +import tempfile +import traceback +import pickle +import csv + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import blah + +cache_timeout = 60 + +launchtime = time.time() + +def log(msg): + """ + A very lightweight log - not meant to be used in production, but helps + when debugging scale tests + """ + print >> sys.stderr, time.strftime("%x %X"), os.getpid(), msg + +def createCacheDir(): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + + try: + os.mkdir(cache_dir, 0755) + except OSError, oe: + if oe.errno != errno.EEXIST: + raise + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + + return cache_dir + +def initLog(): + """ + Determine whether to create a logfile based on the presence of a file + in the user's qstat cache directory. If so, make the logfile there. + """ + cache_dir = createCacheDir() + if os.path.exists(os.path.join(cache_dir, "pbs_status.debug")): + filename = os.path.join(cache_dir, "pbs_status.log") + else: + filename = "/dev/null" + fd = open(filename, "a") + # Do NOT close the file descriptor blahp originally hands us for stderr. + # This causes blahp to lose all status updates. + os.dup(2) + os.dup2(fd.fileno(), 2) + +# Something else from a prior life - see gratia-probe-common's GratiaWrapper.py +def ExclusiveLock(fd, timeout=120): + """ + Grabs an exclusive lock on fd + + If the lock is owned by another process, and that process is older than the + timeout, then the other process will be signaled. If the timeout is + negative, then the other process is never signaled. + + If we are unable to hold the lock, this call will not block on the lock; + rather, it will throw an exception. 
+ + By default, the timeout is 120 seconds. + """ + + # POSIX file locking is cruelly crude. There's nothing to do besides + # try / sleep to grab the lock, no equivalent of polling. + # Why hello, thundering herd. + + # An alternate would be to block on the lock, and use signals to interupt. + # This would mess up Gratia's flawed use of signals already, and not be + # able to report on who has the lock. I don't like indefinite waits! + max_time = 30 + starttime = time.time() + tries = 1 + while time.time() - starttime < max_time: + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + if check_lock(fd, timeout): + time.sleep(.2) # Fast case; however, we have *no clue* how + # long it takes to clean/release the old lock. + # Nor do we know if we'd get it if we did + # fcntl.lockf w/ blocking immediately. Blech. + # Check again immediately, especially if this was the last + # iteration in the for loop. + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + sleeptime = random.random() + log("Unable to acquire lock, try %i; will sleep for %.2f " \ + "seconds and try for %.2f more seconds." % (tries, sleeptime, max_time - (time.time()-starttime))) + tries += 1 + time.sleep(sleeptime) + + log("Fatal exception - Unable to acquire lock") + raise Exception("Unable to acquire lock") + +def check_lock(fd, timeout): + """ + For internal use only. + + Given a fd that is locked, determine which process has the lock. + Kill said process if it is older than "timeout" seconds. + This will log the PID of the "other process". + """ + + pid = get_lock_pid(fd) + if pid == os.getpid(): + return True + + if timeout < 0: + log("Another process, %d, holds the cache lock." 
% pid) + return False + + try: + age = get_pid_age(pid) + except: + log("Another process, %d, holds the cache lock." % pid) + log("Unable to get the other process's age; will not time it out.") + return False + + log("Another process, %d (age %d seconds), holds the cache lock." % (pid, age)) + + if age > timeout: + os.kill(pid, signal.SIGKILL) + else: + return False + + return True + +linux_struct_flock = "hhxxxxqqixxxx" +try: + os.O_LARGEFILE +except AttributeError: + start_len = "hhlli" + +def get_lock_pid(fd): + # For reference, here's the definition of struct flock on Linux + # (/usr/include/bits/fcntl.h). + # + # struct flock + # { + # short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ + # short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ + # __off_t l_start; /* Offset where the lock begins. */ + # __off_t l_len; /* Size of the locked area; zero means until EOF. */ + # __pid_t l_pid; /* Process holding the lock. */ + # }; + # + # Note that things are different on Darwin + # Assuming off_t is unsigned long long, pid_t is int + try: + if sys.platform == "darwin": + arg = struct.pack("QQihh", 0, 0, 0, fcntl.F_WRLCK, 0) + else: + arg = struct.pack(linux_struct_flock, fcntl.F_WRLCK, 0, 0, 0, 0) + result = fcntl.fcntl(fd, fcntl.F_GETLK, arg) + except IOError, ie: + if ie.errno != errno.EINVAL: + raise + log("Unable to determine which PID has the lock due to a " \ + "python portability failure. Contact the developers with your" \ + " platform information for support.") + return False + if sys.platform == "darwin": + _, _, pid, _, _ = struct.unpack("QQihh", result) + else: + _, _, _, _, pid = struct.unpack(linux_struct_flock, result) + return pid + +def get_pid_age(pid): + now = time.time() + st = os.stat("/proc/%d" % pid) + return now - st.st_ctime + +def qstat(jobid=""): + """ + Call qstat directly for a jobid. + If none is specified, query all jobid's. + + Returns a python dictionary with the job info. 
+ """ + qstat_bin = get_qstat_location() + + starttime = time.time() + log("Starting qstat.") + command = (qstat_bin, '-f') + pbs_pro = config.get('pbs_pro').lower() == 'yes' + if not pbs_pro: + command += ('-1',) # -1 conflicts with -f in PBS Pro + if jobid: + command += ('-x', jobid) if pbs_pro else (jobid,) + qstat_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + qstat_out, _ = qstat_proc.communicate() + result = parse_qstat(qstat_out) + log("Finished qstat (time=%f)." % (time.time()-starttime)) + + if qstat_proc.returncode in [35, 153]: # Completed or no longer in queue (presumably completed successfully) + result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} + elif qstat_proc.returncode == 271: # Removed + result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} + elif qstat_proc.returncode != 0: + raise Exception("qstat failed with exit code %s" % str(qstat_proc.returncode)) + + # If the job has completed... + if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'): + # Get the finished job stats and update the result + finished_job_stats = get_finished_job_stats(jobid) + result[jobid].update(finished_job_stats) + + return result + + +def which(program): + """ + Determine if the program is in the path. 
+ + arg program: name of the program to search + returns: full path to executable, or None if executable is not found + """ + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + +def convert_cpu_to_seconds(cpu_string): + hrs, mins, secs = re.split(':', cpu_string) + return int(hrs) * 3600 + int(mins) * 60 + int(secs) + +_cluster_type_cache = None +def get_finished_job_stats(jobid): + """ + Get a completed job's statistics such as used RAM and cpu usage. + """ + + # List the attributes that we want + return_dict = { "ImageSize": 0, "ExitCode": 0, "RemoteUserCpu": 0 } + # First, determine if this is a pbs or slurm machine. + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + cluster_type_file = os.path.join(cache_dir, "cluster_type") + global _cluster_type_cache + if not _cluster_type_cache: + # Look for the special file, cluster_type + if os.path.exists(cluster_type_file): + _cluster_type_cache = open(cluster_type_file).read() + else: + # No idea what type of cluster is running, not set, so give up + log("cluster_type file is not present, not checking for completed job statistics") + return return_dict + + # Slurm completion + if _cluster_type_cache == "slurm": + + # Next, query the appropriate interfaces for the completed job information + # TODO: fix for pbs + log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + child_stdout = os.popen("sacct -j %s -l --noconvert -P" % (str(jobid))) + sacct_data = child_stdout.readlines() + ret = child_stdout.close() + + if ret: + # retry without --noconvert for slurm < 15.8 + child_stdout = os.popen("sacct -j %s -l -P" % (str(jobid))) + 
sacct_data = child_stdout.readlines() + child_stdout.close() + + try: + reader = csv.DictReader(sacct_data, delimiter="|") + except Exception, e: + log("Unable to read in CSV output from sacct: %s" % str(e)) + return return_dict + + # Slurm can return more than 1 row, for some odd reason. + # so sum up relevant values + for row in reader: + if row["AveCPU"] is not "": + try: + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + except: + log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"])) + raise + if row["MaxRSS"] is not "": + # Remove the trailing [KMGTP] and scale the value appropriately + # Note: We assume that all values will have a suffix, and we + # want the value in kilos. + try: + value = row["MaxRSS"] + factor = 1 + if value[-1] == 'M': + factor = 1024 + elif value[-1] == 'G': + factor = 1024 * 1024 + elif value[-1] == 'T': + factor = 1024 * 1024 * 1024 + elif value[-1] == 'P': + factor = 1024 * 1024 * 1024 * 1024 + return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor + except: + log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"])) + raise + if row["ExitCode"] is not "": + try: + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + except: + log("Failed to parse ExitCode for job id %s: %s" % (jobid, row["ExitCode"])) + raise + + # PBS completion + elif _cluster_type_cache == "pbs": + pass + + return return_dict + + +_qstat_location_cache = None +def get_qstat_location(): + """ + Locate the copy of qstat the blahp configuration wants to use. 
+ """ + global _qstat_location_cache + if _qstat_location_cache != None: + return _qstat_location_cache + + cmd = 'echo "%s/%s"' % (config.get('pbs_binpath'), 'qstat') + + child_stdout = os.popen(cmd) + output = child_stdout.read().split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine qstat location: %s" % output) + + _qstat_location_cache = output + return output + +job_id_re = re.compile("\s*Job Id:\s([0-9]+)([\w\-\/.]*)") +exec_host_re = re.compile("\s*exec_host = ([\w\-\/.]+)") +status_re = re.compile("\s*job_state = ([QREFCH])") +exit_status_re = re.compile("\s*[Ee]xit_status = (-?[0-9]+)") +status_mapping = {"Q": 1, "R": 2, "E": 2, "F": 4, "C": 4, "H": 5} + +def parse_qstat(output): + """ + Parse the stdout of "qstat -f" into a python dictionary containing + the information we need. + """ + job_info = {} + cur_job_id = None + cur_job_info = {} + for line in output.split('\n'): + line = line.strip() + m = job_id_re.match(line) + if m: + if cur_job_id: + job_info[cur_job_id] = cur_job_info + cur_job_id = m.group(1) + #print cur_job_id, line + cur_job_info = {"BatchJobId": '"%s"' % cur_job_id.split(".")[0]} + continue + if cur_job_id == None: + continue + m = exec_host_re.match(line) + if m: + cur_job_info["WorkerNode"] = '"' + m.group(1).split("/")[0] + '"' + continue + m = status_re.match(line) + if m: + status = status_mapping.get(m.group(1), 0) + if status != 0: + cur_job_info["JobStatus"] = str(status) + continue + m = exit_status_re.match(line) + if m: + cur_job_info["ExitCode"] = ' %s' % m.group(1) + continue + if cur_job_id: + job_info[cur_job_id] = cur_job_info + return job_info + +def job_dict_to_string(info): + result = ["%s=%s;" % (i[0], i[1]) for i in info.items()] + return "[" + " ".join(result) + " ]" + +def fill_cache(cache_location): + log("Starting query to fill cache.") + results = qstat() + log("Finished query to fill cache.") + (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + # Open the file with a 
proper python file object + f = os.fdopen(fd, "w") + writer = csv.writer(f, delimiter='\t') + try: + try: + for key, val in results.items(): + key = key.split(".")[0] + writer.writerow([key, pickle.dumps(val)]) + os.fsync(fd) + except: + os.unlink(filename) + raise + finally: + f.close() + os.rename(filename, cache_location) + + # Create the cluster_type file + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + cluster_type_file = os.path.join(cache_dir, "cluster_type") + (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + global _cluster_type_cache + if which("sacct"): + os.write(fd, "slurm") + _cluster_type_cache = "slurm" + else: + log("Unable to find cluster type") + os.close(fd) + os.rename(filename, cluster_type_file) + + global launchtime + launchtime = time.time() + +cache_line_re = re.compile("([0-9]+[\.\w\-]+):\s+(.+)") +def cache_to_status(jobid, fd): + reader = csv.reader(fd, delimiter='\t') + for row in reader: + if row[0] == jobid: + return pickle.loads(row[1]) + +def check_cache(jobid, recurse=True): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "qstat_cache_%s" % username) + if recurse: + try: + s = os.stat(cache_dir) + except OSError, oe: + if oe.errno != 2: + raise + os.mkdir(cache_dir, 0755) + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + cache_location = os.path.join(cache_dir, "blahp_results_cache") + try: + fd = open(cache_location, "a+") + except IOError, ie: + if ie.errno != 2: + raise + # Create an empty file so we can hold the file lock + fd = open(cache_location, "w+") + ExclusiveLock(fd) + # If someone grabbed the lock between when we opened and tried to + # acquire, they may have filled the cache + if os.stat(cache_location).st_size == 0: + fill_cache(cache_location) + fd.close() + if recurse: + return check_cache(jobid, 
recurse=False) + else: + return None + ExclusiveLock(fd) + s = os.fstat(fd.fileno()) + if s.st_uid != uid: + raise Exception("Unable to check cache file because it is owned by UID %d" % s.st_uid) + if (s.st_size == 0) or (launchtime - s.st_mtime > cache_timeout): + # If someone filled the cache between when we opened the file and + # grabbed the lock, we may not need to fill the cache. + s2 = os.stat(cache_location) + if (s2.st_size == 0) or (launchtime - s2.st_mtime > cache_timeout): + fill_cache(cache_location) + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + return cache_to_status(jobid, fd) + +job_status_re = re.compile(".*JobStatus=(\d+);.*") + +def main(): + initLog() + + # Accept the optional -w argument, but ignore it + if len(sys.argv) == 2: + jobid_arg = sys.argv[1] + elif len(sys.argv) == 3 and sys.argv[1] == "-w": + jobid_arg = sys.argv[2] + else: + print "1Usage: pbs_status.sh pbs//" + return 1 + jobid = jobid_arg.split("/")[-1].split(".")[0] + + global config + config = blah.BlahConfigParser(defaults={'pbs_pro': 'no', + 'pbs_binpath': '/usr/bin'}) + + log("Checking cache for jobid %s" % jobid) + cache_contents = None + try: + cache_contents = check_cache(jobid) + except Exception, e: + msg = "1ERROR: Internal exception, %s" % str(e) + log(msg) + #print msg + if not cache_contents: + log("Jobid %s not in cache; querying PBS" % jobid) + results = qstat(jobid) + log("Finished querying PBS for jobid %s" % jobid) + if not results or jobid not in results: + log("1ERROR: Unable to find job %s" % jobid) + print "1ERROR: Unable to find job %s" % jobid + else: + log("0%s" % job_dict_to_string(results[jobid])) + print "0%s" % job_dict_to_string(results[jobid]) + else: + log("Jobid %s in cache." 
% jobid) + log("0%s" % job_dict_to_string(cache_contents)) + + if cache_contents["JobStatus"] == '4' or cache_contents["JobStatus"] == '3': + finished_job_stats = get_finished_job_stats(jobid) + cache_contents.update(finished_job_stats) + + print "0%s" % job_dict_to_string(cache_contents) + return 0 + +if __name__ == "__main__": + try: + sys.exit(main()) + except SystemExit: + raise + except Exception, e: + log(traceback.format_exc()) + print "1ERROR: %s" % str(e).replace("\n", "\\n") + sys.exit(0) diff --git a/src/scripts/pbs_status.sh b/src/scripts/pbs_status.sh index 01629c97..9c850f52 100755 --- a/src/scripts/pbs_status.sh +++ b/src/scripts/pbs_status.sh @@ -33,6 +33,10 @@ . `dirname $0`/blah_load_config.sh +if [ -x ${blah_libexec_directory}/pbs_status.py ] ; then + exec ${blah_libexec_directory}/pbs_status.py "$@" +fi + if [ "x$job_registry" != "x" ] ; then ${blah_sbin_directory}/blah_job_registry_lkup $@ exit 0 @@ -82,6 +86,9 @@ BEGIN { /Job Id:/ { current_job = substr($0, index($0, ":") + 2) + end = index(current_job, ".") + if ( end == 0 ) { end = length(current_job) + 1 } + current_job = substr(current_job, 1, end) } /exec_host =/ { current_wn = substr($0, index($0, "=")+2) @@ -169,16 +176,21 @@ for reqfull in $pars ; do staterr=/tmp/${reqjob}_staterr -result=`${pbs_binpath}/qstat -f $reqjob 2>$staterr | awk -v jobId=$reqjob ' +result=`${pbs_binpath}/qstat -f $reqjob 2>$staterr` +qstat_exit_code=$? 
+result=`echo "$result" | awk -v jobId=$reqjob ' BEGIN { current_job = "" current_wn = "" current_js = "" + exitcode = "-1" } /Job Id:/ { current_job = substr($0, index($0, ":") + 2) - current_job = substr(current_job, 1, index(current_job, ".")-1) + end = index(current_job, ".") + if ( end == 0 ) { end = length(current_job) + 1 } + current_job = substr(current_job, 1, end) print "[BatchJobId=\"" current_job "\";" } /exec_host =/ { @@ -196,6 +208,9 @@ BEGIN { END { if (current_js ~ "Q") {jobstatus = 1} + if (current_js ~ "W") {jobstatus = 1} + if (current_js ~ "S") {jobstatus = 1} + if (current_js ~ "T") {jobstatus = 1} if (current_js ~ "R") {jobstatus = 2} if (current_js ~ "E") {jobstatus = 2} if (current_js ~ "C") {jobstatus = 4} @@ -207,6 +222,9 @@ END { } print "JobStatus=" jobstatus ";" if (jobstatus == 4) { + if (exitcode == "-1") { + exitcode = "0" + } print "ExitCode=" exitcode ";" } print "]" @@ -223,6 +241,11 @@ END { if [ -z "$errout" ] ; then echo "0"$result retcode=0 + elif [ "$qstat_exit_code" -eq "153" ] ; then + # If the job has disappeared, assume it's completed + # (same as globus) + echo "0[BatchJobId=\"$reqjob\";JobStatus=4;ExitCode=0]" + retcode=0 else echo "1ERROR: Job not found" retcode=1 diff --git a/src/scripts/pbs_submit.sh b/src/scripts/pbs_submit.sh index 40d2eb60..b26d18ae 100755 --- a/src/scripts/pbs_submit.sh +++ b/src/scripts/pbs_submit.sh @@ -1,4 +1,4 @@ -#!/bin/bash -l +#!/bin/bash # # File: pbs_submit.sh # Author: David Rebatto (david.rebatto@mi.infn.it) @@ -46,8 +46,14 @@ logpath=${pbs_spoolpath}/server_logs if [ ! -d $logpath -o ! 
-x $logpath ]; then - pbs_spoolpath=`${pbs_binpath}/tracejob | grep 'default prefix path'|awk -F" " '{ print $5 }'` - logpath=${pbs_spoolpath}/server_logs + if [ -x "${pbs_binpath}/tracejob" ]; then + pbs_spoolpath=`${pbs_binpath}/tracejob | grep 'default prefix path'|awk -F" " '{ print $5 }'` + logpath=${pbs_spoolpath}/server_logs + else + # EPEL defaults for torque + pbs_spoolpath=/var/lib/torque/spool + logpath=/var/lib/torque/server_logs + fi fi bls_job_id_for_renewal=PBS_JOBID @@ -111,6 +117,29 @@ fi #local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/pbs_local_submit_attributes.sh +# Begin building the select statement: select=x where x is the number of 'chunks' +# to request. Chunk requests should precede any resource requests (resource +# requests are order independent). An example from the PBS Pro manual: +# #PBS -l select=2:ncpus=8:mpiprocs=8:mem=6gb:interconnect=10g,walltime=16:00:00 +# Only one chunk is required for OSG needs at this time. 
+pbs_select="#PBS -l select=1" + +if [ "x$bls_opt_req_mem" != "x" ]; then + # Max amount of virtual memory allocated to a single process + if [[ "x$pbs_set_pvmem" == "xyes" ]]; then + echo "#PBS -l pvmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi + # Max amount of physical memory allocated to a single process + if [[ "$bls_opt_smpgranularity" == 1 ]]; then + echo "#PBS -l pmem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi + # Total amount of memory allocated to the job + pbs_select="$pbs_select:mem=${bls_opt_req_mem}mb" + if [ "x$pbs_pro" != "xyes" ]; then + echo "#PBS -l mem=${bls_opt_req_mem}mb" >> $bls_tmp_file + fi +fi + bls_set_up_local_and_extra_args # Write PBS directives according to command line options @@ -118,48 +147,50 @@ bls_set_up_local_and_extra_args [ -z "$bls_opt_queue" ] || grep -q "^#PBS -q" $bls_tmp_file || echo "#PBS -q $bls_opt_queue" >> $bls_tmp_file # Extended support for MPI attributes -if [ "x$bls_opt_wholenodes" == "xyes" ] ; then - bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} - if [[ ! -z "$bls_opt_smpgranularity" ]] ; then - if [[ -z "$bls_opt_hostnumber" ]] ; then - echo "#PBS -l nodes=1:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file - else - echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file - fi - echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file - else - if [[ ! -z "$bls_opt_hostnumber" ]] ; then - if [[ $bls_opt_mpinodes -gt 0 ]] ; then - r=$((bls_opt_mpinodes % bls_opt_hostnumber)) - (( r )) && mpireminder="+$r:ppn=$bls_opt_hostsmpsize" - echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=${bls_opt_hostsmpsize}${mpireminder}" >> $bls_tmp_file - else - echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file - fi - echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file - fi - fi +if [ "x$pbs_pro" == "xyes" ]; then + pbs_select="$pbs_select:ncpus=$bls_opt_smpgranularity" else - if [[ ! 
-z "$bls_opt_smpgranularity" ]] ; then - n=$((bls_opt_mpinodes / bls_opt_smpgranularity)) - r=$((bls_opt_mpinodes % bls_opt_smpgranularity)) - (( r )) && mpireminder="+1:ppn=$r" - echo "#PBS -l nodes=$n:ppn=${bls_opt_smpgranularity}${mpireminder}" >> $bls_tmp_file - else - if [[ ! -z "$bls_opt_hostnumber" ]] ; then - n=$((bls_opt_mpinodes / bls_opt_hostnumber)) - r=$((bls_opt_mpinodes % bls_opt_hostnumber)) - (( r )) && mpireminder="+$r:ppn=$((n+1))" - echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=$n$mpireminder" >> $bls_tmp_file - elif [[ $bls_opt_mpinodes -gt 0 ]] ; then - echo "#PBS -l nodes=$bls_opt_mpinodes" >> $bls_tmp_file + if [ "x$bls_opt_wholenodes" == "xyes" ]; then + bls_opt_hostsmpsize=${bls_opt_hostsmpsize:-1} + if [[ ! -z "$bls_opt_smpgranularity" ]] ; then + if [[ -z "$bls_opt_hostnumber" ]] ; then + echo "#PBS -l nodes=1:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + else + echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + fi + echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file + else + if [[ ! -z "$bls_opt_hostnumber" ]] ; then + if [[ $bls_opt_mpinodes -gt 0 ]] ; then + r=$((bls_opt_mpinodes % bls_opt_hostnumber)) + (( r )) && mpireminder="+$r:ppn=$bls_opt_hostsmpsize" + echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=${bls_opt_hostsmpsize}${mpireminder}" >> $bls_tmp_file + else + echo "#PBS -l nodes=$bls_opt_hostnumber:ppn=$bls_opt_hostsmpsize" >> $bls_tmp_file + fi + echo "#PBS -W x=NACCESSPOLICY:SINGLEJOB" >> $bls_tmp_file + fi + fi + else + if [[ ! -z "$bls_opt_smpgranularity" ]] ; then + n=$((bls_opt_mpinodes / bls_opt_smpgranularity)) + r=$((bls_opt_mpinodes % bls_opt_smpgranularity)) + (( r )) && mpireminder="+1:ppn=$r" + echo "#PBS -l nodes=$n:ppn=${bls_opt_smpgranularity}${mpireminder}" >> $bls_tmp_file + else + if [[ ! 
-z "$bls_opt_hostnumber" ]] ; then + n=$((bls_opt_mpinodes / bls_opt_hostnumber)) + r=$((bls_opt_mpinodes % bls_opt_hostnumber)) + (( r )) && mpireminder="+$r:ppn=$((n+1))" + echo "#PBS -l nodes=$((bls_opt_hostnumber-r)):ppn=$n$mpireminder" >> $bls_tmp_file + elif [[ $bls_opt_mpinodes -gt 0 ]] ; then + echo "#PBS -l nodes=$bls_opt_mpinodes" >> $bls_tmp_file + fi + fi fi - fi fi # --- End of MPI directives - - # Input and output sandbox setup. if [ "x$blah_torque_multiple_staging_directive_bug" == "xyes" ]; then bls_fl_subst_and_accumulate inputsand "stagein=@@F_REMOTE@`hostname -f`:@@F_LOCAL" "," @@ -176,9 +207,14 @@ else [ -z "$bls_fl_subst_and_accumulate_result" ] || echo "#PBS -W stageout=\\'$bls_fl_subst_and_accumulate_result\\'" >> $bls_tmp_file fi +if [ "x$pbs_pro" == "xyes" ]; then + echo $pbs_select >> $bls_tmp_file +fi + echo "#PBS -m n" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit # Let the wrap script be at least 1 second older than logfile # for subsequent "find -newer" command to work @@ -194,6 +230,17 @@ jobID=`${pbs_binpath}/qsub $bls_tmp_file` # actual submission retcode=$? if [ "$retcode" != "0" ] ; then rm -f $bls_tmp_file + # Echo the output from qsub onto stderr, which is captured by HTCondor + echo "Error from qsub: $jobID" >&2 + exit 1 +fi + +# The job id is actually the first numbers in the string (slurm support) +jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` +if [ "X$jobID" == "X" ]; then + rm -f $bls_tmp_file + echo "Error from qsub: $jobID" >&2 + echo Error # for the sake of waiting fgets in blahpd exit 1 fi diff --git a/src/scripts/sge_cancel.sh b/src/scripts/sge_cancel.sh index c631f6c1..821d4b61 100755 --- a/src/scripts/sge_cancel.sh +++ b/src/scripts/sge_cancel.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. 
`dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_helper b/src/scripts/sge_helper index 725f1347..55d83d26 100755 --- a/src/scripts/sge_helper +++ b/src/scripts/sge_helper @@ -203,6 +203,7 @@ foreach my $i ( 0 .. $#results ) { print "[ ", map( "$_ = $results[$i]->{$_}; ", keys %{$results[$i]} ), "]\n"; } else { print "Error\n"; + exit ( 1 ); } } elsif ( $jobstatus ) { printf( "%s %d %d %s %s OK\n", diff --git a/src/scripts/sge_hold.sh b/src/scripts/sge_hold.sh index 677e11fb..67ee17df 100755 --- a/src/scripts/sge_hold.sh +++ b/src/scripts/sge_hold.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. `dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_resume.sh b/src/scripts/sge_resume.sh index cfcda85c..525dab3c 100755 --- a/src/scripts/sge_resume.sh +++ b/src/scripts/sge_resume.sh @@ -20,7 +20,7 @@ # -[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config +. `dirname $0`/blah_load_config.sh if [ -z "$sge_rootpath" ]; then sge_rootpath="/usr/local/sge/pro"; fi if [ -r "$sge_rootpath/${sge_cellname:-default}/common/settings.sh" ] diff --git a/src/scripts/sge_status.sh b/src/scripts/sge_status.sh index 551069fb..3633dc49 100755 --- a/src/scripts/sge_status.sh +++ b/src/scripts/sge_status.sh @@ -20,9 +20,10 @@ # -#[ -f ${GLITE_LOCATION:-/opt/glite}/etc/blah.config ] && . ${GLITE_LOCATION:-/opt/glite}/etc/blah.config . 
`dirname $0`/blah_load_config.sh +sge_helper_path=${blah_libexec_directory} + usage_string="Usage: $0 [-w] [-n]" #get worker node info @@ -63,11 +64,43 @@ fi tmpid=`echo "$@"|sed 's/.*\/.*\///g'` # ASG Keith way -jobid=${tmpid}.default +jobid=${tmpid}.${sge_cellname:-default} -blahp_status=`exec ${sge_helper_path:-/opt/glite/bin}/sge_helper --status $getwn $jobid` +blahp_status=`exec ${sge_helper_path}/sge_helper --status $getwn $jobid` retcode=$? +# Now see if we need to run qstat 'manually' +if [ $retcode -ne 0 ]; then + + qstat_out=`qstat` + + # First, find the column with the State information: + state_col=`echo "$qstat_out" | head -n 1 | awk '{ for (i = 1; i<=NF; i++) if ($i == "state") print i;}'` + job_state=`echo "$qstat_out" | awk -v "STATE_COL=$state_col" -v "JOBID=$tmpid" '{ if ($1 == JOBID) print $STATE_COL; }'` + + if [[ "$job_state" =~ q ]]; then + jobstatus=1 + elif [[ "$job_state" =~ [rt] ]]; then + jobstatus=2 + elif [[ "$job_state" =~ h ]]; then + jobstatus=5 + elif [[ "$job_state" =~ E ]]; then + jobstatus=4 + elif [[ "$job_state" =~ d ]]; then + jobstatus=3 + elif [ "x$job_state" == "x" ]; then + jobstatus=4 + fi + + if [ $jobstatus -eq 4 ]; then + blahp_status="[BatchJobId=\"$tmpid\";JobStatus=$jobstatus;ExitCode=0]" + else + blahp_status="[BatchJobId=\"$tmpid\";JobStatus=$jobstatus]" + fi + retcode=0 + +fi + echo ${retcode}${blahp_status} #exit $retcode diff --git a/src/scripts/sge_submit.sh b/src/scripts/sge_submit.sh index 5e05ed48..d04d719f 100755 --- a/src/scripts/sge_submit.sh +++ b/src/scripts/sge_submit.sh @@ -64,7 +64,7 @@ cat > $bls_tmp_file << end_of_preamble end_of_preamble #local batch system-specific file output must be added to the submit file -local_submit_attributes_file=${GLITE_LOCATION:-/opt/glite}/bin/sge_local_submit_attributes.sh +local_submit_attributes_file=${blah_libexec_directory}/sge_local_submit_attributes.sh if [ -r $local_submit_attributes_file ] ; then echo \#\!/bin/sh > $bls_opt_tmp_req_file if [ ! 
-z $bls_opt_req_file ] ; then @@ -83,7 +83,8 @@ fi # Write SGE directives according to command line options # handle queue overriding [ -z "$bls_opt_queue" ] || grep -q "^#\$ -q" $bls_tmp_file || echo "#\$ -q $bls_opt_queue" >> $bls_tmp_file -[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q"^#\$ -pe *\\*" $bls_tmp_file || echo "#\$ -pe * $bls_opt_mpinodes" >>$bls_tmp_file +[ -z "$bls_opt_mpinodes" -o "x${bls_opt_mpinodes}" = "x1" ] || grep -q "^#\$ -pe *\\*" $bls_tmp_file \ + || echo "#\$ -pe $sge_pe_policy $bls_opt_mpinodes" >>$bls_tmp_file # Input and output sandbox setup. bls_fl_subst_and_accumulate inputsand "@@F_REMOTE@`hostname -f`:@@F_LOCAL" "@@@" @@ -93,6 +94,7 @@ bls_fl_subst_and_accumulate outputsand "@@F_REMOTE@`hostname -f`:@@F_LOCAL" "@@@ echo "#$ -m n" >> $bls_tmp_file bls_add_job_wrapper +bls_save_submit ############################################################### # Submit the script diff --git a/src/scripts/slurm_cancel.sh b/src/scripts/slurm_cancel.sh index ea67876a..b80245e4 100755 --- a/src/scripts/slurm_cancel.sh +++ b/src/scripts/slurm_cancel.sh @@ -1,15 +1,13 @@ #!/bin/bash -# File: slurm_cancel.sh -# -# Author: David Rebatto -# e-mail: David.Rebatto@mi.infn.it -# -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. +# File: slurm_cancel.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) # +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -26,15 +24,24 @@ . 
`dirname $0`/blah_load_config.sh +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + jnr=0 jc=0 for job in $@ ; do jnr=$(($jnr+1)) done -for job in $@ ; do +for job in $@ ; do requested=`echo $job | sed 's/^.*\///'` cmdout=`${slurm_binpath}/scancel $requested 2>&1` retcode=$? + # If the job is already completed or no longer in the queue, + # treat it as successfully deleted. + if echo "$cmdout" | grep -q 'Invalid job id specified' ; then + retcode=0 + fi if [ "$retcode" == "0" ] ; then if [ "$jnr" == "1" ]; then echo " 0 No\\ error" diff --git a/src/scripts/slurm_hold.sh b/src/scripts/slurm_hold.sh new file mode 100755 index 00000000..67f41f6f --- /dev/null +++ b/src/scripts/slurm_hold.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# File: slurm_hold.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +requested=`echo $1 | sed 's/^.*\///'` + +cmdout=`${slurm_binpath}/scontrol hold $requested 2>&1` +retcode=$? +if echo "$cmdout" | grep -q 'Job is no longer pending execution' ; then + cmdout=`${slurm_binpath}/scontrol requeuehold $requested 2>&1` + retcode=$? 
+fi + +if [ "$retcode" == "0" ] ; then + echo " 0 No\\ error" + exit 0 +else + echo " 1 Error" + exit 1 +fi diff --git a/src/scripts/slurm_local_submit_attributes.sh b/src/scripts/slurm_local_submit_attributes.sh new file mode 100644 index 00000000..971bcf6d --- /dev/null +++ b/src/scripts/slurm_local_submit_attributes.sh @@ -0,0 +1,28 @@ +#/bin/sh + +# This file is sourced by blahp before submitting the job to slurm +# Anything printed to stdout is included in the submit file. +# For example, to set a default walltime of 24 hours in PBS, you +# could uncomment this line: + +# echo "#SBATCH --time=24:00:00" + +# blahp allows arbitrary attributes to be passed to this script on a per-job +# basis. If you add the following to your HTCondor-G submit file: + +#+remote_cerequirements = NumJobs == 100 && foo = 5 + +# Then an environment variable, NumJobs, will be exported prior to calling this +# script and set to a value of 100. The variable foo will be set to 5. + +# You could allow users to set the walltime for the job with the following +# customization (slurm syntax given; adjust for the appropriate batch system): + +# Uncomment the else block to default to 24 hours of runtime; otherwise, the queue +# default is used. +if [ -n "$Walltime" ]; then + let Walltime=Walltime/60 + echo "#SBATCH --time=$Walltime" +# else +# echo "#SBATCH --time=24:00:00" +fi diff --git a/src/scripts/slurm_resume.sh b/src/scripts/slurm_resume.sh new file mode 100755 index 00000000..188b22cf --- /dev/null +++ b/src/scripts/slurm_resume.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# File: slurm_resume.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) +# +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. `dirname $0`/blah_load_config.sh + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin +fi + +requested=`echo $1 | sed 's/^.*\///'` +${slurm_binpath}/scontrol release $requested >&/dev/null + +if [ "$?" == "0" ]; then + echo " 0 No\\ error" + exit 0 +else + echo " 1 Error" + exit 1 +fi + diff --git a/src/scripts/slurm_status.py b/src/scripts/slurm_status.py new file mode 100644 index 00000000..b6d9a6e7 --- /dev/null +++ b/src/scripts/slurm_status.py @@ -0,0 +1,561 @@ +#!/usr/bin/python + +# File: slurm_status.py +# +# Author: Brian Bockelman (bbockelm@cse.unl.edu) +# Jaime Frey (jfrey@cs.wisc.edu) +# +# Copyright (c) University of Nebraska-Lincoln. 2012 +# University of Wisconsin-Madison. 2016 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Query SLURM for the status of a given job + +Internally, it creates a cache of the SLURM response for all jobs and +will reuse this for subsequent queries. 
+""" + +import os +import re +import pwd +import sys +import time +import errno +import fcntl +import random +import struct +import subprocess +import signal +import tempfile +import traceback +import pickle +import csv + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import blah + +cache_timeout = 60 + +launchtime = time.time() + +def log(msg): + """ + A very lightweight log - not meant to be used in production, but helps + when debugging scale tests + """ + print >> sys.stderr, time.strftime("%x %X"), os.getpid(), msg + +def createCacheDir(): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "slurm_cache_%s" % username) + + try: + os.mkdir(cache_dir, 0755) + except OSError, oe: + if oe.errno != errno.EEXIST: + raise + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + + return cache_dir + +def initLog(): + """ + Determine whether to create a logfile based on the presence of a file + in the user's slurm cache directory. If so, make the logfile there. + """ + cache_dir = createCacheDir() + if os.path.exists(os.path.join(cache_dir, "slurm_status.debug")): + filename = os.path.join(cache_dir, "slurm_status.log") + else: + filename = "/dev/null" + fd = open(filename, "a") + # Do NOT close the file descriptor blahp originally hands us for stderr. + # This causes blahp to lose all status updates. + os.dup(2) + os.dup2(fd.fileno(), 2) + +# Something else from a prior life - see gratia-probe-common's GratiaWrapper.py +def ExclusiveLock(fd, timeout=120): + """ + Grabs an exclusive lock on fd + + If the lock is owned by another process, and that process is older than the + timeout, then the other process will be signaled. If the timeout is + negative, then the other process is never signaled. + + If we are unable to hold the lock, this call will not block on the lock; + rather, it will throw an exception. 
+ + By default, the timeout is 120 seconds. + """ + + # POSIX file locking is cruelly crude. There's nothing to do besides + # try / sleep to grab the lock, no equivalent of polling. + # Why hello, thundering herd. + + # An alternate would be to block on the lock, and use signals to interupt. + # This would mess up Gratia's flawed use of signals already, and not be + # able to report on who has the lock. I don't like indefinite waits! + max_time = 30 + starttime = time.time() + tries = 1 + while time.time() - starttime < max_time: + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + if check_lock(fd, timeout): + time.sleep(.2) # Fast case; however, we have *no clue* how + # long it takes to clean/release the old lock. + # Nor do we know if we'd get it if we did + # fcntl.lockf w/ blocking immediately. Blech. + # Check again immediately, especially if this was the last + # iteration in the for loop. + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return + except IOError, ie: + if not ((ie.errno == errno.EACCES) or (ie.errno == errno.EAGAIN)): + raise + sleeptime = random.random() + log("Unable to acquire lock, try %i; will sleep for %.2f " \ + "seconds and try for %.2f more seconds." % (tries, sleeptime, max_time - (time.time()-starttime))) + tries += 1 + time.sleep(sleeptime) + + log("Fatal exception - Unable to acquire lock") + raise Exception("Unable to acquire lock") + +def check_lock(fd, timeout): + """ + For internal use only. + + Given a fd that is locked, determine which process has the lock. + Kill said process if it is older than "timeout" seconds. + This will log the PID of the "other process". + """ + + pid = get_lock_pid(fd) + if pid == os.getpid(): + return True + + if timeout < 0: + log("Another process, %d, holds the cache lock." 
% pid) + return False + + try: + age = get_pid_age(pid) + except: + log("Another process, %d, holds the cache lock." % pid) + log("Unable to get the other process's age; will not time it out.") + return False + + log("Another process, %d (age %d seconds), holds the cache lock." % (pid, age)) + + if age > timeout: + os.kill(pid, signal.SIGKILL) + else: + return False + + return True + +linux_struct_flock = "hhxxxxqqixxxx" +try: + os.O_LARGEFILE +except AttributeError: + start_len = "hhlli" + +def get_lock_pid(fd): + # For reference, here's the definition of struct flock on Linux + # (/usr/include/bits/fcntl.h). + # + # struct flock + # { + # short int l_type; /* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK. */ + # short int l_whence; /* Where `l_start' is relative to (like `lseek'). */ + # __off_t l_start; /* Offset where the lock begins. */ + # __off_t l_len; /* Size of the locked area; zero means until EOF. */ + # __pid_t l_pid; /* Process holding the lock. */ + # }; + # + # Note that things are different on Darwin + # Assuming off_t is unsigned long long, pid_t is int + try: + if sys.platform == "darwin": + arg = struct.pack("QQihh", 0, 0, 0, fcntl.F_WRLCK, 0) + else: + arg = struct.pack(linux_struct_flock, fcntl.F_WRLCK, 0, 0, 0, 0) + result = fcntl.fcntl(fd, fcntl.F_GETLK, arg) + except IOError, ie: + if ie.errno != errno.EINVAL: + raise + log("Unable to determine which PID has the lock due to a " \ + "python portability failure. Contact the developers with your" \ + " platform information for support.") + return False + if sys.platform == "darwin": + _, _, pid, _, _ = struct.unpack("QQihh", result) + else: + _, _, _, _, pid = struct.unpack(linux_struct_flock, result) + return pid + +def get_pid_age(pid): + now = time.time() + st = os.stat("/proc/%d" % pid) + return now - st.st_ctime + +def call_scontrol(jobid=""): + """ + Call scontrol directly for a jobid. + If none is specified, query all jobid's. + + Returns a python dictionary with the job info. 
+ """ + scontrol = get_slurm_location('scontrol') + + starttime = time.time() + log("Starting scontrol.") + command = (scontrol, 'show', 'job') + if jobid: + command += (jobid,) + scontrol_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + scontrol_out, _ = scontrol_proc.communicate() + + result = parse_scontrol(scontrol_out) + log("Finished scontrol (time=%f)." % (time.time()-starttime)) + + if scontrol_proc.returncode == 1: # Completed + result = {jobid: {'BatchJobId': '"%s"' % jobid, "JobStatus": "4", "ExitCode": ' 0'}} + elif scontrol_proc.returncode == 271: # Removed + result = {jobid: {'BatchJobId': '"%s"' % jobid, 'JobStatus': '3', 'ExitCode': ' 0'}} + elif scontrol_proc.returncode != 0: + raise Exception("scontrol failed with exit code %s" % str(scontrol_proc.returncode)) + + # If the job has completed... + if jobid is not "" and "JobStatus" in result[jobid] and (result[jobid]["JobStatus"] == '4' or result[jobid]["JobStatus"] == '3'): + # Get the finished job stats and update the result + finished_job_stats = get_finished_job_stats(jobid) + result[jobid].update(finished_job_stats) + + return result + + +def which(program): + """ + Determine if the program is in the path. + + arg program: name of the program to search + returns: full path to executable, or None if executable is not found + """ + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + +def convert_cpu_to_seconds(cpu_string): + # The time fields in sacct's output have this format: + # [DD-[hh:]]mm:ss + # Convert that to just seconds. 
+ elem = re.split('[-:]', cpu_string) + secs = int(elem[-1]) + int(elem[-2]) * 60 + if len(elem) > 2: + secs += int(elem[-3]) * 3600 + if len(elem) > 3: + secs += int(elem[-4]) * 86400 + return secs + +def get_finished_job_stats(jobid): + """ + Get a completed job's statistics such as used RAM and cpu usage. + """ + + # First, list the attributes that we want + return_dict = { "ImageSize": 0, "ExitCode": 0, "RemoteUserCpu": 0 } + + # Next, query the appropriate interfaces for the completed job information + sacct = get_slurm_location('sacct') + log("Querying sacct for completed job for jobid: %s" % (str(jobid))) + child_stdout = os.popen("%s -j %s -l --noconvert -P" % (sacct, str(jobid))) + sacct_data = child_stdout.readlines() + ret = child_stdout.close() + + if ret: + # retry without --noconvert for slurm < 15.8 + child_stdout = os.popen("sacct -j %s -l -P" % (str(jobid))) + sacct_data = child_stdout.readlines() + child_stdout.close() + + try: + reader = csv.DictReader(sacct_data, delimiter="|") + except Exception, e: + log("Unable to read in CSV output from sacct: %s" % str(e)) + return return_dict + + # Slurm can return more than 1 row, for some odd reason. + # so sum up relevant values + for row in reader: + if row["AveCPU"] is not "": + try: + return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"]) + except: + log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"])) + raise + if row["MaxRSS"] is not "": + # Remove the trailing [KMGTP] and scale the value appropriately + # Note: We assume that all values will have a suffix, and we + # want the value in kilos. 
+ try: + value = row["MaxRSS"] + factor = 1 + if value[-1] == 'M': + factor = 1024 + elif value[-1] == 'G': + factor = 1024 * 1024 + elif value[-1] == 'T': + factor = 1024 * 1024 * 1024 + elif value[-1] == 'P': + factor = 1024 * 1024 * 1024 * 1024 + return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor + except: + log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"])) + raise + if row["ExitCode"] is not "": + try: + return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0]) + except: + log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"])) + raise + return return_dict + + +_slurm_location_cache = None +def get_slurm_location(program): + """ + Locate the copy of the slurm bin the blahp configuration wants to use. + """ + global _slurm_location_cache + if _slurm_location_cache != None: + return os.path.join(_slurm_location_cache, program) + + cmd = 'echo "%s/%s"' % (config.get('slurm_binpath'), 'scontrol') + + child_stdout = os.popen(cmd) + output = child_stdout.read().split("\n")[0].strip() + if child_stdout.close(): + raise Exception("Unable to determine scontrol location: %s" % output) + + _slurm_location_cache = os.path.dirname(output) + return output + +job_id_re = re.compile("JobId=([0-9]+) .*") +exec_host_re = re.compile("\s*BatchHost=([\w\-.]+)") +status_re = re.compile("\s*JobState=([\w]+) .*") +exit_status_re = re.compile(".* ExitCode=(-?[0-9]+:[0-9]+)") +status_mapping = {"BOOT_FAIL": 4, "CANCELLED": 3, "COMPLETED": 4, "CONFIGURING": 1, "COMPLETING": 2, "FAILED": 4, "NODE_FAIL": 4, "PENDING": 1, "PREEMPTED": 4, "RUNNING": 2, "SPECIAL_EXIT": 4, "STOPPED": 2, "SUSPENDED": 2} + +def parse_scontrol(output): + """ + Parse the stdout of "scontrol show job" into a python dictionary + containing the information we need. 
+ """ + job_info = {} + cur_job_id = None + cur_job_info = {} + for line in output.split('\n'): + line = line.strip() + m = job_id_re.match(line) + if m: + if cur_job_id: + job_info[cur_job_id] = cur_job_info + cur_job_id = m.group(1) + #print cur_job_id, line + cur_job_info = {"BatchJobId": '"%s"' % cur_job_id} + continue + if cur_job_id == None: + continue + m = exec_host_re.match(line) + if m: + cur_job_info["WorkerNode"] = '"' + m.group(1) + '"' + continue + m = status_re.match(line) + if m: + status = status_mapping.get(m.group(1), 0) + if status != 0: + cur_job_info["JobStatus"] = str(status) + continue + m = exit_status_re.match(line) + if m: + cur_job_info["ExitCode"] = ' %s' % m.group(1).split(":")[0] + continue + if cur_job_id: + job_info[cur_job_id] = cur_job_info + return job_info + +def job_dict_to_string(info): + result = ["%s=%s;" % (i[0], i[1]) for i in info.items()] + return "[" + " ".join(result) + " ]" + +def fill_cache(cache_location): + log("Starting query to fill cache.") + results = call_scontrol() + log("Finished query to fill cache.") + (fd, filename) = tempfile.mkstemp(dir = "/var/tmp") + # Open the file with a proper python file object + f = os.fdopen(fd, "w") + writer = csv.writer(f, delimiter='\t') + try: + try: + for key, val in results.items(): + key = key.split(".")[0] + writer.writerow([key, pickle.dumps(val)]) + os.fsync(fd) + except: + os.unlink(filename) + raise + finally: + f.close() + os.rename(filename, cache_location) + + global launchtime + launchtime = time.time() + +cache_line_re = re.compile("([0-9]+[\.\w\-]+):\s+(.+)") +def cache_to_status(jobid, fd): + reader = csv.reader(fd, delimiter='\t') + for row in reader: + if row[0] == jobid: + return pickle.loads(row[1]) + +def check_cache(jobid, recurse=True): + uid = os.geteuid() + username = pwd.getpwuid(uid).pw_name + cache_dir = os.path.join("/var/tmp", "slurm_cache_%s" % username) + if recurse: + try: + s = os.stat(cache_dir) + except OSError, oe: + if oe.errno != 2: + 
raise + os.mkdir(cache_dir, 0755) + s = os.stat(cache_dir) + if s.st_uid != uid: + raise Exception("Unable to check cache because it is owned by UID %d" % s.st_uid) + cache_location = os.path.join(cache_dir, "blahp_results_cache") + try: + fd = open(cache_location, "a+") + except IOError, ie: + if ie.errno != 2: + raise + # Create an empty file so we can hold the file lock + fd = open(cache_location, "w+") + ExclusiveLock(fd) + # If someone grabbed the lock between when we opened and tried to + # acquire, they may have filled the cache + if os.stat(cache_location).st_size == 0: + fill_cache(cache_location) + fd.close() + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + ExclusiveLock(fd) + s = os.fstat(fd.fileno()) + if s.st_uid != uid: + raise Exception("Unable to check cache file because it is owned by UID %d" % s.st_uid) + if (s.st_size == 0) or (launchtime - s.st_mtime > cache_timeout): + # If someone filled the cache between when we opened the file and + # grabbed the lock, we may not need to fill the cache. 
+ s2 = os.stat(cache_location) + if (s2.st_size == 0) or (launchtime - s2.st_mtime > cache_timeout): + fill_cache(cache_location) + if recurse: + return check_cache(jobid, recurse=False) + else: + return None + return cache_to_status(jobid, fd) + +job_status_re = re.compile(".*JobStatus=(\d+);.*") + +def main(): + initLog() + + # Accept the optional -w argument, but ignore it + if len(sys.argv) == 2: + jobid_arg = sys.argv[1] + elif len(sys.argv) == 3 and sys.argv[1] == "-w": + jobid_arg = sys.argv[2] + else: + print "1Usage: slurm_status.py slurm//" + return 1 + jobid = jobid_arg.split("/")[-1].split(".")[0] + + global config + config = blah.BlahConfigParser(defaults={'slurm_binpath': '/usr/bin'}) + + log("Checking cache for jobid %s" % jobid) + cache_contents = None + try: + cache_contents = check_cache(jobid) + except Exception, e: + msg = "1ERROR: Internal exception, %s" % str(e) + log(msg) + #print msg + if not cache_contents: + log("Jobid %s not in cache; querying SLURM" % jobid) + results = call_scontrol(jobid) + log("Finished querying SLURM for jobid %s" % jobid) + if not results or jobid not in results: + log("1ERROR: Unable to find job %s" % jobid) + print "1ERROR: Unable to find job %s" % jobid + else: + log("0%s" % job_dict_to_string(results[jobid])) + print "0%s" % job_dict_to_string(results[jobid]) + else: + log("Jobid %s in cache." 
% jobid) + log("0%s" % job_dict_to_string(cache_contents)) + + if cache_contents["JobStatus"] == '4' or cache_contents["JobStatus"] == '3': + finished_job_stats = get_finished_job_stats(jobid) + cache_contents.update(finished_job_stats) + + print "0%s" % job_dict_to_string(cache_contents) + return 0 + +if __name__ == "__main__": + try: + sys.exit(main()) + except SystemExit: + raise + except Exception, e: + log(traceback.format_exc()) + print "1ERROR: %s" % str(e).replace("\n", "\\n") + sys.exit(0) diff --git a/src/scripts/slurm_status.sh b/src/scripts/slurm_status.sh index 56b862e3..ae800124 100755 --- a/src/scripts/slurm_status.sh +++ b/src/scripts/slurm_status.sh @@ -1,20 +1,8 @@ #!/bin/bash -# File: slurm_status.sh -# -# Author: David Rebatto -# e-mail: David.Rebatto@mi.infn.it -# -# -# Revision history: -# 18-Jun-2012: Original release -# -# Description: -# Return a classad describing the status of a SLURM job -# -# Copyright (c) Members of the EGEE Collaboration. 2012. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. +# File: slurm_status.sh +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,10 +19,106 @@ . 
`dirname $0`/blah_load_config.sh -if [ "x$job_registry" != "x" ] ; then - ${blah_sbin_directory}/blah_job_registry_lkup $@ - exit 0 -else - echo "job registry not enabled (required for SLURM support)" >&2 - exit 1 +if [ -x ${blah_libexec_directory}/slurm_status.py ] ; then + exec ${blah_libexec_directory}/slurm_status.py "$@" +fi + +if [ -z "$slurm_binpath" ] ; then + slurm_binpath=/usr/bin fi + +usage_string="Usage: $0 [-w] [-n]" + +#echo $0 "$@" >>~/slurm.debug + +############################################################### +# Parse parameters +############################################################### + +while getopts "wn" arg +do + case "$arg" in + w) getwn="yes" ;; + n) ;; + + -) break ;; + ?) echo $usage_string + exit 1 ;; + esac +done + +shift `expr $OPTIND - 1` + +pars=$* +proxy_dir=~/.blah_jobproxy_dir + +for reqfull in $pars ; do + reqjob=`echo $reqfull | sed -e 's/^.*\///'` + + staterr=/tmp/${reqjob}_staterr + +#echo "running: ${slurm_binpath}/scontrol show job $reqjob" >>~/slurm.debug + result=`${slurm_binpath}/scontrol show job $reqjob 2>$staterr` + stat_exit_code=$? 
+#echo "stat_exit_code=$stat_exit_code" >>~/slurm.debug + result=`echo "$result" | awk -v job_id=$reqjob -v proxy_dir=$proxy_dir ' +BEGIN { + blah_status = 4 + slurm_status = "" + exit_code = "0" +} + +/JobState=/ { + slurm_status = substr( $1, index( $1, "=" ) + 1 ) +} + +/ExitCode=/ { + if ( split( $4, tmp, "[=:]" ) == 3 ) { + exit_code = tmp[2] + } +} + +END { + if ( slurm_status ~ "BOOT_FAIL" ) { blah_status = 4 } + if ( slurm_status ~ "CANCELLED" ) { blah_status = 3 } + if ( slurm_status ~ "COMPLETED" ) { blah_status = 4 } + if ( slurm_status ~ "CONFIGURING" ) { blah_status = 1 } + if ( slurm_status ~ "COMPLETING" ) { blah_status = 2 } + if ( slurm_status ~ "FAILED" ) { blah_status = 4 } + if ( slurm_status ~ "NODE_FAIL" ) { blah_status = 4 } + if ( slurm_status ~ "PENDING" ) { blah_status = 1 } + if ( slurm_status ~ "PREEMPTED" ) { blah_status = 4 } + if ( slurm_status ~ "RUNNING" ) { blah_status = 2 } + if ( slurm_status ~ "SPECIAL_EXIT" ) { blah_status = 4 } + if ( slurm_status ~ "STOPPED" ) { blah_status = 2 } + if ( slurm_status ~ "SUSPENDED" ) { blah_status = 2 } + + print "[BatchJobId=\"" job_id "\";JobStatus=" blah_status ";" + if ( blah_status == 4 ) { + print "ExitCode=" exit_code ";" + } + print "]\n" + if ( blah_status == 3 || blah_status == 4 ) { + #system( "rm " proxy_dir "/" job_id ".proxy 2>/dev/null" ) + } +} +' +` +#echo result=$result >>~/slurm.debug + errout=`cat $staterr` + rm -f $staterr 2>/dev/null + + if echo "$errout" | grep -q "Invalid job id specified" ; then + stat_exit_code=0 + fi + if [ $stat_exit_code -eq 0 ] ; then + echo 0${result} +#echo 0${result} >>~/slurm.debug + else + echo 1Error: ${errout} +#echo 1Error: ${errout} >>~/slurm.debug + fi + +done + +exit 0 diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh index bafe6c27..a9d9a894 100755 --- a/src/scripts/slurm_submit.sh +++ b/src/scripts/slurm_submit.sh @@ -1,40 +1,43 @@ #!/bin/bash # # File: slurm_submit.sh -# Author: David Rebatto 
(david.rebatto@mi.infn.it) +# Author: Jaime Frey (jfrey@cs.wisc.edu) +# Based on code by David Rebatto (david.rebatto@mi.infn.it) # -# Revision history: -# 14-Mar-2012: Original release +# Description: +# Submission script for SLURM, to be invoked by blahpd server. +# Usage: +# slurm_submit.sh -c [-i ] [-o ] [-e ] [-w working dir] [-- command's arguments] # -# -# Copyright (c) Members of the EGEE Collaboration. 2004. -# See http://www.eu-egee.org/partners/ for details on the copyright -# holders. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# Copyright (c) Members of the EGEE Collaboration. 2004. +# Copyright (c) HTCondor Team, Computer Sciences Department, +# University of Wisconsin-Madison, WI. 2015. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and # limitations under the License. # . 
`dirname $0`/blah_common_submit_functions.sh -bls_parse_submit_options "$@" -bls_setup_all_files - # Default values for configuration variables slurm_std_storage=${slurm_std_storage:-/dev/null} -slurm_opt_prefix=${slurm_opt_prefix:-SBATCH} +slurm_binpath=${slurm_binpath:-/usr/bin} + +bls_parse_submit_options "$@" + +bls_setup_all_files # Write wrapper preamble -cat >$bls_tmp_file << end_of_preamble +cat > $bls_tmp_file << end_of_preamble #!/bin/bash # SLURM job wrapper generated by `basename $0` # on `/bin/date` @@ -44,57 +47,67 @@ cat >$bls_tmp_file << end_of_preamble # proxy_local_file = $bls_proxy_local_file # # SLURM directives: -#$slurm_opt_prefix -o $slurm_std_storage -#$slurm_opt_prefix -e $slurm_std_storage +#SBATCH -o $slurm_std_storage +#SBATCH -e $slurm_std_storage end_of_preamble -# Add site specific directives +#local batch system-specific file output must be added to the submit file bls_local_submit_attributes_file=${blah_libexec_directory}/slurm_local_submit_attributes.sh -bls_set_up_local_and_extra_args -# Write SLURM directives according to command line options -# handle queue overriding -[ -z "$bls_opt_queue" ] || grep -q "^#$slurm_opt_prefix -p" $bls_tmp_file || - echo "#$slurm_opt_prefix -p $bls_opt_queue" >> $bls_tmp_file +# Handle queues and paritions (same thing in SLURM) (copied from PBS submit file) +[ -z "$bls_opt_queue" ] || grep -q "^#SBATCH --partition" $bls_tmp_file || echo "#SBATCH --partition=$bls_opt_queue" >> $bls_tmp_file -# Input sandbox setup -bls_fl_subst_and_dump inputsand "scp `hostname -f`:@@F_LOCAL @@F_REMOTE" >> $bls_tmp_file +if [ "x$bls_opt_req_mem" != "x" ] +then + # Different schedulers require different memory checks + echo "#SBATCH --mem=${bls_opt_req_mem}" >> $bls_tmp_file +fi -# The wrapper's body... 
-bls_add_job_wrapper +# Simple support for multi-cpu attributes +if [[ $bls_opt_mpinodes -gt 1 ]] ; then + echo "#SBATCH --nodes=1" >> $bls_tmp_file + echo "#SBATCH --ntasks=1" >> $bls_tmp_file + echo "#SBATCH --cpus-per-task=$bls_opt_mpinodes" >> $bls_tmp_file +fi -# Output sandbox setup -echo "# Copy the output file back..." >> $bls_tmp_file -bls_fl_subst_and_dump outputsand "scp @@F_REMOTE `hostname -f`:@@F_LOCAL" >> $bls_tmp_file +# Do the local and extra args after all #SBATCH commands, otherwise slurm ignores anything +# after a non-#SBATCH command +bls_set_up_local_and_extra_args -if [ "x$bls_opt_debug" = "xyes" ]; then - echo "Submit file written to $bls_tmp_file" - exit -fi +# Input and output sandbox setup. +# Assume all filesystems are shared. + +bls_add_job_wrapper +bls_save_submit ############################################################### # Submit the script ############################################################### datenow=`date +%Y%m%d` -jobID=`sbatch $bls_tmp_file | sed 's/Submitted batch job //'` +jobID=`${slurm_binpath}/sbatch $bls_tmp_file` # actual submission retcode=$? 
+ if [ "$retcode" != "0" ] ; then - rm -f $bls_tmp_file - exit 1 + rm -f $bls_tmp_file + echo "Error from sbatch: $jobID" >&2 + exit 1 fi -# Compose the blahp jobID ("slurm" + metadata + slurm jobid) -blahp_jobID="slurm/${datenow}/${jobID}" - -if [ "x$job_registry" != "x" ]; then - now=$((`date +%s` - 1)) - ${blah_sbin_directory}/blah_job_registry_add "$blahp_jobID" "$jobID" 1 $now "$bls_opt_creamjobid" "$bls_proxy_local_file" "$bls_opt_proxyrenew_numeric" "$bls_opt_proxy_subject" +# The job id is actually the first numbers in the string (slurm support) +jobID=`echo $jobID | awk 'match($0,/[0-9]+/){print substr($0, RSTART, RLENGTH)}'` +if [ "X$jobID" == "X" ]; then + rm -f $bls_tmp_file + echo "Error: job id missing" >&2 + echo Error # for the sake of waiting fgets in blahpd + exit 1 fi -echo "BLAHP_JOBID_PREFIX$blahp_jobID" +# Compose the blahp jobID ("slurm/" + datenow + pbs jobid) +blahp_jobID="slurm/`basename $datenow`/$jobID" +echo "BLAHP_JOBID_PREFIX$blahp_jobID" + bls_wrap_up_submit exit $retcode - diff --git a/src/server.c b/src/server.c index afcf0f39..50fd7c35 100644 --- a/src/server.c +++ b/src/server.c @@ -84,6 +84,9 @@ #include #include +#include "globus_gsi_credential.h" +#include "globus_gsi_proxy.h" + #include "blahpd.h" #include "config.h" #include "job_registry.h" @@ -143,7 +146,7 @@ int set_cmd_list_option(char **command, classad_context cad, const char *attribu int set_cmd_string_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); int set_cmd_int_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); int set_cmd_bool_option(char **command, classad_context cad, const char *attribute, const char *option, const int quote_style); -char *limit_proxy(char* proxy_name, char *requested_name); +static char *limit_proxy(char* proxy_name, char *requested_name, char **error_message); int getProxyInfo(char* proxname, char** subject, char** 
fqan); int logAccInfo(char* jobId, char* server_lrms, classad_context cad, char* fqan, char* userDN, char** environment); int CEReq_parse(classad_context cad, char* filename, char *proxysubject, char *proxyfqan); @@ -185,6 +188,7 @@ int enable_condor_glexec = FALSE; int require_proxy_on_submit = FALSE; int disable_wn_proxy_renewal = FALSE; int disable_proxy_user_copy = FALSE; +int disable_limited_proxy = FALSE; int synchronous_termination = FALSE; static char *mapping_parameter[MEXEC_PARAM_COUNT]; @@ -372,6 +376,13 @@ serveConnection(int cli_socket, char* cli_ip_addr) tmp_dir = DEFAULT_TEMP_DIR; } +/* In the Condor build of the blahp, we can find all the libraries we need + * via the RUNPATH. Setting LD_LIBRARY_PATH can muck up the command line + * tools for the local batch system. + * + * Similarly, in OSG, all Globus libraries are in the expected location. + */ +#if 0 needed_libs = make_message("%s/lib:%s/externals/lib:%s/lib:/opt/lcg/lib", result, result, getenv("GLOBUS_LOCATION") ? getenv("GLOBUS_LOCATION") : "/opt/globus"); old_ld_lib=getenv("LD_LIBRARY_PATH"); if(old_ld_lib) @@ -387,13 +398,14 @@ serveConnection(int cli_socket, char* cli_ip_addr) } else setenv("LD_LIBRARY_PATH",needed_libs,1); - +#endif blah_script_location = strdup(blah_config_handle->libexec_path); blah_version = make_message(RCSID_VERSION, VERSION, "poly,new_esc_format"); require_proxy_on_submit = config_test_boolean(config_get("blah_require_proxy_on_submit",blah_config_handle)); enable_condor_glexec = config_test_boolean(config_get("blah_enable_glexec_from_condor",blah_config_handle)); disable_wn_proxy_renewal = config_test_boolean(config_get("blah_disable_wn_proxy_renewal",blah_config_handle)); disable_proxy_user_copy = config_test_boolean(config_get("blah_disable_proxy_user_copy",blah_config_handle)); + disable_limited_proxy = config_test_boolean(config_get("blah_disable_limited_proxy",blah_config_handle)); /* Scan configuration for submit attributes to pass to local script */ 
pass_all_submit_attributes = config_test_boolean(config_get("blah_pass_all_submit_attributes",blah_config_handle)); @@ -954,11 +966,12 @@ cmd_set_glexec_dn(void *args) /* proxt4 must be limited for subsequent submission */ if(argv[3][0]=='0') { - if((proxynameNew = limit_proxy(proxt4, NULL)) == NULL) + if (((proxynameNew = limit_proxy(proxt4, NULL, NULL)) == NULL) || + (disable_limited_proxy)) { free(mapping_parameter[MEXEC_PARAM_DELEGCRED]); mapping_parameter[MEXEC_PARAM_DELEGCRED] = NULL; - result = strdup("F Cannot\\ limit\\ proxy\\ file"); + result = strdup("F Not\\ limiting\\ proxy\\ file"); } else mapping_parameter[MEXEC_PARAM_SRCPROXY] = proxynameNew; @@ -1024,6 +1037,7 @@ cmd_submit_job(void *args) char *error_string; int res = 1; char *proxyname = NULL; + char *iwd = NULL; char *proxysubject = NULL; char *proxyfqan = NULL; char *proxynameNew = NULL; @@ -1084,6 +1098,30 @@ cmd_submit_job(void *args) proxyname = NULL; } } + /* If the proxy is a relative path, we must prepend the Iwd to make it absolute */ + if (proxyname && proxyname[0] != '/') { + if (classad_get_dstring_attribute(cad, "Iwd", &iwd) == C_CLASSAD_NO_ERROR) { + size_t iwdlen = strlen(iwd); + size_t proxylen = iwdlen + strlen(proxyname) + 1; + char *proxynameTmp; + proxynameTmp = malloc(proxylen + 1); + if (!proxynameTmp) { + resultLine = make_message("%s 1 Malloc\\ failure N/A", reqId); + goto cleanup_lrms; + } + memcpy(proxynameTmp, iwd, iwdlen); + proxynameTmp[iwdlen] = '/'; + strcpy(proxynameTmp+iwdlen+1, proxyname); + free(proxyname); + free(iwd); + iwd = NULL; + proxyname = proxynameTmp; + proxynameTmp = NULL; + } else { + resultLine = make_message("%s 1 Relative\\ x509UserProxy\\ specified\\ without\\ Iwd N/A", reqId); + goto cleanup_lrms; + } + } /* If there are additional arguments, we have to map on a different id */ if(argv[CMD_SUBMIT_JOB_ARGS + 1] != NULL) @@ -1118,13 +1156,17 @@ cmd_submit_job(void *args) } } } - else if (proxyname != NULL) + else if ((proxyname) != NULL && 
(!disable_limited_proxy)) { /* not in glexec mode: need to limit the proxy */ - if((proxynameNew = limit_proxy(proxyname, NULL)) == NULL) + char *errmsg = NULL; + if((proxynameNew = limit_proxy(proxyname, NULL, &errmsg)) == NULL) { /* PUSH A FAILURE */ - resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy N/A", reqId); + char * escaped_errmsg = (errmsg) ? escape_spaces(errmsg) : NULL; + if (escaped_errmsg) resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy\\ (%s) N/A", reqId, escaped_errmsg); + else resultLine = make_message("%s 1 Unable\\ to\\ limit\\ the\\ proxy N/A", reqId); + if (errmsg) free(errmsg); goto cleanup_proxyname; } free(proxyname); @@ -1268,7 +1310,9 @@ cmd_submit_job(void *args) (set_cmd_int_option (&command, cad, "HostSMPSize", "-N", INT_NOQUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_bool_option (&command, cad, "StageCmd", "-s", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || (set_cmd_string_option(&command, cad, "ClientJobId","-j", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || - (set_cmd_string_option(&command, cad, "BatchExtraSubmitArgs", "-a", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) + (set_cmd_string_option(&command, cad, "JobDirectory","-D", NO_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || + (set_cmd_string_option(&command, cad, "BatchExtraSubmitArgs", "-a", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY) || + (set_cmd_int_option(&command, cad, "RequestMemory", "-m", INT_NOQUOTE) == C_CLASSAD_OUT_OF_MEMORY)) // (set_cmd_string_option(&command, cad, "Args", "--", SINGLE_QUOTE) == C_CLASSAD_OUT_OF_MEMORY)) { /* PUSH A FAILURE */ @@ -1725,7 +1769,7 @@ cmd_status_job_all(void *args) } int -get_status_and_old_proxy(int use_glexec, char *jobDescr, +get_status_and_old_proxy(int use_glexec, char *jobDescr, const char *proxyFileName, char **status_argv, char **old_proxy, char **workernode, char **error_string) { @@ -1817,6 +1861,21 @@ get_status_and_old_proxy(int use_glexec, char *jobDescr, job_registry_free_split_id(spid); return 1; /* 'local' 
state */ } + // Look for the limited proxy next to the new proxy - this is a common case for HTCondor-based submission. + free(proxy_link); + if ((proxy_link = make_message("%s.lmt", proxyFileName)) == NULL) + { + fprintf(stderr, "Out of memory.\n"); + exit(MALLOC_ERROR); + } + if (access(proxy_link, R_OK) == 0) + { + *old_proxy = proxy_link; + // do not free proxy_link in this case. + free(r_old_proxy); + job_registry_free_split_id(spid); + return 1; + } free(proxy_link); free(r_old_proxy); job_registry_free_split_id(spid); @@ -1937,7 +1996,7 @@ cmd_renew_proxy(void *args) if (blah_children_count>0) check_on_children(blah_children, blah_children_count); - jobStatus=get_status_and_old_proxy(use_mapping, jobDescr, argv + CMD_RENEW_PROXY_ARGS + 1, &old_proxy, &workernode, &error_string); + jobStatus=get_status_and_old_proxy(use_mapping, jobDescr, proxyFileName, argv + CMD_RENEW_PROXY_ARGS + 1, &old_proxy, &workernode, &error_string); old_proxy_len = -1; if (old_proxy != NULL) old_proxy_len = strlen(old_proxy); if ((jobStatus < 0) || (old_proxy == NULL) || (old_proxy_len <= 0)) @@ -1953,21 +2012,25 @@ cmd_renew_proxy(void *args) switch(jobStatus) { case 1: /* job queued: copy the proxy locally */ - if (!use_mapping) + if (!use_mapping) { - limit_proxy(proxyFileName, old_proxy); /*FIXME: should check if limited proxies are enabled? 
*/ - resultLine = make_message("%s 0 Proxy\\ renewed", reqId); + if (!disable_limited_proxy) + { + limit_proxy(proxyFileName, old_proxy, NULL); + } + resultLine = make_message("%s 0 Proxy\\ renewed", reqId); } else { exe_command.delegation_type = atoi(argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGTYPE]); exe_command.delegation_cred = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_DELEGCRED]; - if (use_glexec) + if ((use_glexec) || (disable_limited_proxy)) { exe_command.source_proxy = argv[CMD_RENEW_PROXY_ARGS + 1 + MEXEC_PARAM_SRCPROXY]; } else { - limited_proxy_name = limit_proxy(proxyFileName, NULL); - exe_command.source_proxy = limited_proxy_name; + limited_proxy_name = limit_proxy(proxyFileName, NULL, NULL); + exe_command.source_proxy = limited_proxy_name; + } exe_command.dest_proxy = old_proxy; if (exe_command.source_proxy == NULL) @@ -2080,9 +2143,16 @@ cmd_send_proxy_to_worker_node(void *args) if (workernode != NULL && strcmp(workernode, "")) { - if(!use_glexec) + if (!use_glexec) { - proxyFileNameNew = limit_proxy(proxyFileName, NULL); + if (disable_limited_proxy) + { + proxyFileNameNew = strdup(proxyFileName); + } + else + { + proxyFileNameNew = limit_proxy(proxyFileName, NULL, NULL); + } } else proxyFileNameNew = strdup(argv[CMD_SEND_PROXY_TO_WORKER_NODE_ARGS + MEXEC_PARAM_SRCPROXY + 1]); @@ -2554,14 +2624,167 @@ set_cmd_list_option(char **command, classad_context cad, const char *attribute, if (to_append) free (to_append); return(result); } + +const char *grid_proxy_errmsg = NULL; -char * -limit_proxy(char* proxy_name, char *limited_proxy_name) +int activate_globus() +{ + static int active = 0; + + if (active) { + return 0; + } + + if ( globus_thread_set_model( "pthread" ) ) { + grid_proxy_errmsg = "failed to activate Globus"; + return -1; + } + + if ( globus_module_activate(GLOBUS_GSI_CREDENTIAL_MODULE) ) { + grid_proxy_errmsg = "failed to activate Globus"; + return -1; + } + + if ( globus_module_activate(GLOBUS_GSI_PROXY_MODULE) ) { + grid_proxy_errmsg 
= "failed to activate Globus"; + return -1; + } + + active = 1; + return 0; +} + +/* Returns lifetime left on proxy, in seconds. + * 0 means proxy is expired. + * -1 means an error occurred. + */ +int grid_proxy_info(const char *proxy_filename) +{ + globus_gsi_cred_handle_t handle = NULL; + time_t time_left = -1; + + if ( activate_globus() < 0 ) { + return -1; + } + + if (globus_gsi_cred_handle_init(&handle, NULL)) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // We should have a proxy file, now, try to read it + if (globus_gsi_cred_read_proxy(handle, proxy_filename)) { + grid_proxy_errmsg = "unable to read proxy file"; + goto cleanup; + } + + if (globus_gsi_cred_get_lifetime(handle, &time_left)) { + grid_proxy_errmsg = "unable to extract expiration time"; + goto cleanup; + } + + if ( time_left < 0 ) { + time_left = 0; + } + + cleanup: + if (handle) { + globus_gsi_cred_handle_destroy(handle); + } + + return time_left; +} + +/* Writes new proxy derived from existing one. Argument lifetime is the + * number of seconds until expiration for the new proxy. A 0 lifetime + * means the same expiration time as the source proxy. + * Returns 0 on success and -1 on error. 
+ */ +int grid_proxy_init(const char *src_filename, char *dst_filename, + int lifetime) +{ + globus_gsi_cred_handle_t src_handle = NULL; + globus_gsi_cred_handle_t dst_handle = NULL; + globus_gsi_proxy_handle_t dst_proxy_handle = NULL; + int rc = -1; + time_t src_time_left = -1; + globus_gsi_cert_utils_cert_type_t cert_type = GLOBUS_GSI_CERT_UTILS_TYPE_LIMITED_PROXY; + + if ( activate_globus() < 0 ) { + return -1; + } + + if (globus_gsi_cred_handle_init(&src_handle, NULL)) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // We should have a proxy file, now, try to read it + if (globus_gsi_cred_read_proxy(src_handle, src_filename)) { + grid_proxy_errmsg = "unable to read proxy file"; + goto cleanup; + } + + if (globus_gsi_cred_get_lifetime(src_handle, &src_time_left)) { + grid_proxy_errmsg = "unable to extract expiration time"; + goto cleanup; + } + if ( src_time_left < 0 ) { + src_time_left = 0; + } + + if (globus_gsi_proxy_handle_init( &dst_proxy_handle, NULL )) { + grid_proxy_errmsg = "failed to initialize Globus data structures"; + goto cleanup; + } + + // lifetime == desired dst lifetime + // src_time_left == time left on src + if ( lifetime == 0 || lifetime > src_time_left ) { + lifetime = src_time_left; + } + if (globus_gsi_proxy_handle_set_time_valid( dst_proxy_handle, lifetime/60 )) { + grid_proxy_errmsg = "unable to set proxy expiration time"; + goto cleanup; + } + + if (globus_gsi_proxy_handle_set_type( dst_proxy_handle, cert_type)) { + grid_proxy_errmsg = "unable to set proxy type"; + goto cleanup; + } + + if (globus_gsi_proxy_create_signed( dst_proxy_handle, src_handle, &dst_handle)) { + grid_proxy_errmsg = "unable to generate proxy"; + goto cleanup; + } + + if (globus_gsi_cred_write_proxy( dst_handle, dst_filename )) { + grid_proxy_errmsg = "unable to write proxy file"; + goto cleanup; + } + + rc = 0; + + cleanup: + if (src_handle) { + globus_gsi_cred_handle_destroy(src_handle); + } + if (dst_handle) { + 
globus_gsi_cred_handle_destroy(dst_handle); + } + if ( dst_handle ) { + globus_gsi_proxy_handle_destroy( dst_proxy_handle ); + } + + return rc; +} + +static char * +limit_proxy(char* proxy_name, char *limited_proxy_name, char **error_message) { int seconds_left, hours_left, minutes_left; char *limcommand; int res; - char* globuslocation; char *limit_command_output; int tmpfd; exec_cmd_t exe_command = EXEC_CMD_DEFAULT; @@ -2578,27 +2801,39 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) limited_proxy_name = limited_proxy_made_up_name; } - globuslocation = (getenv("GLOBUS_LOCATION") ? getenv("GLOBUS_LOCATION") : "/opt/globus"); - exe_command.command = make_message("%s/bin/grid-proxy-info -timeleft -file %s", - globuslocation, proxy_name); - if (exe_command.command == NULL) + /* Sanity check - make sure the destination is writable and the source exists */ + tmpfd = open(limited_proxy_name, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); + if (tmpfd == -1) { - fprintf(stderr, "blahpd: out of memory\n"); - exit(1); + char * errmsg = make_message("Unable to create limited proxy file (%s):" + " errno=%d, %s", limited_proxy_name, errno, strerror(errno)); + if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (!errmsg) return(NULL); + if (error_message) *error_message = errmsg; else free(errmsg); + return NULL; } - res = execute_cmd(&exe_command); - free(exe_command.command); - - if (res != 0) + else + { + close(tmpfd); + } + if ((tmpfd = open(proxy_name, O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR)) == -1) { - perror("blahpd error invoking grid-proxy-info"); + char * errmsg = make_message("Unable to read proxy file (%s):" + " errno=%d, %s", proxy_name, errno, strerror(errno)); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); - return(NULL); + if (!errmsg) return(NULL); + if (error_message) *error_message = errmsg; else if (errmsg) free(errmsg); + return NULL; } else { - seconds_left = atoi(exe_command.output); - 
cleanup_cmd(&exe_command); + close(tmpfd); + } + + seconds_left = grid_proxy_info( proxy_name ); + if ( seconds_left < 0 ) { + perror("blahpd error reading proxy lifetime"); + return NULL; } limit_command_output = make_message("%s_XXXXXX", limited_proxy_name); @@ -2621,18 +2856,9 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) get_lock_on_limited_proxy = config_test_boolean(config_get("blah_get_lock_on_limited_proxies",blah_config_handle)); - if (seconds_left <= 0) - { + if (seconds_left <= 0) { /* Something's wrong with the current proxy - use defaults */ - exe_command.command = make_message("%s/bin/grid-proxy-init -old -limited -cert %s -key %s -out %s", - globuslocation, proxy_name, proxy_name, limit_command_output); - } - else - { - hours_left = (int)(seconds_left/3600); - minutes_left = (int)((seconds_left%3600)/60) + 1; - exe_command.command = make_message("%s/bin/grid-proxy-init -old -limited -valid %d:%d -cert %s -key %s -out %s", - globuslocation, hours_left, minutes_left, proxy_name, proxy_name, limit_command_output); + seconds_left = 12*60*60; } if ((limit_command_output == limited_proxy_name) && @@ -2642,7 +2868,9 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) if (fpr == NULL) { fprintf(stderr, "blahpd limit_proxy: Cannot open %s in append mode to obtain file lock: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot open %s in append mode to obtain file lock: %s", limited_proxy_name, strerror(errno)); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } /* Acquire lock on limited proxy */ @@ -2654,13 +2882,14 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fclose(fpr); fprintf(stderr, "blahpd limit_proxy: Cannot obtain write file lock on %s: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot obtain write file lock on %s: %s", 
limited_proxy_name, strerror(errno)); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } } - res = execute_cmd(&exe_command); - free(exe_command.command); + res = grid_proxy_init( proxy_name, limit_command_output, seconds_left ); if ((limit_command_output == limited_proxy_name) && get_lock_on_limited_proxy) @@ -2677,25 +2906,6 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) return(NULL); } - /* If exitcode != 0 there may be a problem due to a warning by grid-proxy-init but */ - /* the call may have been successful. We just check the temporary proxy */ - if (exe_command.exit_code != 0) - { - cleanup_cmd(&exe_command); - exe_command.command = make_message("%s/bin/grid-proxy-info -f %s", globuslocation, limit_command_output); - res = execute_cmd(&exe_command); - free(exe_command.command); - if (res != 0 || exe_command.exit_code != 0) - { - if (limit_command_output != limited_proxy_name) - free(limit_command_output); - if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); - return(NULL); - } - } - - cleanup_cmd(&exe_command); - if (limit_command_output != limited_proxy_name) { if (get_lock_on_limited_proxy) @@ -2705,8 +2915,10 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fprintf(stderr, "blahpd limit_proxy: Cannot open %s in append mode to obtain file lock: %s\n", limited_proxy_name, strerror(errno)); unlink(limit_command_output); + char * errmsg = make_message("Cannot open %s in append mode to obtain file lock: %s", limited_proxy_name, strerror(errno)); free(limit_command_output); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else if (errmsg) free(errmsg); return(NULL); } /* Acquire lock on limited proxy */ @@ -2718,9 +2930,11 @@ limit_proxy(char* proxy_name, char *limited_proxy_name) { fclose(fpr); fprintf(stderr, 
"blahpd limit_proxy: Cannot obtain write file lock on %s: %s\n", limited_proxy_name, strerror(errno)); + char * errmsg = make_message("Cannot obtain write file lock on %s: %s", limited_proxy_name, strerror(errno)); unlink(limit_command_output); free(limit_command_output); if (limited_proxy_made_up_name != NULL) free(limited_proxy_made_up_name); + if (error_message && errmsg) *error_message= errmsg; else free(errmsg); return(NULL); } } @@ -3388,10 +3602,10 @@ char* outputfileRemaps(char *sb,char *sbrmp) #define SINGLE_QUOTE_CHAR '\'' #define DOUBLE_QUOTE_CHAR '\"' -#define CONVARG_OPENING "'\"" -#define CONVARG_OPENING_LEN 2 -#define CONVARG_CLOSING "\"'\000" -#define CONVARG_CLOSING_LEN 3 +#define CONVARG_OPENING "\"\\\"" +#define CONVARG_OPENING_LEN 3 +#define CONVARG_CLOSING "\\\"\"\000" +#define CONVARG_CLOSING_LEN 4 #define CONVARG_QUOTSEP "\\\"%c\\\"" #define CONVARG_QUOTSEP_LEN 5 #define CONVARG_DBLQUOTESC "\\\\\\\"" @@ -3457,6 +3671,11 @@ ConvertArgs(char* original, char separator) memcpy(result + j, CONVARG_DBLQUOTESC, CONVARG_DBLQUOTESC_LEN); j += CONVARG_DBLQUOTESC_LEN; } + else if ((original[i] == '(') || (original[i] == ')') || (original[i] == '&')) + { /* Must escape a few meta-characters for wordexp */ + result[j++] = '\\'; + result[j++] = original[i]; + } else { /* plain copy from the original */ result[j++] = original[i];