diff --git a/lib/workflowmgr/lsfbatchsystem.rb b/lib/workflowmgr/lsfbatchsystem.rb index 69b229b..94cfca6 100644 --- a/lib/workflowmgr/lsfbatchsystem.rb +++ b/lib/workflowmgr/lsfbatchsystem.rb @@ -322,7 +322,7 @@ def refresh_jobqueue rescue Timeout::Error,WorkflowMgr::SchedulerDown WorkflowMgr.log("#{$!}") - WorkflowMgr.stderr("#{$!}",3) + WorkflowMgr.stderr("error running bjobs: #{$!}",3) raise WorkflowMgr::SchedulerDown end @@ -354,6 +354,7 @@ def refresh_jobqueue record[:state]="RUNNING" else record[:state]="UNKNOWN" + next end record[:queue]=jobattributes[3] record[:jobname]=jobattributes[6] @@ -426,8 +427,10 @@ def run_bhist_bjobs(nacctfiles=1,bjobs=true) exit_status=0 timeout=nacctfiles==1 ? 30 : 90 if(bjobs) then + WorkflowMgr.stderr("bjobs -l -a ",10) completed_jobs,errors,exit_status=WorkflowMgr.run4("bjobs -l -a",timeout) else + WorkflowMgr.stderr("bhist -n #{nacctfiles} -l -d -w ",10) completed_jobs,errors,exit_status=WorkflowMgr.run4("bhist -n #{nacctfiles} -l -d -w",timeout) end @@ -444,8 +447,8 @@ def run_bhist_bjobs(nacctfiles=1,bjobs=true) end rescue Timeout::Error,WorkflowMgr::SchedulerDown - WorkflowMgr.log("#{$!}") - WorkflowMgr.stderr("#{$!}",3) + WorkflowMgr.log("Error running bhist or bjobs: #{$!}") + WorkflowMgr.stderr("Error running bhist or bjobs: #{$!}",3) raise WorkflowMgr::SchedulerDown end # Build job records from output of bhist @@ -458,7 +461,7 @@ def run_bhist_bjobs(nacctfiles=1,bjobs=true) recordstring.gsub!(/\n\s{3,}/,'') recordstring.split(/\n+/).each { |event| case event.strip - when /^Job <(\d+)>,( Job Name <([^>]+)>,)* User <([^>]+)>,/ + when /^Job <(\d+)>, *(Job Name <([^>]+)>,)? *User <([^>]+)>,/ record[:jobid]=$1 record[:jobname]=$3 record[:user]=$4 @@ -565,7 +568,9 @@ def run_bhist_bjobs(nacctfiles=1,bjobs=true) } if !jobacct.has_key?(record[:jobid]) - jobacct[record[:jobid]]=record + if record.has_key?(:state) and record[:state]!='UNKNOWN' + jobacct[record[:jobid]]=record + end end }