diff --git a/lib/workflowmgr/slurmbatchsystem.rb b/lib/workflowmgr/slurmbatchsystem.rb index 7005247..de1853e 100644 --- a/lib/workflowmgr/slurmbatchsystem.rb +++ b/lib/workflowmgr/slurmbatchsystem.rb @@ -289,6 +289,10 @@ def submit(task) queued_jobs="" errors="" exit_status=0 + + # Wait a few seconds for information to propagate before checking whether the job was submitted anyway + sleep(5) + begin # Get the username of this process @@ -316,9 +320,9 @@ def submit(task) # Look for a job that matches the randomID we inserted into the comment queued_jobs.split("\n").each { |job| - # Skip headers - next if job=~/CLUSTER/ - next if job=~/JOBID/ + # Skip headings + next if job[0..4] == 'JOBID' + next if job[0..7] == 'CLUSTER:' # Extract job id jobid=job[0..39].strip @@ -331,6 +335,10 @@ def submit(task) end } + WorkflowMgr.stderr("WARNING: Unable to retrieve jobid after sbatch failed with socket time out when submitting #{task.attributes[:name]}",1) + + return nil,output + else WorkflowMgr.stderr("WARNING: job submission failed: #{output}", 1) return nil,output