Merge pull request #587 from nf-core/sanger_multifarm
Updated the Sanger config to list more queues and cover multiple internal clusters
muffato authored Nov 10, 2023
2 parents f1d5daa + 555b99c commit c3c9060
Showing 1 changed file with 59 additions and 11 deletions: conf/sanger.config
@@ -1,14 +1,50 @@
+// Extract the name of the cluster to tune the parameters below
+def clustername = "farm5"
+try {
+    clustername = ['/bin/bash', '-c', 'lsid | awk \'$0 ~ /^My cluster name is/ {print $5}\''].execute().text.trim()
+} catch (java.io.IOException e) {
+    System.err.println("WARNING: Could not run lsid to determine current cluster, defaulting to farm")
+}
+
 // Profile details
 params {
-    config_profile_description = 'The Wellcome Sanger Institute HPC cluster (farm5) profile'
+    config_profile_description = "The Wellcome Sanger Institute HPC cluster (${clustername}) profile"
     config_profile_contact = 'Priyanka Surana (@priyanka-surana)'
     config_profile_url = 'https://www.sanger.ac.uk'
 }
 
-// Queue and retry strategy
-process{
+// Queue and LSF submission options
+process {
     executor = 'lsf'
-    queue = { task.time < 20.m ? 'small' : task.time < 12.h ? 'normal' : task.time < 48.h ? 'long' : task.time < 168.h ? 'week' : 'basement' }
+
+    // Currently a single set of rules for all clusters, but we could use $clustername to apply
+    // different rules to different clusters.
+    queue = {
+        if ( task.time >= 15.day ) {
+            if ( task.memory > 680.GB ) {
+                error "There is no queue for jobs that need >680 GB and >15 days"
+            } else {
+                "basement"
+            }
+        } else if ( task.memory > 720.GB ) {
+            "teramem"
+        } else if ( task.memory > 350.GB ) {
+            "hugemem"
+        } else if ( task.time > 7.day ) {
+            "basement"
+        } else if ( task.time > 2.day ) {
+            "week"
+        } else if ( task.time > 12.hour ) {
+            "long"
+        } else if ( task.time > 1.min ) {
+            "normal"
+        } else {
+            "small"
+        }
+    }
 
     withLabel: gpu {
         clusterOptions = { "-M "+task.memory.toMega()+" -R 'select[ngpus>0 && mem>="+task.memory.toMega()+"] rusage[ngpus_physical=1.00,mem="+task.memory.toMega()+"] span[ptile=1]' -gpu 'mode=exclusive_process'" }
         queue = { task.time > 12.h ? 'gpu-huge' : task.time > 48.h ? 'gpu-basement' : 'gpu-normal' }
@@ -19,21 +55,33 @@ process{
     }
 }
 
 
 // Executor details
-executor{
+executor {
     name = 'lsf'
     perJobMemLimit = true
     poolSize = 4
     submitRateLimit = '5 sec'
     killBatchSize = 50
 }
 
 
 // Max resources
-params {
-    max_memory = 683.GB
-    max_cpus = 256
-    max_time = 720.h
+if (clustername.startsWith("tol")) {
+    // tol cluster
+    params.max_memory = 1.4.TB
+    params.max_cpus = 64
+    params.max_time = 89280.min // 62 days
+    // As opposed to the farm settings below, we don't mount any filesystem by default.
+    // Pipelines that need to see certain filesystems have to set singularity.runOptions themselves
+
+} else {
+    // defaults for the main farm
+    params.max_memory = 2.9.TB
+    params.max_cpus = 256
+    params.max_time = 43200.min // 30 days
+
+    // Mount all filesystems by default
+    singularity.runOptions = '--bind /lustre --bind /nfs --bind /data --bind /software'
 }
-
-// For singularity
-singularity.runOptions = '--bind /lustre --bind /nfs --bind /software'
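
The cluster probe added at the top of the file shells out to LSF's lsid, whose output (as the awk pattern expects) includes a line of the form "My cluster name is <name>"; the awk filter prints the fifth whitespace-separated field of that line. A minimal standalone Groovy sketch of the same probe, assuming lsid is on the PATH; the printed summary line is illustrative, not part of the config:

// Standalone sketch of the cluster probe used in conf/sanger.config.
def clustername = "farm5"   // fallback when lsid cannot be run
try {
    clustername = ['/bin/bash', '-c',
            'lsid | awk \'$0 ~ /^My cluster name is/ {print $5}\''].execute().text.trim()
} catch (java.io.IOException e) {
    System.err.println("WARNING: could not run lsid, keeping the default")
}
// "tol*" clusters get the tol limits; anything else gets the farm defaults.
println clustername.startsWith("tol") ? "tol limits apply" : "farm defaults apply"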

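The new queue closure checks the 15-day cap first, then memory (teramem, hugemem), then walltime bands. A minimal sketch of the same rules as a plain Groovy function, using bare numbers (GB for memory, hours for time) instead of Nextflow's MemoryUnit and Duration objects; the name pickQueue, the exception, and the sample jobs are illustrative, not part of the config:

// Same routing rules as the queue closure above, on plain numbers.
String pickQueue(double memGB, double hours) {
    if (hours >= 15 * 24) {
        // stands in for Nextflow's error(): no queue fits such a job
        if (memGB > 680) throw new IllegalArgumentException("No queue for >680 GB and >15 days")
        return "basement"
    }
    if (memGB > 720)     return "teramem"
    if (memGB > 350)     return "hugemem"
    if (hours > 7 * 24)  return "basement"
    if (hours > 2 * 24)  return "week"
    if (hours > 12)      return "long"
    if (hours > 1 / 60d) return "normal"
    return "small"
}

assert pickQueue(8, 1)       == "normal"    // a typical job
assert pickQueue(400, 24)    == "hugemem"   // large memory, short walltime
assert pickQueue(100, 10*24) == "basement"  // over 7 days
assert pickQueue(0.5, 0.005) == "small"     // sub-minute job

Note the asymmetry around the 15-day boundary: a 700 GB job just under 15 days still fits hugemem (the teramem cutoff is 720 GB), while the same job at 15 days or more is rejected, since the error message implies basement caps at 680 GB.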
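The gpu label's clusterOptions builds one long LSF option string by concatenation, which is hard to read in the diff. A sketch of what it renders for a hypothetical 16 GB request (task.memory.toMega() == 16384):

// Rendered LSF options for a 16 GB gpu-label job, mirroring the concatenation above.
def mega = 16384  // what task.memory.toMega() would return for 16.GB
def opts = "-M " + mega +
        " -R 'select[ngpus>0 && mem>=" + mega + "] rusage[ngpus_physical=1.00,mem=" + mega + "] span[ptile=1]'" +
        " -gpu 'mode=exclusive_process'"
println opts
// -M 16384 -R 'select[ngpus>0 && mem>=16384] rusage[ngpus_physical=1.00,mem=16384] span[ptile=1]' -gpu 'mode=exclusive_process'

This asks LSF for a host with a GPU and enough memory, reserves one physical GPU in exclusive-process mode, and keeps all slots on a single host (span[ptile=1]).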