diff --git a/group_vars/maintenance.yml b/group_vars/maintenance.yml index 02efecf82..1b515df6e 100644 --- a/group_vars/maintenance.yml +++ b/group_vars/maintenance.yml @@ -232,6 +232,13 @@ telegraf_plugins_extra: - timeout = "10s" - data_format = "influx" - interval = "1m" + monitor_condor_util_split: + plugin: "exec" + config: + - commands = ["sudo /usr/bin/monitor-condor-utilisation-split"] + - timeout = "10s" + - data_format = "influx" + - interval = "1m" postgres_extra: plugin: "exec" config: diff --git a/roles/hxr.monitor-cluster/files/cluster_util-condor-split.sh b/roles/hxr.monitor-cluster/files/cluster_util-condor-split.sh index 358775433..e2bdd01a6 100755 --- a/roles/hxr.monitor-cluster/files/cluster_util-condor-split.sh +++ b/roles/hxr.monitor-cluster/files/cluster_util-condor-split.sh @@ -1,10 +1,18 @@ #!/bin/bash +# Details: For each GalaxyGroup we calculate the following to monitor the cluster usage; sums that come back empty (no matching slots) are emitted as 0 so the InfluxDB line stays parseable + for cluster in $(condor_status -autoformat GalaxyGroup | sort | grep -v undefined | uniq); do - mem_total=$(condor_status -autoformat TotalMemory -constraint 'GalaxyGroup == "'$cluster'" && Activity == "Idle"' | paste -s -d'+' | bc) - mem_remain=$(condor_status -autoformat Memory -constraint 'GalaxyGroup == "'$cluster'" && Activity == "Idle"' | paste -s -d'+' | bc) - cpu_total=$(condor_status -autoformat DetectedCpus -constraint 'GalaxyGroup == "'$cluster'" && Activity == "Idle"' | paste -s -d'+' | bc) - cpu_remain=$(condor_status -autoformat Cpus -constraint 'GalaxyGroup == "'$cluster'" && Activity == "Idle"' | paste -s -d'+' | bc) - mem_perc=$(echo "($mem_total - $mem_remain) / $mem_total" | bc -l) - cpu_perc=$(echo "($cpu_total - $cpu_remain) / $cpu_total" | bc -l) - echo "cluster.alloc,cluster=condor-sep,group=$cluster cores=0$cpu_perc,memory=0$mem_perc" + total_slots=$(condor_status -af Name -constraint 'GalaxyGroup == "'$cluster'" && (SlotType == "Partitionable" || SlotType == "Dynamic")' | wc -l) + claimed_slots=$(condor_status -af Name 
-constraint 'GalaxyGroup == "'$cluster'" && State == "Claimed"' | wc -l) + unclaimed_slots=$(condor_status -af Name -constraint 'GalaxyGroup == "'$cluster'" && State == "Unclaimed"' | wc -l) + total_cpus=$(condor_status -af DetectedCpus -constraint 'GalaxyGroup == "'$cluster'" && SlotType == "Partitionable"' | paste -s -d'+' | bc) + claimed_cpus=$(condor_status -af Cpus -constraint 'GalaxyGroup == "'$cluster'" && State == "Claimed"' | paste -s -d'+' | bc) + unclaimed_cpus=$(condor_status -af Cpus -constraint 'GalaxyGroup == "'$cluster'" && State == "Unclaimed"' | paste -s -d'+' | bc) + total_memory=$(condor_status -af TotalMemory -constraint 'GalaxyGroup == "'$cluster'" && SlotType == "Partitionable"' | paste -s -d'+' | bc) + claimed_memory=$(condor_status -af Memory -constraint 'GalaxyGroup == "'$cluster'" && State == "Claimed"' | paste -s -d'+' | bc) + unclaimed_memory=$(condor_status -af Memory -constraint 'GalaxyGroup == "'$cluster'" && State == "Unclaimed"' | paste -s -d'+' | bc) + total_gpu_slots=$(condor_status -af Name -constraint 'GalaxyGroup == "'$cluster'" && CUDADeviceName =!= undefined' | wc -l) + claimed_gpus=$(condor_status -af Name -constraint 'GalaxyGroup == "'$cluster'" && State == "Claimed" && CUDADeviceName =!= undefined' | wc -l) + unclaimed_gpus=$(condor_status -af Name -constraint 'GalaxyGroup == "'$cluster'" && State == "Unclaimed" && CUDADeviceName =!= undefined' | wc -l) + echo "htcondor_cluster_usage,classad='cluster',group=$cluster total_slots=$total_slots,claimed_slots=$claimed_slots,unclaimed_slots=$unclaimed_slots,total_cpus=${total_cpus:-0},claimed_cpus=${claimed_cpus:-0},unclaimed_cpus=${unclaimed_cpus:-0},total_memory=${total_memory:-0},claimed_memory=${claimed_memory:-0},unclaimed_memory=${unclaimed_memory:-0},total_gpu_slots=$total_gpu_slots,claimed_gpus=$claimed_gpus,unclaimed_gpus=$unclaimed_gpus" done diff --git a/roles/hxr.monitor-cluster/files/cluster_util-condor.sh b/roles/hxr.monitor-cluster/files/cluster_util-condor.sh index df96dfc12..b1a80cbf9 
100755 --- a/roles/hxr.monitor-cluster/files/cluster_util-condor.sh +++ b/roles/hxr.monitor-cluster/files/cluster_util-condor.sh @@ -1,25 +1,51 @@ #!/bin/bash -# mem_total=$(condor_status -autoformat TotalMemory | paste -s -d'+' | bc) -# mem_alloc=$(condor_status -autoformat Memory | paste -s -d'+' | bc) -# mem_perc=$(echo "$mem_alloc / $mem_total" | bc -l) -# cpu_total=$(condor_status -autoformat DetectedCpus | paste -s -d'+' | bc) -# cpu_alloc=$(condor_status -autoformat Cpus | paste -s -d'+' | bc) -# cpu_perc=$(echo "$cpu_alloc / $cpu_total" | bc -l) -# echo "cluster.alloc,cluster=condor cores=0$cpu_perc,memory=0$mem_perc" - -# As of 04.07.2023, the following is used to collect data from the cluster -# Details: -# SlotType: Dynamic or partitionable slots. Each host is partitioned to 1 slot and that slot is further dynamically partitioned to several slots -# Name: Name of the slot -# State: Claimed or Unclaimed slot -# Activity: Idle or Busy -# DetectedCpus: Total CPU cores available at machine level -# Cpus: Total CPU cores available at slot level -# TotalMemory: Total memory available at machine level -# Memory: Total memory available at slot level -# LoadAvg: Load avergate at slot level -# TotalLoadAvg: Total load average at the machine level -# GalaxyGroup: Group name of the machine - -# Command: -condor_status -af:l Name SlotType State Activity GalaxyGroup DetectedCpus Cpus TotalMemory Memory LoadAvg TotalLoadAvg -constraint 'SlotType == "Dynamic" || SlotType == "Partitionable"' | awk -F '[= ]+' '{printf("htcondor_cluster_usage,classad=\"slot\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=%s,%s=%s,%s=%s,%s=%s,%s=%s,%s=%s %s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=%s,%s=%s,%s=%s,%s=%s,%s=%s,%s=%s\n", $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)}' +# Details: This script is used to 
monitor the entire HTCondor cluster usage independent of GalaxyGroup + +# Total number of detected CPUs at the machine level +total_detected_cpus=$(condor_status -af DetectedCpus -constraint 'SlotType == "Partitionable"' | paste -s -d'+' | bc) + +# Claimed CPUs +claimed_cpus=$(condor_status -af Cpus -constraint 'State == "Claimed"' | paste -s -d'+' | bc) + +# Unclaimed CPUs +unclaimed_cpus=$(condor_status -af Cpus -constraint 'State == "Unclaimed"' | paste -s -d'+' | bc) + +# Total memory at the machine level +total_memory=$(condor_status -af TotalMemory -constraint 'SlotType == "Partitionable"' | paste -s -d'+' | bc) + +# Claimed memory +claimed_memory=$(condor_status -af Memory -constraint 'State == "Claimed"' | paste -s -d'+' | bc) + +# Unclaimed memory +unclaimed_memory=$(condor_status -af Memory -constraint 'State == "Unclaimed"' | paste -s -d'+' | bc) + +# Total number of GPU slots +total_gpu_slots=$(condor_status -af Name -constraint 'CUDADeviceName =!= undefined' | wc -l) + +# Claimed GPU slots +claimed_gpus=$(condor_status -af Name -constraint 'State == "Claimed" && CUDADeviceName =!= undefined' | wc -l) + +# Unclaimed GPU slots +unclaimed_gpus=$(condor_status -af Name -constraint 'State == "Unclaimed" && CUDADeviceName =!= undefined' | wc -l) + +# Total load average at the machine level +total_loadavg=$(condor_status -af TotalLoadAvg -constraint 'SlotType == "Partitionable"' | paste -s -d'+' | bc) + +# Claimed load average +claimed_loadavg=$(condor_status -af LoadAvg -constraint 'State == "Claimed"' | paste -s -d'+' | bc) + +# Unclaimed load average +unclaimed_loadavg=$(condor_status -af LoadAvg -constraint 'State == "Unclaimed"' | paste -s -d'+' | bc) + +# Total number of slots +total_slots=$(condor_status -af Name -constraint 'SlotType == "Partitionable" || SlotType == "Dynamic" ' | wc -l) + +# Total number of Claimed slots with Activity Busy +claimed_busy_slots=$(condor_status -af Name -constraint 'State == "Claimed" && Activity == "Busy"' | wc -l) 
+ +# Total number of Unclaimed slots with Activity Idle +unclaimed_idle_slots=$(condor_status -af Name -constraint 'State == "Unclaimed" && Activity == "Idle"' | wc -l) + +# Output in influxdb protocol format; sums that come back empty (no matching slots) default to 0 so the line stays valid +echo "htcondor_cluster_usage,classad='machine' total_detected_cpus=${total_detected_cpus:-0},claimed_cpus=${claimed_cpus:-0},unclaimed_cpus=${unclaimed_cpus:-0},total_memory=${total_memory:-0},claimed_memory=${claimed_memory:-0},unclaimed_memory=${unclaimed_memory:-0},total_loadavg=${total_loadavg:-0},claimed_loadavg=${claimed_loadavg:-0},unclaimed_loadavg=${unclaimed_loadavg:-0},total_slots=$total_slots,claimed_busy_slots=$claimed_busy_slots,unclaimed_idle_slots=$unclaimed_idle_slots,total_gpu_slots=$total_gpu_slots,claimed_gpus=$claimed_gpus,unclaimed_gpus=$unclaimed_gpus" + diff --git a/roles/hxr.monitor-cluster/tasks/condor.yml b/roles/hxr.monitor-cluster/tasks/condor.yml index e4df9037c..996d0bcb6 100644 --- a/roles/hxr.monitor-cluster/tasks/condor.yml +++ b/roles/hxr.monitor-cluster/tasks/condor.yml @@ -6,7 +6,6 @@ owner: root group: root mode: 0755 - when: monitor_condor_split_util - name: Allow telegraf to run monitor-condor-utilisation-split lineinfile: @@ -15,7 +14,6 @@ insertafter: EOF line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-utilisation-split' validate: 'visudo -cf %s' - when: monitor_condor_split_util - name: "Send condor cluster utilisation monitor" copy: @@ -48,43 +46,3 @@ insertafter: EOF line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-queue' validate: 'visudo -cf %s' - -#- set_fact: - #monitor_condor_queue: - #- plugin: exec - #disambiguation: monitor_condor_queue - #config: - #- commands = ["/usr/bin/monitor-condor-queue"] - #- timeout = "5s" - #- data_format = "influx" - #- interval = "10s" - -#- set_fact: - #telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_queue }}" - -#- set_fact: - #monitor_condor_util: - #- plugin: exec - #disambiguation: monitor_condor_util - #config: - #- commands = ["/usr/bin/monitor-condor-utilisation"] - #- 
timeout = "5s" - #- data_format = "influx" - #- interval = "10s" - -#- set_fact: - #telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_util }}" - -#- set_fact: - #monitor_condor_util_split: - #- plugin: exec - #disambiguation: monitor_condor_util_split - #config: - #- commands = ["/usr/bin/monitor-condor-utilisation-split"] - #- timeout = "5s" - #- data_format = "influx" - #- interval = "10s" - -#- set_fact: - #telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_util_split }}" - #when: "{{ monitor_condor_split_util }}"