Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process condor util queries locally #839

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions group_vars/maintenance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,13 @@ telegraf_plugins_extra:
- timeout = "10s"
- data_format = "influx"
- interval = "1m"
# Telegraf "exec" input that runs /usr/bin/monitor-condor-utilisation-split
# through sudo and reads what the command prints in the "influx" data format.
# NOTE(review): the sudoers rule that lets telegraf run this binary is managed
# by the hxr.monitor-cluster role; confirm that the role is applied to every
# host that loads this variable.
monitor_condor_util_split:
plugin: "exec"
config:
- commands = ["sudo /usr/bin/monitor-condor-utilisation-split"]
- timeout = "10s"
- data_format = "influx"
# The command is scheduled once per minute.
- interval = "1m"
postgres_extra:
plugin: "exec"
config:
Expand Down
22 changes: 15 additions & 7 deletions roles/hxr.monitor-cluster/files/cluster_util-condor-split.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
#!/bin/bash
# Details: For each GalaxyGroup we calculate the following to monitor the cluster usage

# Sum a numeric ClassAd attribute ($1) over the slots matching constraint ($2).
# Prints 0 when the query returns no rows: bc prints nothing for an empty
# expression, and an empty field value ("claimed_cpus=,") is not valid influx
# line protocol, so the whole point would be rejected.
sum_attr() {
    local total
    total=$(condor_status -af "$1" -constraint "$2" | paste -s -d'+' | bc)
    echo "${total:-0}"
}

# Count the slots matching constraint ($1).
count_slots() {
    condor_status -af Name -constraint "$1" | wc -l
}

# Emit, for every GalaxyGroup advertised in the pool, one cluster.alloc line
# and one htcondor_cluster_usage line in influx line protocol.
for cluster in $(condor_status -autoformat GalaxyGroup | sort | grep -v undefined | uniq); do
    # Constraints are built once per group so that $cluster is always expanded
    # inside quotes (no word splitting or globbing of the group name).
    in_group='GalaxyGroup == "'"$cluster"'"'
    idle="$in_group"' && Activity == "Idle"'
    claimed="$in_group"' && State == "Claimed"'
    unclaimed="$in_group"' && State == "Unclaimed"'
    partitionable="$in_group"' && SlotType == "Partitionable"'
    any_slot="$in_group"' && (SlotType == "Partitionable" || SlotType == "Dynamic")'
    has_gpu='CUDADeviceName =!= undefined'

    # cluster.alloc: fraction of cores/memory in use, computed over the
    # group's Idle slots only. The literal "0" in front of each value supplies
    # the leading zero that bc omits (".25" -> "0.25") and yields a plain 0
    # when bc produced no result at all.
    mem_total=$(condor_status -autoformat TotalMemory -constraint "$idle" | paste -s -d'+' | bc)
    mem_remain=$(condor_status -autoformat Memory -constraint "$idle" | paste -s -d'+' | bc)
    cpu_total=$(condor_status -autoformat DetectedCpus -constraint "$idle" | paste -s -d'+' | bc)
    cpu_remain=$(condor_status -autoformat Cpus -constraint "$idle" | paste -s -d'+' | bc)
    mem_perc=$(echo "($mem_total - $mem_remain) / $mem_total" | bc -l)
    cpu_perc=$(echo "($cpu_total - $cpu_remain) / $cpu_total" | bc -l)
    echo "cluster.alloc,cluster=condor-sep,group=$cluster cores=0$cpu_perc,memory=0$mem_perc"

    # htcondor_cluster_usage: absolute slot/CPU/memory/GPU counts per group.
    # Totals are read from Partitionable slots (DetectedCpus / TotalMemory);
    # claimed and unclaimed figures are summed over slots in that State.
    total_slots=$(count_slots "$any_slot")
    claimed_slots=$(count_slots "$claimed")
    unclaimed_slots=$(count_slots "$unclaimed")
    total_cpus=$(sum_attr DetectedCpus "$partitionable")
    claimed_cpus=$(sum_attr Cpus "$claimed")
    unclaimed_cpus=$(sum_attr Cpus "$unclaimed")
    total_memory=$(sum_attr TotalMemory "$partitionable")
    claimed_memory=$(sum_attr Memory "$claimed")
    unclaimed_memory=$(sum_attr Memory "$unclaimed")
    # A slot counts as a GPU slot when it advertises CUDADeviceName.
    total_gpu_slots=$(count_slots "$in_group && $has_gpu")
    claimed_gpus=$(count_slots "$claimed && $has_gpu")
    unclaimed_gpus=$(count_slots "$unclaimed && $has_gpu")
    echo "htcondor_cluster_usage,classad='cluster',group=$cluster total_slots=$total_slots,claimed_slots=$claimed_slots,unclaimed_slots=$unclaimed_slots,total_cpus=$total_cpus,claimed_cpus=$claimed_cpus,unclaimed_cpus=$unclaimed_cpus,total_memory=$total_memory,claimed_memory=$claimed_memory,unclaimed_memory=$unclaimed_memory,total_gpu_slots=$total_gpu_slots,claimed_gpus=$claimed_gpus,unclaimed_gpus=$unclaimed_gpus"
done
74 changes: 50 additions & 24 deletions roles/hxr.monitor-cluster/files/cluster_util-condor.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,51 @@
#!/bin/bash
# mem_total=$(condor_status -autoformat TotalMemory | paste -s -d'+' | bc)
# mem_alloc=$(condor_status -autoformat Memory | paste -s -d'+' | bc)
# mem_perc=$(echo "$mem_alloc / $mem_total" | bc -l)
# cpu_total=$(condor_status -autoformat DetectedCpus | paste -s -d'+' | bc)
# cpu_alloc=$(condor_status -autoformat Cpus | paste -s -d'+' | bc)
# cpu_perc=$(echo "$cpu_alloc / $cpu_total" | bc -l)
# echo "cluster.alloc,cluster=condor cores=0$cpu_perc,memory=0$mem_perc"

# As of 04.07.2023, the following is used to collect data from the cluster
# Details:
# SlotType: Dynamic or partitionable slots. Each host is partitioned to 1 slot and that slot is further dynamically partitioned to several slots
# Name: Name of the slot
# State: Claimed or Unclaimed slot
# Activity: Idle or Busy
# DetectedCpus: Total CPU cores available at machine level
# Cpus: Total CPU cores available at slot level
# TotalMemory: Total memory available at machine level
# Memory: Total memory available at slot level
# LoadAvg: Load average at slot level
# TotalLoadAvg: Total load average at the machine level
# GalaxyGroup: Group name of the machine

# Command:
# Query every Dynamic or Partitionable slot and turn each output row into one
# influx line for the measurement htcondor_cluster_usage (classad="slot").
# awk splits a row on runs of '=' and spaces and treats $1..$22 as eleven
# alternating name/value pairs, in the attribute order given to condor_status.
# The pairs are printed twice: first as the tag set, then (after the single
# space) as the field set. In both copies the values of the first five pairs
# are wrapped in double quotes and the last six are printed bare.
# NOTE(review): this relies on -af:l printing "Attr = value" for every
# attribute and on no value containing '=' or a space - confirm.
condor_status -af:l Name SlotType State Activity GalaxyGroup DetectedCpus Cpus TotalMemory Memory LoadAvg TotalLoadAvg -constraint 'SlotType == "Dynamic" || SlotType == "Partitionable"' | awk -F '[= ]+' '{printf("htcondor_cluster_usage,classad=\"slot\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=%s,%s=%s,%s=%s,%s=%s,%s=%s,%s=%s %s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=\"%s\",%s=%s,%s=%s,%s=%s,%s=%s,%s=%s,%s=%s\n", $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22)}'
# Details: This script monitors the usage of the entire HTCondor cluster, independent of GalaxyGroup.

# Sum a numeric ClassAd attribute ($1) over the slots matching constraint ($2).
# Prints 0 when the query returns no rows: bc prints nothing for an empty
# expression, and an empty field value ("claimed_cpus=,") is not valid influx
# line protocol, so the whole point would be rejected.
sum_attr() {
    local total
    total=$(condor_status -af "$1" -constraint "$2" | paste -s -d'+' | bc)
    echo "${total:-0}"
}

# Count the slots matching constraint ($1).
count_slots() {
    condor_status -af Name -constraint "$1" | wc -l
}

# Constraints shared by the queries below.
partitionable='SlotType == "Partitionable"'
claimed='State == "Claimed"'
unclaimed='State == "Unclaimed"'
# A slot counts as a GPU slot when it advertises CUDADeviceName.
has_gpu='CUDADeviceName =!= undefined'

# Total number of detected CPUs at the machine level
total_detected_cpus=$(sum_attr DetectedCpus "$partitionable")

# Claimed CPUs
claimed_cpus=$(sum_attr Cpus "$claimed")

# Unclaimed CPUs
unclaimed_cpus=$(sum_attr Cpus "$unclaimed")

# Total memory at the machine level
total_memory=$(sum_attr TotalMemory "$partitionable")

# Claimed memory
claimed_memory=$(sum_attr Memory "$claimed")

# Unclaimed memory
unclaimed_memory=$(sum_attr Memory "$unclaimed")

# Total number of GPU slots
total_gpu_slots=$(count_slots "$has_gpu")

# Claimed GPU slots
claimed_gpus=$(count_slots "$claimed && $has_gpu")

# Unclaimed GPU slots
unclaimed_gpus=$(count_slots "$unclaimed && $has_gpu")

# Total load average at the machine level
total_loadavg=$(sum_attr TotalLoadAvg "$partitionable")

# Claimed load average
claimed_loadavg=$(sum_attr LoadAvg "$claimed")

# Unclaimed load average
unclaimed_loadavg=$(sum_attr LoadAvg "$unclaimed")

# Total number of slots
total_slots=$(count_slots 'SlotType == "Partitionable" || SlotType == "Dynamic" ')

# Total number of Claimed slots with Activity Busy
claimed_busy_slots=$(count_slots "$claimed"' && Activity == "Busy"')

# Total number of Unclaimed slots with Activity Idle
unclaimed_idle_slots=$(count_slots "$unclaimed"' && Activity == "Idle"')

# Output in influxdb protocol format
echo "htcondor_cluster_usage,classad='machine' total_detected_cpus=$total_detected_cpus,claimed_cpus=$claimed_cpus,unclaimed_cpus=$unclaimed_cpus,total_memory=$total_memory,claimed_memory=$claimed_memory,unclaimed_memory=$unclaimed_memory,total_loadavg=$total_loadavg,claimed_loadavg=$claimed_loadavg,unclaimed_loadavg=$unclaimed_loadavg,total_slots=$total_slots,claimed_busy_slots=$claimed_busy_slots,unclaimed_idle_slots=$unclaimed_idle_slots,total_gpu_slots=$total_gpu_slots,claimed_gpus=$claimed_gpus,unclaimed_gpus=$unclaimed_gpus"

42 changes: 0 additions & 42 deletions roles/hxr.monitor-cluster/tasks/condor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
owner: root
group: root
mode: 0755
when: monitor_condor_split_util

- name: Allow telegraf to run monitor-condor-utilisation-split
lineinfile:
Expand All @@ -15,7 +14,6 @@
insertafter: EOF
line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-utilisation-split'
validate: 'visudo -cf %s'
when: monitor_condor_split_util

- name: "Send condor cluster utilisation monitor"
copy:
Expand Down Expand Up @@ -48,43 +46,3 @@
insertafter: EOF
line: 'telegraf ALL=(ALL) NOPASSWD: /usr/bin/monitor-condor-queue'
validate: 'visudo -cf %s'

#- set_fact:
#monitor_condor_queue:
#- plugin: exec
#disambiguation: monitor_condor_queue
#config:
#- commands = ["/usr/bin/monitor-condor-queue"]
#- timeout = "5s"
#- data_format = "influx"
#- interval = "10s"

#- set_fact:
#telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_queue }}"

#- set_fact:
#monitor_condor_util:
#- plugin: exec
#disambiguation: monitor_condor_util
#config:
#- commands = ["/usr/bin/monitor-condor-utilisation"]
#- timeout = "5s"
#- data_format = "influx"
#- interval = "10s"

#- set_fact:
#telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_util }}"

#- set_fact:
#monitor_condor_util_split:
#- plugin: exec
#disambiguation: monitor_condor_util_split
#config:
#- commands = ["/usr/bin/monitor-condor-utilisation-split"]
#- timeout = "5s"
#- data_format = "influx"
#- interval = "10s"

#- set_fact:
#telegraf_plugins_extra: "{{ telegraf_plugins_extra + monitor_condor_util_split }}"
#when: "{{ monitor_condor_split_util }}"
Loading