-
Notifications
You must be signed in to change notification settings - Fork 0
/
vd_progress_check_sge.sh
executable file
·71 lines (57 loc) · 2.23 KB
/
vd_progress_check_sge.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/bin/bash
# DV-4
# Progress monitor for an SGE queue. This first part reads the arguments,
# takes the initial mismatch count and writes the run banner to the log; the
# polling loop below uses the variables defined here.
#
# Usage: $0 [QUEUE] [SAFE_RATE] [DONE_RATE] [NF_CHANNEL_FILE_DIR]
#
# QUEUE: SGE queue handed to vd_find_mismatch_sge.sh (default: short.q)
#
# SAFE_RATE: when the progress is BELOW SAFE_RATE, we will disable "slacker" only.
# when the progress is ABOVE SAFE_RATE, we will disable "resister" nodes
# (default: 0.50)
#
# DONE_RATE: indicates completion. Normally it's 1 (100%)
#
# NF_CHANNEL_FILE_DIR: directory in which the signal file used by the nextflow
# channel is generated (default: /tmp)
#
# IMPORTANT: We do not use "exec >> /tmp/log" here like other scripts
# because nextflow needs to write to the same file in the processes.
LOG="/tmp/vd_progress_check_sge.log"
q="${1:-short.q}"
safe="${2:-0.50}"
done="${3:-1}"
nf_probe_dir="${4:-/tmp}"
nf_probe_file="$nf_probe_dir"/nf_probe_progress_state

# The number of mismatch nodes found the first time is also the total number
# for the entire process. The count is taken from the helper's exit status.
# NOTE(review): an exit status cannot exceed 255, and a failure of the helper
# itself (e.g. 127 when it is not on PATH) is indistinguishable from a node
# count here -- confirm the helper's contract.
vd_find_mismatch_sge.sh "$q"
count_mismatch=$?
count_all=$count_mismatch

# Polling intervals (sleep(1) syntax). The BAD interval grows with the number
# of nodes: SSH_TIMEOUT seconds per node.
SSH_TIMEOUT=5
BAD_INTVL="$(( SSH_TIMEOUT * count_all ))s"
SAFE_INTVL="120m"

# Run banner. Values are passed as printf arguments, never inside the format
# string, so a '%' or backslash in an argument cannot corrupt the output.
# The group is opened in append mode only for the duration of the banner.
{
  printf '#################### Started: %s ####################\n' "$(date "+%Y.%m.%d-%H.%M.%S")"
  printf '## sgeQueue : %s\n' "$q"
  printf '## bad : %% of completed nodes < %s (interval %s)\n' "$safe" "$BAD_INTVL"
  printf '## safe : %s < %% of completed nodes < %s (interval %s)\n' "$safe" "$done" "$SAFE_INTVL"
  printf '## done : %% of completed nodes = %s\n' "$done"
  printf '######################################################################\n'
} >> "$LOG"
# Polling loop: recompute the completion ratio, publish the state, sleep,
# re-probe the queue. Ends with exit 0 once the ratio reaches $done.
while :; do
  current_time="$(date "+%Y.%m.%d-%H.%M.%S")"
  count_completed="$(( count_all - count_mismatch ))"

  # With no mismatch nodes at start-up there is nothing to wait for; skip the
  # division (bc would fail with "divide by zero") and report full completion.
  if (( count_all > 0 )); then
    current="$(echo "scale=2; $count_completed/$count_all" | bc)"
  else
    current="1.00"
  fi

  # bc prints 1 for a true comparison and 0 for a false one, which (( ))
  # turns into the matching exit status.
  if (( count_all > 0 )) && (( $(echo "$current < $done" | bc -l) )); then
    if (( $(echo "$current > $safe" | bc -l) )); then
      state="SAFE"
      sleep_time="$SAFE_INTVL"
    else
      state="BAD"
      sleep_time="$BAD_INTVL"
    fi
    # Values go in as arguments so they are never interpreted as a format.
    printf 'Progress: %s [ %s | All: %s | Completed: %s | Ratio: %s ]\n' \
      "$state" "$current_time" "$count_all" "$count_completed" "$current" >> "$LOG"
    # Signal file read by the nextflow channel: holds the latest state only.
    printf '%s\n' "$state" > "$nf_probe_file"
    sleep "$sleep_time"
  else
    printf 'Progress: DONE [ %s | All: %s | Completed: %s | Ratio: %s ]\n\n\n' \
      "$current_time" "$count_all" "$count_completed" "$current" >> "$LOG"
    exit 0
  fi

  vd_find_mismatch_sge.sh "$q"
  count_mismatch=$? # vd_find_mismatch_sge.sh rc number of nodes or 0
done