#!/bin/bash
#SBATCH -p mlperf # partition
#SBATCH -N 1 # number of nodes
#SBATCH -t 12:00:00 # wall time
#SBATCH -J single_stage_detector # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --ntasks-per-node=8 # n tasks per machine (one task per gpu)
#SBATCH --threads-per-core=2 # HT is on
#SBATCH --cores-per-socket=20 # 20 cores on each socket
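# Example submission (illustrative only; every variable shown here has a default below and can be
# overridden from the environment, which sbatch exports to the job by default):
#   DATADIR=/raid/data/coco-2017 LOGDIR=/raid/results/single_stage_detector \
#     CONT=mlperf-nvidia:single_stage_detector DGXSYSTEM=DGX1 sbatch run.sub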
DATESTAMP=${DATESTAMP:-`date +'%y%m%d%H%M%S%N'`}
## Data, container and volumes
BENCHMARK=${BENCHMARK:-"single_stage_detector"}
BENCHMARK_NAME="ssd"
CONT=${CONT:-"mlperf-nvidia:$BENCHMARK"}
DATADIR=${DATADIR:-"/raid/data/coco-2017"} # there should be ./coco2017 and ./torchvision dirs in here
LOGDIR=${LOGDIR:-"/raid/results/$BENCHMARK"}
NEXP=${NEXP:-5} # Default number of times to run the benchmark
## DO NOT CHANGE ANYTHING BELOW -- DL params are in run_and_time.sh and config_<system>.sh files
## Load system-specific parameters for benchmark
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
if [[ ! -f "config_${DGXSYSTEM}.sh" ]]; then
echo "Unknown system, assuming DGX1"
DGXSYSTEM="DGX1"
fi
source config_${DGXSYSTEM}.sh
echo "config_${DGXSYSTEM}.sh"
echo ${EXTRA_PARAMS[@]}
cat config_${DGXSYSTEM}.sh
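# config_${DGXSYSTEM}.sh is expected to define at least the variables referenced below, e.g.
# (illustrative values only): DGXNGPU=8, DGXIBDEVICES='', EXTRA_PARAMS=()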
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
INSLURM=0
export SLURM_JOB_ID="${DATESTAMP}"
export SLURM_NNODES=1
fi
if [[ -z "${SLURM_JOB_ID}" || $SLURM_NNODES -eq 1 ]]; then
# don't need IB if not multi-node
export IBDEVICES=""
fi
# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -p $(dirname "${LOGFILE_BASE}")
## Docker params
CONTVOLS="-v $DATADIR:/data -v $LOGDIR:/results"
NV_GPU="${NVIDIA_VISIBLE_DEVICES:-$(seq 0 $((${SLURM_NTASKS_PER_NODE:-${DGXNGPU}}-1)) | tr '\n' ',' | sed 's/,$//')}"
DOCKEREXEC="env NV_GPU=${NV_GPU} nvidia-docker run --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --name=cont_${SLURM_JOB_ID} --security-opt seccomp=unconfined $IBDEVICES"
## Prep run and launch
MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
PORT=$((4242 + RANDOM%1000))
SSH=''
SRUN=''
if [[ $INSLURM -eq 1 ]]; then
  hosts=( `scontrol show hostname |tr "\n" " "` )
  SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
  SRUN='srun -N 1 -n 1 -w $hostn'
else
  hosts=( `hostname` )
fi
# Test the base container launch
pids=();
for hostn in ${hosts[@]}; do
  timeout -k 600s 600s \
    $(eval echo $SRUN) $DOCKEREXEC $CONT python -c 'import torch; print("Found",torch.cuda.device_count(),"CUDA GPUs")' &
  pids+=($!);
done
wait "${pids[@]}"
success=$? ; if [ $success -ne 0 ]; then echo "ERR: Base container launch failed."; exit $success ; fi
# Launch containers
pids=(); rets=()
for hostn in ${hosts[@]}; do
  $(eval echo $SSH) $DOCKEREXEC $CONTVOLS $CONT sleep infinity &
  pids+=($!); rets+=($?);
done
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container launch failed."; exit $success ; fi
sleep 10 # Making sure containers have time to launch
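# Each host now runs an idle container (`sleep infinity`) named cont_${SLURM_JOB_ID}; the benchmark
# is injected into it below via `docker exec`.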
# Run benchmarks
export NEXP
for nrun in `seq 1 $NEXP`; do
  (
    echo "Beginning trial $nrun of $NEXP"
    ## Clear RAM cache dentries and inodes
    echo "Clearing caches"
    # LOG_COMPLIANCE="'from mlperf_compliance import mlperf_log as log; log.${BENCHMARK_NAME}_print(key=log.RUN_CLEAR_CACHES)'"
    pids=(); rets=()
    for hostn in ${hosts[@]}; do
      if [[ $INSLURM -eq 1 ]]; then
        $(eval echo $SSH) bash -c 'sync && sudo /sbin/sysctl vm.drop_caches=3' &
        # $(eval echo $SSH) docker exec cont_${SLURM_JOB_ID} python -c $LOG_COMPLIANCE &
      else
        # LOG_COMPLIANCE is commented out above, so only the cache drop runs here
        docker run --rm --privileged --entrypoint bash $CONT -c "sync && echo 3 > /proc/sys/vm/drop_caches" &
      fi
      pids+=($!); rets+=($?);
    done
    wait "${pids[@]}"
    success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Cache clearing failed."; exit $success ; fi
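    ## The MULTI_NODE flags set below (--nnodes/--node_rank/--master_addr/--master_port) follow the
    ## torch.distributed launcher convention and are presumably consumed inside run_and_time.sh.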
    ## Launching benchmark
    pids=();
    export MULTI_NODE=''
    for h in `seq 0 $((SLURM_NNODES-1))`; do
      hostn="${hosts[$h]}"
      echo "Launching on node $hostn"
      if [[ $SLURM_NNODES -gt 1 ]]; then
        export MULTI_NODE=" --nnodes=$SLURM_NNODES --node_rank=$h --master_addr=$MASTER_IP --master_port=$PORT"
      else
        export MULTI_NODE=" --master_port=$PORT"
      fi
      export DOCKERENV=(
        "-e" "DGXSYSTEM=$DGXSYSTEM"
        "-e" "MULTI_NODE=$MULTI_NODE"
        "-e" "SLURM_JOB_ID=$SLURM_JOB_ID"
        "-e" "SLURM_NTASKS_PER_NODE=$SLURM_NTASKS_PER_NODE"
      )
      # Execute command
      set -x
      $(eval echo $SRUN) docker exec "${DOCKERENV[@]}" cont_${SLURM_JOB_ID} ./run_and_time.sh &
      pids+=($!);
      set +x
    done
    wait "${pids[@]}"
  ) |& tee ${LOGFILE_BASE}_$nrun.log
done
# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
if [[ $INSLURM -eq 0 ]]; then
  docker rm -f cont_${SLURM_JOB_ID}
fi