forked from facebookresearch/ImageNet-Adversarial-Training
-
Notifications
You must be signed in to change notification settings - Fork 1
/
eval.sh
executable file
·34 lines (29 loc) · 1.23 KB
/
eval.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/bash -e
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#SBATCH --output=logs/job-%j.%N.out
#SBATCH --error=logs/job-%j.%N.err
#SBATCH --ntasks-per-node=8 # 8 tasks per node
#SBATCH --gres=gpu:8 # 8 GPUs per node
#SBATCH --cpus-per-task=10 # 80/8 cpus per task
#SBATCH --mem=200G # ask for 200G
# To run on 4 nodes x 8 GPUs: use "mkdir -p logs && sbatch --nodes=4 slurm.script"
echo "NNODES: $SLURM_NNODES"
echo "JOBID: $SLURM_JOB_ID"
env | grep PATH
export TENSORPACK_PROGRESS_REFRESH=20
export TENSORPACK_SERIALIZE=msgpack
DATA_PATH=~/data/imagenet
BATCH=32
CONFIG=$1
# launch eval
# https://www.open-mpi.org/faq/?category=openfabrics#ib-router has document on IB options
# the queue parameters sometimes can hang the communication (for some MPI versions and some operations)
mpirun -output-filename logs/eval-$SLURM_JOB_ID.log -tag-output \
-bind-to none -map-by slot \
-mca pml ob1 -mca btl_openib_receive_queues P,128,32:P,2048,32:P,12288,32:P,65536,32 \
-x NCCL_IB_CUDA_SUPPORT=1 -x NCCL_IB_DISABLE=0 -x NCCL_DEBUG=INFO \
python ./main.py --eval --data $DATA_PATH --batch $BATCH $CONFIG