-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.sh
41 lines (30 loc) · 1.15 KB
/
test.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --partition=gpu_test
#SBATCH --time=8:00:00
### e.g. request 2 nodes with 1 gpu each, totally 2 gpus (WORLD_SIZE==2)
### Note: --gres=gpu:x should equal to ntasks-per-node
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
#SBATCH --mem=150G
#SBATCH --chdir=/n/home11/nswood/HPC_Parallel_Computing/
#SBATCH --output=slurm_monitoring/%x-%j.out
### change 5-digit MASTER_PORT as you wish, slurm will raise Error if duplicated with others
### change WORLD_SIZE as gpus/node * num_nodes
export MASTER_PORT=9304
export WORLD_SIZE=2
### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
echo "NODELIST="${SLURM_NODELIST}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
### init virtual environment if needed
### Must be updated with your environment
source ~/.bashrc
source /n/holystore01/LABS/iaifi_lab/Users/nswood/mambaforge/etc/profile.d/conda.sh
conda activate tune_env
# conda activate flat-samples
srun python train_ddp_test.py