run-singularity-dist.slurm.sh
#!/usr/bin/env bash
#SBATCH --job-name=corpuscleaner
#SBATCH --output=logs/corpuscleaner_%j.out
#SBATCH --error=logs/corpuscleaner_%j.err
#SBATCH --ntasks=3
#SBATCH --cpus-per-task=48
#SBATCH --time=2-00:00:00
#SBATCH --wait
#SBATCH --wait-all-nodes=1
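# Note: --wait keeps sbatch blocked until the job finishes, and --wait-all-nodes=1
# delays job start until every allocated node is ready.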
# Arguments passed to clean.py below: --parallel and --backend ray are required to
# execute in distributed mode.
PARAMETERS="example-output \
--input-path data/toy_wiki \
--input-format wikipedia \
--output-format fairseq-lm \
--lang-filter ca \
--parallel \
--backend ray"
module load singularity/3.6.4
# Expand the allocated node list into one hostname per line; the first node hosts the Ray head.
hostlist=$(scontrol show hostname "${SLURM_JOB_NODELIST}")
master=$(echo "${hostlist}" | head -n 1)
work_dir=$(pwd)
echo "${hostlist}"
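# Start a writable Singularity instance on this node (the head), bind-mounting the
# local data/ and output/ directories into the container, then launch the Ray head.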
singularity instance start --writable-tmpfs --bind $(realpath data):/cc/data --bind $(realpath output):/cc/output corpuscleaner-singularity.sif cc
singularity exec instance://cc bash -c "ray start --head --port=6379"
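# For every other allocated node, ssh in, start an identical Singularity instance and
# attach a Ray worker to the head node. Each ssh command runs in the background.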
i=1
while [ $i -lt $SLURM_JOB_NUM_NODES ]
do
    j=$(($i + 1))
    host=$(echo "${hostlist}" | sed "${j}q;d")   # pick the j-th line of the host list
    echo $master ${SLURM_JOB_NUM_NODES} ${i}     # debug: head node, total nodes, worker index
    ssh -n "$host" "module load singularity/3.6.4; cd ${work_dir}; singularity instance start --writable-tmpfs --bind $(realpath data):/cc/data --bind $(realpath output):/cc/output corpuscleaner-singularity.sif cc; singularity exec instance://cc bash -c \"ray start --address=${master}:6379\"" &
    ((i++))
done
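# Give the Ray workers time to register with the head before submitting work.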
sleep 30
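# Run the cleaning job from inside the head-node instance; RAY_ADDRESS=auto connects
# to the local Ray cluster started above.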
singularity exec instance://cc bash -c "cd /cc/corpus-cleaner && RAY_ADDRESS=auto python3 clean.py ${PARAMETERS}"
wait   # wait for any backgrounded ssh commands that have not yet finished
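# Example submission (a sketch; assumes the corpuscleaner-singularity.sif image and
# the data/ directory are already present in the submission directory):
#   mkdir -p logs output
#   sbatch run-singularity-dist.slurm.sh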