# Forked from wzmsltw/BSN-boundary-sensitive-network.pytorch
# File: run_on_cluster.py
"""Run train on the cluster."""
import os
from copy import deepcopy
# NOTE:
# email should be your email address. name should be identifying, like `cinjon`.
# code_directory is the directory containing main.py (the script each job
# runs), relative to ${HOME}.
# Examples:
# email = 'you@example.com'
# name = 'cinjon'
# code_directory = 'Code/cycle-consistent-supervision'
# from local_config import email, name, code_directory
def _ensure_dirs(slurm_logs, slurm_scripts):
    """Create the Slurm log and script directories if they do not exist.

    Args:
        slurm_logs: Path of the directory used for Slurm log files.
        slurm_scripts: Path of the directory used for generated Slurm scripts.
    """
    for d in (slurm_logs, slurm_scripts):
        # exist_ok avoids the check-then-create race: two submissions running
        # at the same time could both see the directory as missing and the
        # slower one would then fail inside makedirs.
        os.makedirs(d, exist_ok=True)
def _is_float(v):
    """Return True if ``float(v)`` succeeds, False otherwise.

    Args:
        v: Any value (number, string, None, ...).

    Returns:
        bool: Whether ``v`` is convertible with ``float()``.
    """
    try:
        float(v)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (e.g. KeyboardInterrupt) should propagate.
        return False
    return True
def _is_int(v):
    """Return True if ``v`` converts to an int and a float of equal value.

    A string such as ``"7.0"`` yields False because ``int("7.0")`` raises.

    Args:
        v: Any value (number, string, None, ...).

    Returns:
        bool: Whether ``int(v) == float(v)`` holds without a conversion error.
    """
    try:
        return int(v) == float(v)
    except (TypeError, ValueError, OverflowError):
        # Conversion failed (non-numeric, None, inf/nan, too large for float).
        return False
def _run_batch(job,
               counter,
               slurm_logs,
               slurm_scripts,
               module_load,
               directory,
               email,
               code_directory,
               local_comet_dir=None):
    """Write a Slurm script for ``job`` and submit it with ``sbatch``.

    Args:
        job: Dict of job settings. It is deep-copied, so the caller's dict is
            not modified. Must contain 'module', 'name', 'num_gpus',
            'num_cpus' and 'gb'; 'mode' is required when 'module' is 'TEM' or
            'PEM'; 'time' (hours, may be fractional) defaults to 16. Every
            entry left after 'num_cpus' and 'gb' are removed is passed to
            main.py as a command-line flag.
        counter: Integer passed to main.py as ``--counter``.
        slurm_logs: Directory for the job's .out / .err files.
        slurm_scripts: Directory the generated .slurm file is written to.
        module_load: Callable ``module_load(f, num_gpus)`` invoked with the
            open script file right after the ``module purge`` line.
        directory: Unused in this function; kept so existing callers work.
        email: Address written to ``--mail-user``.
        code_directory: Directory the job ``cd``s into before running main.py.
        local_comet_dir: Optional base directory. When truthy, the job gets a
            ``local_comet_dir`` entry of ``<local_comet_dir>/<module lowercased>``.

    Raises:
        KeyError: If a required key is missing from ``job``.
        RuntimeError: If the 'module' / 'mode' combination is not recognised.
    """
    _ensure_dirs(slurm_logs, slurm_scripts)

    job = deepcopy(job)

    # Walltime: whole hours plus the fractional part converted to minutes.
    time = job.get('time', 16)
    hours = int(time)
    minutes = int((time - hours) * 60)

    if local_comet_dir:
        job['local_comet_dir'] = os.path.join(local_comet_dir,
                                              job['module'].lower())

    # num_gpus stays in the job dict (and so becomes a flag); num_cpus and gb
    # are only used for the SBATCH header and are removed from the flags.
    num_gpus = job['num_gpus']
    num_cpus = job.pop('num_cpus')
    job['data_workers'] = min(int(2.5 * num_gpus), num_cpus - num_gpus)
    # NOTE(review): max() makes 12 a floor, so the min() above only matters
    # when it exceeds 12 -- confirm this is intended.
    job['data_workers'] = max(job['data_workers'], 12)

    gb = job.pop('gb')
    memory_per_node = min(gb, 500)

    # Build the main.py command line from the remaining job entries.
    flags = [" --counter %d" % counter]
    for key, value in sorted(job.items()):
        if isinstance(value, bool):
            # Booleans are store-true style flags: present only when True.
            if value:
                flags.append(" --%s" % key)
        elif isinstance(value, str):
            # Strings are passed through verbatim. Numeric-looking strings
            # (e.g. a name of '001') used to reach the %d / %.6f branches
            # below and crash with TypeError.
            flags.append(' --%s %s' % (key, value))
        elif _is_int(value):
            flags.append(' --%s %d' % (key, value))
        elif _is_float(value):
            flags.append(' --%s %.6f' % (key, value))
        else:
            flags.append(' --%s %s' % (key, value))
    flagstring = "".join(flags)

    # Job-name prefix per module (and per mode for TEM / PEM).
    moded_prefixes = {
        'TEM': {'train': 'temtr', 'inference': 'teminf'},
        'PEM': {'train': 'pemtr', 'inference': 'peminf'},
    }
    plain_prefixes = {
        'PGM': 'pgm',
        'Post_processing': 'postproc',
        'Evaluation': 'eval',
    }
    module = job['module']
    if module in moded_prefixes:
        mode = job['mode']
        if mode not in moded_prefixes[module]:
            # The former bare `raise` (no active exception) surfaced as a
            # message-less RuntimeError; keep the type, add the reason.
            raise RuntimeError("Unknown mode %r for module %r" %
                               (mode, module))
        prefix = moded_prefixes[module][mode]
    elif module in plain_prefixes:
        prefix = plain_prefixes[module]
    else:
        raise RuntimeError("Unknown module %r" % (module,))
    jobname = "%s.%s" % (prefix, job['name'])

    jobcommand = "python main.py %s" % flagstring
    print(jobcommand)

    slurmfile = os.path.join(slurm_scripts, jobname + '.slurm')
    with open(slurmfile, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("#SBATCH --job-name=%s\n" % jobname)
        f.write("#SBATCH --mail-type=END,FAIL\n")
        f.write("#SBATCH --mail-user=%s\n" % email)
        f.write("#SBATCH --cpus-per-task=%d\n" % num_cpus)
        f.write("#SBATCH --time=%d:%d:00\n" % (hours, minutes))
        if num_gpus > 0:
            # ntasks-per-node is its own sbatch option, not a generic
            # resource; it was previously written as --gres=ntasks-per-node=1.
            f.write("#SBATCH --ntasks-per-node=1\n")
            f.write("#SBATCH --gres=gpu:%d\n" % num_gpus)
        f.write("#SBATCH --mem=%dG\n" % memory_per_node)
        f.write("#SBATCH --nodes=1\n")
        f.write("#SBATCH --output=%s\n" %
                os.path.join(slurm_logs, jobname + ".out"))
        f.write("#SBATCH --error=%s\n" %
                os.path.join(slurm_logs, jobname + ".err"))
        f.write("module purge" + "\n")
        module_load(f, num_gpus)
        f.write("source activate onoff\n")
        f.write("SRCDIR=%s\n" % code_directory)
        f.write("cd ${SRCDIR}\n")
        f.write(jobcommand + "\n")

    # Submit only after the `with` block has closed (and flushed) the script.
    s = "sbatch %s" % os.path.join(slurm_scripts, jobname + ".slurm")
    os.system(s)
def fb_run_batch(job, counter, email, code_directory,
                 directory='/checkpoint/cinjon/spaceofmotion'):
    """Submit ``job`` through ``_run_batch`` using paths under ``directory``.

    Slurm logs, Slurm scripts and the comet directory are placed in
    ``<directory>/bsn/slurm_logs``, ``<directory>/bsn/slurm_scripts`` and
    ``<directory>/bsn/comet`` respectively.

    Args:
        job: Dict of job settings, forwarded to ``_run_batch``.
        counter: Integer forwarded to ``_run_batch``.
        email: Address forwarded to ``_run_batch``.
        code_directory: Code directory forwarded to ``_run_batch``.
        directory: Base directory for the generated files. Defaults to the
            previously hard-coded path, so existing callers are unaffected.
    """

    def module_load(f, num_gpus):
        # Only GPU jobs load the CUDA module.
        if num_gpus > 0:
            f.write("module load cuda/10.0\n")

    bsn_root = os.path.join(directory, 'bsn')
    slurm_logs = os.path.join(bsn_root, 'slurm_logs')
    slurm_scripts = os.path.join(bsn_root, 'slurm_scripts')
    comet_dir = os.path.join(bsn_root, 'comet')
    _run_batch(job,
               counter,
               slurm_logs,
               slurm_scripts,
               module_load,
               directory,
               email,
               code_directory,
               local_comet_dir=comet_dir)