-
Notifications
You must be signed in to change notification settings - Fork 2
/
train_net.py
255 lines (217 loc) · 7.29 KB
/
train_net.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
r"""
Basic training script for PyTorch
"""
import argparse
import os
import torch
from torch.utils.collect_env import get_pretty_env_info
from hit.config import cfg
from hit.dataset import make_data_loader
from hit.solver import make_lr_scheduler, make_optimizer
from hit.engine.inference import inference
from hit.engine.trainer import do_train
from hit.modeling.detector import build_detection_model
from hit.utils.checkpoint import ActionCheckpointer
from hit.utils.comm import synchronize, get_rank
from hit.utils.logger import setup_logger, setup_tblogger
from hit.utils.random_seed import set_seed
from hit.utils.IA_helper import has_memory
from hit.structures.memory_pool import MemoryPool
# Workaround for PyTorch issue #973: raise the soft limit on open file
# descriptors to the hard limit before any data loading starts.
import resource

# Single-process runs default to the first GPU, but a CUDA_VISIBLE_DEVICES value
# chosen by the caller is respected. Distributed launches (WORLD_SIZE present in
# the environment) are left alone: each worker picks its GPU with
# torch.cuda.set_device(local_rank), which needs every device to stay visible.
if "WORLD_SIZE" not in os.environ:
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
# Debugging aid: launch with CUDA_LAUNCH_BLOCKING=1 set in the environment.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1]))
def train(cfg, local_rank, distributed, tblogger=None, transfer_weight=False, adjust_lr=False, skip_val=False,
          no_head=False):
    """Build the model and solver, restore weights, and run the training loop.

    Args:
        cfg: Project configuration. Read here for ``OUTPUT_DIR``,
            ``MODEL.WEIGHT``, ``MODEL.HIT_STRUCTURE``,
            ``SOLVER.CHECKPOINT_PERIOD``, ``SOLVER.EVAL_PERIOD`` and
            ``DATASETS.TEST``, and handed to the model/solver/loader builders.
        local_rank: Device index used for ``DistributedDataParallel`` when
            ``distributed`` is true.
        distributed: Whether to wrap the model in ``DistributedDataParallel``
            and request distributed data loaders.
        tblogger: Logger object forwarded unchanged to ``do_train``.
        transfer_weight: Forwarded to the checkpointer as ``model_weight_only``.
        adjust_lr: Forwarded to the checkpointer as ``adjust_scheduler``.
        skip_val: When true, no validation loaders are built and empty lists
            are passed to ``do_train`` instead.
        no_head: Forwarded to the checkpointer as ``no_head``.

    Returns:
        The trained model (still wrapped in ``DistributedDataParallel`` when
        ``distributed`` is true).
    """
    # Model construction and placement on the CUDA device.
    net = build_detection_model(cfg)
    device = torch.device("cuda")
    net.to(device)

    # Solver: optimizer first, then the scheduler that drives it.
    optimizer = make_optimizer(cfg, net)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        ddp_kwargs = dict(
            device_ids=[local_rank],
            output_device=local_rank,
            # broadcast_buffers=False should be dropped if BatchNorm stats are updated.
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
        net = torch.nn.parallel.DistributedDataParallel(net, **ddp_kwargs)

    # Mutable training state; entries may be overridden by the loaded checkpoint.
    arguments = {"iteration": 0, "person_pool": MemoryPool()}

    # Only rank 0 is allowed to write checkpoints to disk.
    checkpointer = ActionCheckpointer(cfg, net, optimizer, scheduler, cfg.OUTPUT_DIR, get_rank() == 0)
    restored_state = checkpointer.load(
        cfg.MODEL.WEIGHT,
        model_weight_only=transfer_weight,
        adjust_scheduler=adjust_lr,
        no_head=no_head,
    )
    arguments.update(restored_state)

    # The training loader starts from the (possibly restored) iteration.
    train_loader = make_data_loader(
        cfg, is_train=True, is_distributed=distributed, start_iter=arguments["iteration"]
    )

    ckpt_every = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_every = cfg.SOLVER.EVAL_PERIOD
    uses_memory = has_memory(cfg.MODEL.HIT_STRUCTURE)

    # Validation loaders are only created when validation is requested.
    if skip_val:
        val_names, val_loaders = [], []
    else:
        val_names = cfg.DATASETS.TEST
        val_loaders = make_data_loader(cfg, is_train=False, is_distributed=distributed)

    do_train(
        net, train_loader, optimizer, scheduler, checkpointer, device, ckpt_every,
        arguments, tblogger, eval_every, val_names, val_loaders, distributed, uses_memory,
    )
    return net
def run_test(cfg, model, distributed):
    """Run inference with ``model`` on every dataset listed in ``cfg.DATASETS.TEST``.

    Args:
        cfg: Project configuration; ``OUTPUT_DIR``, ``DATASETS.TEST`` and
            ``MODEL.HIT_STRUCTURE`` are read here.
        model: The model to evaluate. When ``distributed`` is true its
            ``.module`` attribute is used instead of the wrapper itself.
        distributed: Whether the model is wrapped for distributed training and
            whether distributed test loaders should be built.
    """
    # Evaluate the underlying module rather than the distributed wrapper.
    net = model.module if distributed else model
    torch.cuda.empty_cache()

    dataset_names = cfg.DATASETS.TEST

    # One output folder per dataset, or None for each when no OUTPUT_DIR is set.
    output_folders = []
    for name in dataset_names:
        folder = None
        if cfg.OUTPUT_DIR:
            folder = os.path.join(cfg.OUTPUT_DIR, "inference", name)
            os.makedirs(folder, exist_ok=True)
        output_folders.append(folder)

    test_loaders = make_data_loader(cfg, is_train=False, is_distributed=distributed)

    # Evaluate dataset by dataset, synchronizing the processes after each one.
    for folder, name, loader in zip(output_folders, dataset_names, test_loaders):
        inference(
            net,
            loader,
            name,
            mem_active=has_memory(cfg.MODEL.HIT_STRUCTURE),
            output_folder=folder,
        )
        synchronize()
def _build_arg_parser():
    """Create the command-line parser for the training script.

    Four switches (``skip_val``, ``transfer_weight``, ``no_head``, ``tfboard``)
    default to True. Each has a complementary ``store_false`` option so the
    default can actually be turned off from the command line.
    """
    parser = argparse.ArgumentParser(description="PyTorch Action Detection Training")
    parser.add_argument(
        "--config-file",
        default="config_files/hitnet.yaml",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    # torch.distributed.launch passes --local_rank explicitly; torchrun exports
    # LOCAL_RANK instead, so fall back to that before defaulting to 0.
    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
    parser.add_argument(
        "--skip-final-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "--skip-val-in-train",
        dest="skip_val",
        help="Do not validate during training",
        action="store_true",
        default=True,
    )
    parser.add_argument(
        "--val-in-train",
        dest="skip_val",
        help="Validate during training (validation is skipped by default)",
        action="store_false",
        default=True,
    )
    parser.add_argument(
        "--transfer",
        dest="transfer_weight",
        help="Transfer weight from a pretrained model",
        action="store_true",
        default=True,
    )
    parser.add_argument(
        "--disable-transfer",
        dest="transfer_weight",
        help="Do not treat the weight file as a pretrained model to transfer from",
        action="store_false",
        default=True,
    )
    parser.add_argument(
        "--adjust-lr",
        dest="adjust_lr",
        help="Adjust learning rate scheduler from old checkpoint",
        action="store_true",
    )
    parser.add_argument(
        "--no-head",
        dest="no_head",
        help="Not load the head layer parameters from weight file",
        action="store_true",
        default=True,
    )
    parser.add_argument(
        "--load-head",
        dest="no_head",
        help="Load the head layer parameters from weight file",
        action="store_false",
        default=True,
    )
    parser.add_argument(
        "--use-tfboard",
        action="store_true",
        dest="tfboard",
        help="Use tensorboard to log stats",
        default=True,
    )
    parser.add_argument(
        "--disable-tfboard",
        action="store_false",
        dest="tfboard",
        help="Do not use tensorboard to log stats",
        default=True,
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=2,
        help="Manual seed at the begining.",
    )
    # Everything after the recognised options is treated as config overrides.
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    return parser


def main():
    """Parse the command line, configure the run, train, and optionally test.

    Steps, in order: parse arguments; initialise the NCCL process group when
    WORLD_SIZE indicates more than one process; merge and freeze the config;
    log the environment and configuration; seed the run; train; close the
    tensorboard logger; and run the final test unless it was skipped.
    """
    args = _build_arg_parser().parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    global_rank = get_rank()

    # Merge config: file first, then command-line overrides, then lock it.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Print experimental infos.
    logger = setup_logger("hit", output_dir, global_rank)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    tblogger = None
    if args.tfboard:
        tblogger = setup_tblogger(output_dir, global_rank)

    set_seed(args.seed, global_rank, num_gpus)

    # do training; the tensorboard logger is closed even when training raises.
    try:
        model = train(cfg, args.local_rank, args.distributed, tblogger, args.transfer_weight, args.adjust_lr,
                      args.skip_val, args.no_head)
    finally:
        if tblogger is not None:
            tblogger.close()

    # do final testing.
    if not args.skip_test:
        run_test(cfg, model, args.distributed)
# Start training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()