This repository has been archived by the owner on Sep 2, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
training.py
112 lines (93 loc) · 3.6 KB
/
training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import torch
from progress.bar import Bar
from models.data_parallel import DataParallel
from utils.utils import AverageMeter
class ModelWithLoss(torch.nn.Module):
def __init__(self, model, loss):
super(ModelWithLoss, self).__init__()
self.model = model
self.loss = loss
def forward(self, batch):
outputs = self.model(batch['input'])
loss, loss_stats = self.loss(outputs, batch)
return outputs[-1], loss, loss_stats
class Trainer(object):
def __init__(self, opt, model, optimizer=None):
self.opt = opt
self.optimizer = optimizer
self.loss_stats, self.loss = self._get_losses(opt)
self.model_with_loss = ModelWithLoss(model, self.loss)
self.model_with_loss = self.model_with_loss.to(device)
for state in self.optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(device=device, non_blocking=True)
def run_epoch(self, phase, epoch, data_loader):
model_with_loss = self.model_with_loss
if phase == 'train':
model_with_loss.train()
else:
if len(self.opt.gpus) > 1:
model_with_loss = self.model_with_loss.module
model_with_loss.eval()
torch.cuda.empty_cache()
opt = self.opt
results = {}
data_time, batch_time = AverageMeter(), AverageMeter()
avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
end = time.time()
for iter_id, batch in enumerate(data_loader):
if iter_id >= num_iters:
break
data_time.update(time.time() - end)
for k in batch:
if k != 'meta':
batch[k] = batch[k].to(device=opt.device, non_blocking=True)
output, loss, loss_stats = model_with_loss(batch)
loss = loss.mean()
if phase == 'train':
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
batch_time.update(time.time() - end)
end = time.time()
Bar.suffix = '{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format(
epoch, iter_id, num_iters, phase=phase,
total=bar.elapsed_td, eta=bar.eta_td)
for l in avg_loss_stats:
avg_loss_stats[l].update(
loss_stats[l].mean().item(), batch['input'].size(0))
Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(l, avg_loss_stats[l].avg)
if not opt.hide_data_time:
Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \
'|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
if opt.print_iter > 0:
if iter_id % opt.print_iter == 0:
print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix))
else:
bar.next()
if opt.debug > 0:
self.debug(batch, output, iter_id)
if opt.test:
self.save_result(output, batch, results)
del output, loss, loss_stats
bar.finish()
ret = {k: v.avg for k, v in avg_loss_stats.items()}
ret['time'] = bar.elapsed_td.total_seconds() / 60.
return ret, results
def debug(self, batch, output, iter_id):
raise NotImplementedError
def save_result(self, output, batch, results):
raise NotImplementedError
def _get_losses(self, opt):
raise NotImplementedError
def val(self, epoch, data_loader):
return self.run_epoch('val', epoch, data_loader)
def train(self, epoch, data_loader):
return self.run_epoch('train', epoch, data_loader)