train_parallel.py
import os
import argparse
import time
import tqdm
import cv2
import matplotlib.pylab as plt
import torch.cuda
import numpy as np
import torch.nn as nn
import torch.optim as optim
from config.config import GetConfig, COCOSourceConfig, TrainingOpt
from data.mydataset import MyDataset
from torch.utils.data import DataLoader
from models.posenet import Network
import warnings
os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1, 2, 3" # choose the available GPUs
warnings.filterwarnings("ignore")
torch.cuda.empty_cache()
parser = argparse.ArgumentParser(description='PoseNet Training')
parser.add_argument('--resume', '-r', action='store_true', default=False, help='resume from checkpoint')
parser.add_argument('--checkpoint_path', '-p', default='checkpoints_parallel', help='save path')
parser.add_argument('--max_grad_norm', default=5, type=float,
                    help="If the norm of the gradient vector exceeds this, re-normalize it to have the norm equal to max_grad_norm")
args = parser.parse_args()
torch.backends.cudnn.benchmark = True  # if the input size stays the same every iteration, enabling this speeds up training
# torch.backends.cudnn.deterministic = True
checkpoint_path = args.checkpoint_path
opt = TrainingOpt()
config = GetConfig(opt.config_name)
soureconfig = COCOSourceConfig(opt.hdf5_train_data)
train_data = MyDataset(config, soureconfig, shuffle=False, augment=True) # shuffle in data loader
train_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, drop_last=True, num_workers=16,
                          pin_memory=True)  # num_workers should be tuned per machine; too large or too small both hurt throughput
soureconfig_val = COCOSourceConfig(opt.hdf5_val_data)
val_data = MyDataset(config, soureconfig_val, shuffle=False, augment=True) # shuffle in data loader
val_loader = DataLoader(val_data, batch_size=opt.batch_size, shuffle=True, drop_last=True, num_workers=16,
                        pin_memory=True)  # num_workers should be tuned per machine; too large or too small both hurt throughput
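# With torch.nn.DataParallel (wrapped further below), each DataLoader batch of size opt.batch_size is
# scattered across the visible GPUs along dim 0, so the per-GPU batch is roughly opt.batch_size / num_gpus.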
# # ############# for debug ###################
# if __name__ == '__main__':
#     t0 = time.time()
#     count = 0
#     print(torch.cuda.get_device_name(0))
#     torch.backends.cudnn.benchmark = True
#     for epoch in range(20):
#         for batch_id, data_dict in enumerate(train_loader):
#
#             t = data_dict[0].cuda(non_blocking=True)  # , non_blocking=True
#             count += opt.batch_size
#             print(batch_id, ' of ', epoch)
#             if count > 50:
#                 break
#     print('**************** ', count / (time.time() - t0))
use_cuda = torch.cuda.is_available()  # check whether a CUDA GPU is available
best_loss = float('inf')
start_epoch = 0  # start from 0, or resume from the last saved epoch
posenet = Network(opt, config, dist=False, bn=True)
posenet.cuda()
optimizer = optim.SGD(posenet.parameters(), lr=opt.learning_rate, momentum=0.9, weight_decay=1e-4)
if args.resume:
    print('Resuming from checkpoint ...... ')
    checkpoint = torch.load(opt.ckpt_path, map_location=torch.device('cpu'))  # map to cpu to save the gpu memory
    # # #################################################
    # from collections import OrderedDict
    # new_state_dict = OrderedDict()
    # for k, v in checkpoint['weights'].items():
    #     if 'out' in k or 'merge' in k:
    #         continue
    #     name = k  # add prefix `posenet.`
    #     new_state_dict[name] = v
    # posenet.load_state_dict(new_state_dict, strict=False)
    # # #################################################
    posenet.load_state_dict(checkpoint['weights'])  # when loading weights trained by someone else, some layers may need to be skipped, so use strict=False
    print('Network weights have been resumed from checkpoint...')

    optimizer.load_state_dict(checkpoint['optimizer_weight'])
    # We must convert the resumed optimizer state tensors back to GPU.
    """The previous training was done on GPU, so the tensors stored in optimizer.state_dict() are CUDA tensors.
    Because the checkpoint is loaded with map_location='cpu' above, load_state_dict() restores CPU versions of
    those state tensors. Training needs them on the GPU again, so we move them back to CUDA below."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    print('Optimizer has been resumed from checkpoint...')

    best_loss = checkpoint['train_loss']
    print('******************** Best loss resumed is :', best_loss, ' ************************')
    start_epoch = checkpoint['epoch'] + 1
    del checkpoint
    torch.cuda.empty_cache()
if use_cuda:
    posenet = torch.nn.parallel.DataParallel(posenet)  # , device_ids=[0, 1, 2, 3]
    # module.cuda() only moves the registered parameters to GPU.
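# torch.nn.DataParallel replicates the wrapped module on every visible GPU, scatters each input batch along
# dim 0, and gathers the replicas' outputs back onto cuda:0. The Network module here returns its loss, so
# train()/test() below receive one loss value per GPU (loss_ngpu) and sum them before dividing by the batch size.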
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1, last_epoch=-1)  # set the learning-rate decay schedule
for i in range(start_epoch):
    # update the learning rate for start_epoch times
    scheduler.step()

for param in posenet.parameters():
    if param.requires_grad:
        print('Parameters of network: Autograd')
        break
def train(epoch):
    print('\n ############################# Train phase, Epoch: {} #############################'.format(epoch))
    posenet.train()
    train_loss = 0
    scheduler.step()
    print('\nLearning rate at this epoch is: %0.9f\n' % optimizer.param_groups[0]['lr'])  # scheduler.get_lr()[0]

    for batch_idx, target_tuple in enumerate(train_loader):
        # images.requires_grad_()
        # loc_targets.requires_grad_()
        # conf_targets.requires_grad_()
        if use_cuda:
            target_tuple = [target_tensor.cuda(non_blocking=True) for target_tensor in target_tuple]

        # target tensor shapes: [8,512,512,3], [8,1,128,128], [8,43,128,128], [8,36,128,128], [8,36,128,128]
        images, mask_misses, heatmaps = target_tuple  # , offsets, mask_offsets
        # images = Variable(images)
        # loc_targets = Variable(loc_targets)
        # conf_targets = Variable(conf_targets)

        optimizer.zero_grad()  # zero the gradient buffers
        loss_ngpu = posenet(target_tuple)  # reduce losses of all GPUs on cuda 0
        loss = torch.sum(loss_ngpu) / opt.batch_size
        # print(loc_preds.requires_grad)
        # print(conf_preds.requires_grad)
        if loss.item() > 1e6:
            print("\nLoss is abnormal, drop this batch !")
            loss.zero_()
            continue
        # print(loss.requires_grad)
        loss.backward()  # retain_graph=True
        # torch.nn.utils.clip_grad_norm(posenet.parameters(), args.max_grad_norm)
        optimizer.step()  # TODO: accumulating the loss over several batches can effectively enlarge the batch size,
        # but the default momentum of the BN layers should then be reduced (a hypothetical sketch follows this function).

        train_loss += loss.item()  # accumulate the loss
        # Use loss += loss.detach() to accumulate without keeping the graph, or loss.item() to get the
        # corresponding Python number (only one-element tensors can be converted to Python scalars).
        print('########################### Epoch:', epoch, ', -- batch:', batch_idx, '/', len(train_loader), ', ',
              'Train loss: %.3f, accumulated average loss: %.3f ##############################' % (loss.item(), train_loss / (batch_idx + 1)))

    global best_loss
    train_loss /= len(train_loader)
    os.makedirs(checkpoint_path, exist_ok=True)
    logger = open(os.path.join('./' + checkpoint_path, 'log'), 'a+')
    logger.write('\nEpoch {}\ttrain_loss: {}'.format(epoch, train_loss))  # no trailing newline, so the validation loss can be appended on the same line
    logger.flush()
    logger.close()

    if train_loss < best_loss:
        best_loss = train_loss
        print('=====> Saving checkpoint...')
        state = {
            # use posenet.module.state_dict(), not posenet.state_dict(), so the saved keys do not start with the "module." prefix
            'weights': posenet.module.state_dict(),
            'optimizer_weight': optimizer.state_dict(),
            'train_loss': train_loss,
            'epoch': epoch,
        }
        torch.save(state, './' + checkpoint_path + '/PoseNet_' + str(epoch) + '_epoch.pth')
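# A minimal, hypothetical sketch of the gradient-accumulation idea mentioned in the TODO above; it is not
# part of this script and omits the CUDA transfer and logging. `accum_steps` is an assumed parameter, and,
# as the TODO notes, the default BN momentum should be reduced because each optimizer step then spans
# several forward passes:
#
#     accum_steps = 4
#     optimizer.zero_grad()
#     for batch_idx, target_tuple in enumerate(train_loader):
#         loss = torch.sum(posenet(target_tuple)) / (opt.batch_size * accum_steps)
#         loss.backward()                          # gradients accumulate in the .grad buffers
#         if (batch_idx + 1) % accum_steps == 0:
#             optimizer.step()                     # update once every accum_steps mini-batches
#             optimizer.zero_grad()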
def test(epoch, show_image=False):
    print('\nTest phase, Epoch: {}'.format(epoch))
    posenet.eval()

    with torch.no_grad():  # saves GPU memory and speeds up inference
        test_loss = 0
        for batch_idx, target_tuple in enumerate(val_loader):
            # images.requires_grad_()
            # loc_targets.requires_grad_()
            # conf_targets.requires_grad_()
            if use_cuda:
                target_tuple = [target_tensor.cuda(non_blocking=True) for target_tensor in target_tuple]

            # target tensor shapes: [8,512,512,3], [8,1,128,128], [8,43,128,128], [8,36,128,128], [8,36,128,128]
            images, mask_misses, heatmaps = target_tuple  # , offsets, mask_offsets
            # images = Variable(images)
            # loc_targets = Variable(loc_targets)
            # conf_targets = Variable(conf_targets)

            output_tuple, loss_ngpu = posenet(target_tuple)
            loss = torch.sum(loss_ngpu) / opt.batch_size
            test_loss += loss.item()  # accumulate the loss
            print(' Test loss : %.3f, accumulated average loss: %.3f' % (loss.item(), test_loss / (batch_idx + 1)))

            if show_image:
                image, mask_miss, labels = [v.cpu().numpy() for v in target_tuple]  # , offsets, mask_offsets
                output = output_tuple[-1][0].cpu().numpy()  # different scales can be shown
                # show the generated ground truth
                img = image[0]
                output = output[0].transpose((1, 2, 0))
                labels = labels[0].transpose((1, 2, 0))
                img = cv2.resize(img, output.shape[:2], interpolation=cv2.INTER_CUBIC)
                plt.imshow(img[:, :, [2, 1, 0]])  # OpenCV image format is BGR
                plt.imshow(output[:, :, 32], alpha=0.5)  # mask_all
                # plt.imshow(mask_offset[:, :, 2], alpha=0.5)  # mask_all
                plt.show()

    os.makedirs(checkpoint_path, exist_ok=True)
    logger = open(os.path.join('./' + checkpoint_path, 'log'), 'a+')
    logger.write('\tval_loss: {}'.format(test_loss / len(val_loader)))  # appended to the train_loss line written in train(), hence no leading newline
    logger.flush()
    logger.close()
if __name__ == '__main__':
    for epoch in range(start_epoch, start_epoch + 200):
        train(epoch)
        # test(epoch, show_image=True)