-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearning_rate.py
249 lines (209 loc) · 9.73 KB
/
learning_rate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 8 15:57:47 2018
@author: ai
"""
# learning rate schduler
import numpy as np
from torch.optim import Optimizer
import torch
import logging
import math
import torch.optim.lr_scheduler
from torch.optim.lr_scheduler import _LRScheduler
logger = logging.getLogger(__name__)
class CyclicLR(object):
"""Sets the learning rate of each parameter group according to
cyclical learning rate policy (CLR). The policy cycles the learning
rate between two boundaries with a constant frequency, as detailed in
the paper `Cyclical Learning Rates for Training Neural Networks`_.
The distance between the two boundaries can be scaled on a per-iteration
or per-cycle basis.
Cyclical learning rate policy changes the learning rate after every batch.
`step` should be called after a batch has been used for training.
To resume training, save `last_batch_iteration` and use it to instantiate `CycleLR`.
This class has three built-in policies, as put forth in the paper:
"triangular":
A basic triangular cycle w/ no amplitude scaling.
"triangular2":
A basic triangular cycle that scales initial amplitude by half each cycle.
"exp_range":
A cycle that scales initial amplitude by gamma**(cycle iterations) at each
cycle iteration.
This implementation was adapted from the github repo: `bckenstler/CLR`_
Args:
optimizer (Optimizer): Wrapped optimizer.
base_lr (float or list): Initial learning rate which is the
lower boundary in the cycle for eachparam groups.
Default: 0.001
max_lr (float or list): Upper boundaries in the cycle for
each parameter group. Functionally,
it defines the cycle amplitude (max_lr - base_lr).
The lr at any cycle is the sum of base_lr
and some scaling of the amplitude; therefore
max_lr may not actually be reached depending on
scaling function. Default: 0.006
step_size (int): Number of training iterations per
half cycle. Authors suggest setting step_size
2-8 x training iterations in epoch. Default: 2000
mode (str): One of {triangular, triangular2, exp_range}.
Values correspond to policies detailed above.
If scale_fn is not None, this argument is ignored.
Default: 'triangular'
gamma (float): Constant in 'exp_range' scaling function:
gamma**(cycle iterations)
Default: 1.0
scale_fn (function): Custom scaling policy defined by a single
argument lambda function, where
0 <= scale_fn(x) <= 1 for all x >= 0.
mode paramater is ignored
Default: None
scale_mode (str): {'cycle', 'iterations'}.
Defines whether scale_fn is evaluated on
cycle number or cycle iterations (training
iterations since start of cycle).
Default: 'cycle'
last_batch_iteration (int): The index of the last batch. Default: -1
Example:
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> scheduler = torch.optim.CyclicLR(optimizer)
>>> data_loader = torch.utils.data.DataLoader(...)
>>> for epoch in range(10):
>>> for batch in data_loader:
>>> scheduler.step()
>>> train_batch(...)
.. _Cyclical Learning Rates for Training Neural Networks: https://arxiv.org/abs/1506.01186
.. _bckenstler/CLR: https://github.com/bckenstler/CLR
"""
def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
step_size=2000, mode='triangular', gamma=1.,
scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):
if not isinstance(optimizer, Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
self.optimizer = optimizer
if isinstance(base_lr, list) or isinstance(base_lr, tuple):
if len(base_lr) != len(optimizer.param_groups):
raise ValueError("expected {} base_lr, got {}".format(
len(optimizer.param_groups), len(base_lr)))
self.base_lrs = list(base_lr)
else:
self.base_lrs = [base_lr] * len(optimizer.param_groups)
if isinstance(max_lr, list) or isinstance(max_lr, tuple):
if len(max_lr) != len(optimizer.param_groups):
raise ValueError("expected {} max_lr, got {}".format(
len(optimizer.param_groups), len(max_lr)))
self.max_lrs = list(max_lr)
else:
self.max_lrs = [max_lr] * len(optimizer.param_groups)
self.step_size = step_size
if mode not in ['triangular', 'triangular2', 'exp_range'] \
and scale_fn is None:
raise ValueError('mode is invalid and scale_fn is None')
self.mode = mode
self.gamma = gamma
if scale_fn is None:
if self.mode == 'triangular':
self.scale_fn = self._triangular_scale_fn
self.scale_mode = 'cycle'
elif self.mode == 'triangular2':
self.scale_fn = self._triangular2_scale_fn
self.scale_mode = 'cycle'
elif self.mode == 'exp_range':
self.scale_fn = self._exp_range_scale_fn
self.scale_mode = 'iterations'
else:
self.scale_fn = scale_fn
self.scale_mode = scale_mode
self.step(last_batch_iteration + 1)
self.last_batch_iteration = last_batch_iteration
def step(self, batch_iteration=None):
if batch_iteration is None:
batch_iteration = self.last_batch_iteration + 1
self.last_batch_iteration = batch_iteration
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
@staticmethod
def _triangular_scale_fn(x):
return 1.
@staticmethod
def _triangular2_scale_fn(x):
return 1 / (2. ** (x - 1))
def _exp_range_scale_fn(self, x):
return self.gamma**(x)
def get_lr(self):
step_size = float(self.step_size)
cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)
lrs = []
param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
for param_group, base_lr, max_lr in param_lrs:
base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
if self.scale_mode == 'cycle':
lr = base_lr + base_height * self.scale_fn(cycle)
else:
lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
lrs.append(lr)
return lrs
class CosineAnnealingLR_with_Restart(_LRScheduler):
"""Set the learning rate of each parameter group using a cosine annealing
schedule, where :math:`\eta_{max}` is set to the initial lr and
:math:`T_{cur}` is the number of epochs since the last restart in SGDR:
.. math::
\eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
\cos(\frac{T_{cur}}{T_{max}}\pi))
When last_epoch=-1, sets initial lr as lr.
It has been proposed in
`SGDR: Stochastic Gradient Descent with Warm Restarts`_. The original pytorch
implementation only implements the cosine annealing part of SGDR,
I added my own implementation of the restarts part.
Args:
optimizer (Optimizer): Wrapped optimizer.
T_max (int): Maximum number of iterations.
T_mult (float): Increase T_max by a factor of T_mult
eta_min (float): Minimum learning rate. Default: 0.
last_epoch (int): The index of last epoch. Default: -1.
model (pytorch model): The model to save.
out_dir (str): Directory to save snapshots
take_snapshot (bool): Whether to save snapshots at every restart
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
https://arxiv.org/abs/1608.03983
"""
def __init__(self, optimizer, T_max, T_mult, model, out_dir, take_snapshot, eta_min=0, last_epoch=-1):
self.T_max = T_max
self.T_mult = T_mult
self.Te = self.T_max
self.eta_min = eta_min
self.current_epoch = last_epoch
self.model = model
self.out_dir = out_dir
self.take_snapshot = take_snapshot
self.lr_history = []
super(CosineAnnealingLR_with_Restart, self).__init__(optimizer, last_epoch)
def get_lr(self):
new_lrs = [self.eta_min + (base_lr - self.eta_min) *
(1 + math.cos(math.pi * self.current_epoch / self.Te)) / 2
for base_lr in self.base_lrs]
self.lr_history.append(new_lrs)
return new_lrs
def step(self, epoch=None):
if epoch is None:
epoch = self.last_epoch + 1
self.last_epoch = epoch
self.current_epoch += 1
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
## restart
if self.current_epoch == self.Te:
print("restart at epoch {:03d}".format(self.last_epoch + 1))
if self.take_snapshot:
torch.save({
'epoch': self.T_max,
'state_dict': self.model.state_dict()
}, self.out_dir + "/" + 'snapshot_e_{:03d}.pth.tar'.format(self.T_max))
## reset epochs since the last reset
self.current_epoch = 0
## reset the next goal
self.Te = int(self.Te * self.T_mult)
self.T_max = self.T_max + self.Te