# -*- coding: utf-8 -*-
import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F
from Generator import ExponGenerator
import torch.optim as optim
from Plotter import Plotter
"""
in this test run, all weights have the same learning rate for every iteration. the learning rates are taken from an
exponential distribution. once this test is done we can continue to test per-weight learning rates
"""
# testing a manual gradient-descent step (gradients come from autograd; the parameter update is applied by hand)
# hyperparameters
# the default step size alpha_0 is kept small, while the sampled (variate) learning rate is typically much larger
alpha_0 = 1e-2
expon_scale = 0.1
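# Since an exponential variate has mean equal to its scale, the effective learning rate
# alpha = alpha_0 + Exp(expon_scale) has mean 0.01 + 0.1 = 0.11, so the random component
# dominates the default step size by roughly a factor of ten.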
train = datasets.MNIST('', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor()
                       ]))
test = datasets.MNIST('', train=False, download=True,
                      transform=transforms.Compose([
                          transforms.ToTensor()
                      ]))
trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=False)
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # define the four fully connected layers
        self.fc1 = nn.Linear(28*28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        # per-layer actions: ReLU on the hidden layers, log-softmax on the output
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)
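
# optional sanity check (illustrative only): a flattened 28x28 input maps to 10 log-probabilities
_probe = Net()(torch.zeros(1, 28 * 28))
assert _probe.shape == (1, 10)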
# first, test with a universal (shared) learning rate
net = Net()
# print(net)
print("training")
# create a new exponential distribution to sample from
learning_rates = ExponGenerator(scale=expon_scale)
# Adam yields better results, but we keep the manual update so the impact of the sampled learning rates stays visible
# optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_plot = Plotter(interval=100)
EPOCHS = 3
for i in range(EPOCHS):
    # the DataLoader delivers the data in mini-batches of 10
    for data in trainset:
        # unpack a batch of features and labels
        X, y = data
        # reset the gradients before each backward pass
        net.zero_grad()
        output = net(X.view(-1, 28*28))
        # the loss is a scalar; we use negative log-likelihood since the labels are class indices
        # (for one-hot targets, mean squared error would be an alternative)
        loss = F.nll_loss(output, y)
        loss_plot.update(loss)
        # backpropagate the loss to compute the gradients
        loss.backward()
        # for now we apply the update manually instead of using a fancy optimizer such as Adam
        alpha = learning_rates.generate_variate() + alpha_0
        for f in net.parameters():
            # alternative samplers (unused here):
            # rand_tensor = learning_rates.generate_levy_tensor(f.shape)  # Levy distribution
            # rand_tensor = torch.rand(f.shape) * alpha_0  # uniform, scaled by the default LR
            f.data.sub_(alpha * f.grad.data)
    # for f in net.parameters():
    #     print(f.grad.data.shape)
    # print(f"loss: {loss} in epoch: {i}")
print("testing")
correct, total = 0, 0
with torch.no_grad():
for data in testset:
# split into label and sample and run them through the network
X, y = data
output = net(X.view(-1, 28 * 28))
# print(output)
# iterate over all the outputs and compare the network results to the data results
for idx, ans in enumerate(output):
# ans is the vector of probablities for the i'th example and idx is the index of the current example in the batch
# print(torch.argmax(ans), idx)
if torch.argmax(ans) == y[idx]:
correct += 1
total += 1
# claculate accuracy up to 3 digits after the dot
print("universal weight lr accuracy: ", round(correct / total, 3))
loss_plot.plot(title="universal weight learning rate")
# second, test with per-weight learning rates
net = Net()
# print(net)
print("training")
# create a new exponential distribution to sample from
learning_rates = ExponGenerator(scale=expon_scale)
# Adam yields better results, but we keep the manual update so the impact of the sampled learning rates stays visible
# optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_plot = Plotter(interval=100)
EPOCHS = 3
for i in range(EPOCHS):
    # the DataLoader delivers the data in mini-batches of 10
    for data in trainset:
        # unpack a batch of features and labels
        X, y = data
        # reset the gradients before each backward pass
        net.zero_grad()
        output = net(X.view(-1, 28*28))
        # the loss is a scalar; we use negative log-likelihood since the labels are class indices
        # (for one-hot targets, mean squared error would be an alternative)
        loss = F.nll_loss(output, y)
        loss_plot.update(loss)
        # backpropagate the loss to compute the gradients
        loss.backward()
        # for now we apply the update manually instead of using a fancy optimizer such as Adam
        for f in net.parameters():
            # alternative samplers (unused here):
            # rand_tensor = learning_rates.generate_levy_tensor(f.shape)  # Levy distribution
            # rand_tensor = torch.rand(f.shape) * alpha_0  # uniform, scaled by the default LR
            alpha = learning_rates.generate_tensor(shape=f.data.shape) + alpha_0
            f.data.sub_(alpha * f.grad.data)
    # for f in net.parameters():
    #     print(f.grad.data.shape)
    # print(f"loss: {loss} in epoch: {i}")
print("testing")
correct, total = 0, 0
with torch.no_grad():
for data in testset:
# split into label and sample and run them through the network
X, y = data
output = net(X.view(-1, 28 * 28))
# print(output)
# iterate over all the outputs and compare the network results to the data results
for idx, ans in enumerate(output):
# ans is the vector of probablities for the i'th example and idx is the index of the current example in the batch
# print(torch.argmax(ans), idx)
if torch.argmax(ans) == y[idx]:
correct += 1
total += 1
# claculate accuracy up to 3 digits after the dot
print("per weight lr accuracy: ", round(correct / total, 3))
loss_plot.plot(title="per-weight learning rate")