# trial.py
# # import torch
# # import numpy as np
# # import torch.nn as nn
# # import torch.nn.functional as F
# # import matplotlib.pyplot as plt
# # from soft_argmax import SoftArgmax1D
# # def plot_grad_flow(named_parameters):
# #     """Plot the mean absolute gradient of each layer after backward()."""
# #     ave_grads = []
# #     layers = []
# #     for n, p in named_parameters:
# #         if p.requires_grad and ("bias" not in n):
# #             layers.append(n)
# #             ave_grads.append(p.grad.abs().mean())
# #     plt.plot(ave_grads, alpha=0.3, color="b")
# #     plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
# #     plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
# #     plt.xlim(left=0, right=len(ave_grads))  # xmin/xmax kwargs are deprecated
# #     plt.xlabel("Layers")
# #     plt.ylabel("average gradient")
# #     plt.title("Gradient flow")
# #     plt.grid(True)
# # class ArgMax(torch.autograd.Function):
# #     """Hard argmax with a straight-through gradient."""
# #     @staticmethod
# #     def forward(ctx, input):
# #         idx = torch.argmax(input, 1)
# #         output = torch.zeros_like(input)
# #         # scatter_ needs the index to have the same number of dims as output
# #         output.scatter_(1, idx.unsqueeze(1), 1)
# #         ctx.save_for_backward(output)
# #         return output
# #     @staticmethod
# #     def backward(ctx, grad_output):
# #         # Straight-through estimator: pass the incoming gradient unchanged.
# #         return grad_output
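# # A minimal sanity check (my addition, not in the original trial, assuming the
# # class above): the forward pass emits hard one-hot rows, yet the gradient
# # flows through as if the op were the identity.
# # x = torch.randn(2, 5, requires_grad=True)
# # y = ArgMax.apply(x)   # one-hot rows; the forward itself is non-differentiable
# # y.sum().backward()    # straight-through: dL/dx equals dL/dy
# # print(x.grad)         # a tensor of ones, same shape as x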
# # class Net1(nn.Module):
# #     def __init__(self):
# #         super().__init__()
# #         self.model = nn.Sequential(
# #             nn.Linear(1000, 100),
# #             nn.ReLU(),
# #             nn.Linear(100, 100))
# #     def forward(self, x):
# #         return self.model(x)
# # class Net2(nn.Module):
# #     def __init__(self):
# #         super().__init__()
# #         self.model = nn.Sequential(
# #             nn.Linear(1000, 100),
# #             nn.ReLU(),
# #             nn.Linear(100, 100))
# #     def forward(self, x):
# #         return self.model(x)
# # train_x, train_y = np.random.randn(10, 1000), np.random.choice(100, size=10)
# # valid_x, valid_y = np.random.randn(10, 1000), np.random.choice(100, size=10)
# # pseudo_x = np.random.randn(10, 1000)
# # train_x, train_y = torch.tensor(train_x).float(), torch.tensor(train_y)
# # valid_x, valid_y = torch.tensor(valid_x).float(), torch.tensor(valid_y)
# # pseudo_x = torch.tensor(pseudo_x).float()
# # n1 = Net1()
# # n2 = Net2()
# # optim1 = torch.optim.Adam(n1.parameters(), lr=0.001)
# # optim2 = torch.optim.Adam(n2.parameters(), lr=0.001)
# # criterion = nn.CrossEntropyLoss()
# # argmax = SoftArgmax1D()
# # for i in range(100):
# #     optim1.zero_grad()
# #     logits = n1(train_x)
# #     # CrossEntropyLoss already returns the mean; a bare loss.mean() is a no-op
# #     loss = criterion(logits, train_y)
# #     loss.backward()
# #     # plot_grad_flow(n1.named_parameters())
# #     optim1.step()
# # print("initial logits from n1: ")
# # print(n1(train_x))
# # pseudo_logits = n1(pseudo_x)
# # # round() (and the .long() cast below) detaches pseudo_y from n1's graph
# # pseudo_y = argmax(pseudo_logits).round()
# # print("pseudo_y: ", pseudo_y)
# # optim2.zero_grad()
# # logits = n2(pseudo_x)
# # loss = criterion(logits, pseudo_y.long())
# # loss.backward()
# # optim2.step()
# # # Probe whether n2's validation loss reaches n1. It cannot: the pseudo-label
# # # path was already cut by round()/.long(), so no gradient arrives at n1.
# # optim1.zero_grad()
# # logits = n2(valid_x)
# # loss = criterion(logits, valid_y)
# # loss.backward()
# # plot_grad_flow(n1.named_parameters())
# # optim1.step()
# # plt.show()
# # print("after n2 step: ")
# # print(n1(train_x))
# import torch.nn as nn
# import torch
# # from utils import get_logger
# """Implements the EmbeddingMul class
# Author: Noémien Kocher
# Date: Fall 2018
# Unit test: embedding_mul_test.py
# """
# # A pytorch module cannot have a logger as its attribute, because
# # it then could not be serialized; keep it at module level instead.
# logger = None
# # def set_logger(alogger):
# #     global logger
# #     logger = alogger
# class EmbeddingMul(nn.Module):
#     """Implements a custom embedding module which uses matrix
#     multiplication instead of a lookup. The method works in the
#     functional way.
#     Note: this class accepts the arguments of the original pytorch
#     module, but only with values that have no effect, i.e. set to
#     False, None or -1.
#     """
#     def __init__(self, depth, device):
#         super(EmbeddingMul, self).__init__()
#         # depth is the dictionary size
#         self.depth = depth
#         self.device = device
#         self.ones = torch.eye(depth, requires_grad=False, device=self.device)
#         self._requires_grad = True
#         # "oh" means One Hot
#         self.last_oh = None
#         self.last_weight = None
#     @property
#     def requires_grad(self):
#         return self._requires_grad
#     @requires_grad.setter
#     def requires_grad(self, value):
#         self._requires_grad = value
#         if logger is not None:
#             logger.info(
#                 f"(embedding mul) requires_grad set to {self.requires_grad}.")
#     def forward(self, input, weight, padding_idx=-1, max_norm=None,
#                 norm_type=2., scale_grad_by_freq=False, sparse=False):
#         """Declares the same arguments as the original pytorch
#         implementation, but only for backward compatibility. Their
#         values must be set to have no effect.
#         Args:
#         - input: of shape (bptt, bsize)
#         - weight: of shape (dict_size, emsize)
#         Returns:
#         - result: of shape (bptt, bsize, emsize)
#         """
#         # ____________________________________________________________________
#         # Check whether unsupported arguments are used
#         # (padding_idx defaults to -1 so the check below passes by default)
#         if padding_idx != -1:
#             raise NotImplementedError(
#                 f"padding_idx must be -1, not {padding_idx}")
#         if max_norm is not None:
#             raise NotImplementedError(f"max_norm must be None, not {max_norm}")
#         if scale_grad_by_freq:
#             raise NotImplementedError(f"scale_grad_by_freq must be False, "
#                                       f"not {scale_grad_by_freq}")
#         if sparse:
#             raise NotImplementedError(f"sparse must be False, not {sparse}")
#         # ____________________________________________________________________
#         if self.last_oh is not None:
#             del self.last_oh
#         self.last_oh = self.to_one_hot(input)
#         with torch.set_grad_enabled(self.requires_grad):
#             # one-hot rows times the weight matrix == a differentiable lookup
#             result = torch.stack(
#                 [torch.mm(batch.float(), weight)
#                  for batch in self.last_oh], dim=0)
#             self.last_weight = weight.clone()
#         return result
#     def to_one_hot(self, input):
#         # Returns a new tensor that doesn't share memory with the input
#         result = torch.index_select(
#             self.ones, 0, input.view(-1).long()).view(
#             input.size() + (self.depth,))
#         result.requires_grad = self.requires_grad
#         return result
#     def __repr__(self):
#         return self.__class__.__name__ + "({})".format(self.depth)
# if __name__ == "__main__":
#     input = torch.tensor([[1., 2., 0.], [3., 4., 5.]])
#     dim = 10
#     mod = EmbeddingMul(dim, 'cpu')
#     emmatrix = torch.rand(10, 5, requires_grad=True)
#     print(emmatrix)
#     output = mod(input, emmatrix, -1)
#     output = output.mean()
#     print(torch.autograd.grad(output, emmatrix))
#     print(output)
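#     # A quick equivalence check (my addition, assuming the demo above): the
#     # one-hot matmul should reproduce a plain embedding lookup while staying
#     # differentiable w.r.t. the weight matrix.
#     import torch.nn.functional as F
#     lookup = F.embedding(input.long(), emmatrix)  # standard lookup
#     assert torch.allclose(mod(input, emmatrix, -1), lookup)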
import torch
# class HotEmbedding(torch.nn.Module):
#     def __init__(self, max_val, embedding_dim, eps=1e-2):
#         super(HotEmbedding, self).__init__()
#         # float dtype so the subtraction against float inputs broadcasts cleanly
#         self.A = torch.arange(max_val, dtype=torch.float32, requires_grad=False)
#         # nn.Parameter so the embedding matrix is registered with optimizers
#         self.B = torch.nn.Parameter(torch.randn(max_val, embedding_dim))
#         self.eps = eps
#     def forward(self, x):
#         # (bsize, seq, 1) - (max_val,) broadcasts to (bsize, seq, max_val);
#         # the kernel 1/((x^2 - a^2) + eps) peaks where x is close to a,
#         # giving a soft, differentiable "one-hot" row per input value.
#         weights = 1.0 / ((x.unsqueeze(-1) ** 2 - self.A ** 2) + self.eps)
#         return weights @ self.B  # (bsize, seq, embedding_dim)
# layer = HotEmbedding(10331, 512)
# x = torch.tensor([[1., 2., 3., 1., 2., 3.], [1., 2., 3., 1., 2., 3.]], requires_grad=True)
# y = layer(x)
# print(y.shape)  # torch.Size([2, 6, 512])
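# What this buys (a note of mine, not in the original): unlike nn.Embedding,
# this soft lookup is differentiable w.r.t. the real-valued indices themselves,
# so gradients can flow back into x:
# y.mean().backward()
# print(x.grad.shape)  # torch.Size([2, 6]): d(embedding)/d(index) exists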
a = torch.tensor([1., 2., 3., 4.])
# a.requires_grad = True
# b = torch.eye(5)
# for i in range(4):
#     # a[i] / a.detach()[i] is exactly 1 in value, but keeps a in the graph,
#     # so b behaves like the identity while staying differentiable w.r.t. a
#     b[i][i] = a[i] / a.detach()[i]
# print(b)
# c = torch.randn(5, 512, requires_grad=True)
# y = b @ c
# print(torch.autograd.grad(y.mean(), a, retain_graph=True))
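# The same value/gradient decoupling without the loop (my sketch, not in the
# original): a / a.detach() is exactly ones in value but keeps a in the autograd
# graph, a per-element form of the straight-through idiom
# hard + (soft - soft.detach()).
# scale = a / a.detach()                 # tensor of ones, differentiable w.r.t. a
# b2 = torch.eye(5)
# b2[:4, :4] = torch.diag(scale)         # same values as the loop above
# print(torch.equal(b2, torch.eye(5)))   # True: numerically still the identity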