-- train-videoknet.lua
----------------------------------------------------------------------
-- Massive online trained network on videos
-- load all sorts of videos, run Clustering Learning, online-learn forever
-- January 18th 2013, E. Culurciello with discussion w/ Clement Farabet
--
-- 1. load a video
-- 2. for each few frames: extract patches, cluster-learn filter
-- 3. set up the net layer by layer: process video through each layer, then repeat steps 2-3 for all layers
-- 4. periodically check what is learned: test on dataset?
-- 5. enjoy the ultimate net - Yay!
--
----------------------------------------------------------------------
-- TODO: extend k-means to multiple "winners" = average on multiple kernels
-- TODO: create NMaxPool layer: propagate multiple max as winners or average a few of them
-- TODO: group features for pooling
-- TODO: volumetric nn.Tanh, nn.pooling, etc., so we can add more volumetric layers
-- TODO: test with ABS value on layer outputs
require 'nnx'
require 'eex'
require 'image'
--require 'kmec'
--require 'unsup'
require 'online-kmeans' -- allows you to re-train k-means kernels (see sketch below)
require 'ffmpeg'
require 'trainLayer' -- functions for Clustering Learning on video
require 'optim'
require "slac"
cmd = torch.CmdLine()
cmd:text('Options')
cmd:option('-visualize', true, 'display kernels')
cmd:option('-seed', 1, 'initial random seed')
cmd:option('-threads', 8, 'threads')
cmd:option('-inputsize', 9, 'size of each input patches')
cmd:option('-nkernels', 64, 'number of kernels to learn')
cmd:option('-niter', 15, 'nb of k-means iterations')
cmd:option('-batchsize', 1000, 'batch size for k-means\' inner loop')
cmd:option('-nsamples', 10*1000, 'nb of random training samples')
cmd:option('-initstd', 0.1, 'standard deviation to generate random initial templates')
cmd:option('-statinterval', 5000, 'interval for reporting stats/displaying stuff')
cmd:option('-savedataset', false, 'save modified dataset')
cmd:option('-classify', true, 'run classification train/test')
cmd:option('-nnframes', 4, 'nb of frames used for temporal learning of features')
cmd:option('-dataset', '../datasets/faces_cut_yuv_32x32/','path to FACE dataset root dir')
cmd:option('-patches', 'all', 'nb of patches to use')
-- loss:
cmd:option('-loss', 'nll', 'type of loss function to minimize: nll | mse | margin')
-- training:
cmd:option('-save', 'results', 'subdirectory to save/log experiments in')
cmd:option('-plot', false, 'live plot')
cmd:option('-optimization', 'SGD', 'optimization method: SGD | ASGD | CG | LBFGS')
cmd:option('-learningRate', 1e-3, 'learning rate at t=0')
cmd:option('-batchSize', 1, 'mini-batch size (1 = pure stochastic)')
cmd:option('-weightDecay', 0, 'weight decay (SGD only)')
cmd:option('-momentum', 0, 'momentum (SGD only)')
cmd:option('-t0', 1, 'start averaging at t0 (ASGD only), in nb of epochs')
cmd:option('-maxIter', 2, 'maximum nb of iterations for CG and LBFGS')
cmd:option('-type', 'double', 'type: double | float | cuda')
cmd:text()
opt = cmd:parse(arg or {}) -- pass parameters to training files:
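-- example invocation (values are illustrative; point -dataset at your own data):
--   th train-videoknet.lua -nkernels 32 -nnframes 2 -nsamples 5000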
--if not qt then
-- opt.visualize = false
--end
torch.manualSeed(opt.seed)
torch.setnumthreads(opt.threads)
torch.setdefaulttensortype('torch.DoubleTensor')
is = opt.inputsize
nk1 = opt.nkernels
nnf1 = opt.nnframes
print 'SUPER-NET script!'
----------------------------------------------------------------------
print '==> loading and processing (local-contrast-normalization) of dataset'
--dspath = '/Users/eugenioculurciello/Pictures/2013/1-13-13/VID_20130105_111419.mp4'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=false, load=false}
--dspath = '/Users/eugenioculurciello/Desktop/driving1.mov'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=false, load=false}
--dspath = '../datasets/TLD/06_car'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=true, load=false}
--dspath = '../datasets/TLD/08_volkswagen'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=true, load=false}
--dspath = '../datasets/TLD/09_carchase'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=true, load=false}
dspath = '../datasets/euge.mov'
--source = ffmpeg.Video{path=dspath, encoding='jpg', fps=24, loaddump=false, load=false}
-- smaller video test:
source = ffmpeg.Video{path=dspath, width = 120, height = 80, encoding='jpg', fps=24, loaddump=false, load=false}
rawFrame = source:forward()
-- input video params:
ivch = rawFrame:size(1) -- channels
ivhe = rawFrame:size(2) -- height
ivwi = rawFrame:size(3) -- width
source.current = 1 -- rewind video frames
-- number of frames to process:
nfpr = 200 + nnf1 -- batch process size [video frames]
-- normalize and prepare dataset:
neighborhood = image.gaussian1D(9)
normalization = nn.SpatialContrastiveNormalization(ivch, neighborhood, 1e-3)
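-- note: SpatialContrastiveNormalization performs subtractive then divisive local
-- normalization over the 9-tap gaussian neighborhood; the 1e-3 threshold keeps
-- the divisive step stable in flat, low-variance regions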
function createDataBatch()
trainData = torch.Tensor(nfpr,ivch,ivhe,ivwi)
for i = 1, nfpr do -- just get a few frames to begin with
procFrame = normalization:forward(rawFrame) -- full LCN!
trainData[i] = procFrame
rawFrame = source:forward()
end
return trainData
end
createDataBatch()
----------------------------------------------------------------------
print '==> generating filters for layer 1:'
nlayer = 1
-- FULL CONNECT MODEL:
kernels1 = trainLayer(nlayer, trainData, opt.nsamples, nil, nk1*ivch, nnf1, is) -- learn nk1*ivch (3*nk1) filters: better results! (no SLAC)
--kernels1 = trainLayer(nlayer, trainData, opt.nsamples, nil, nk1*ivch, 1, is) -- NO VOL FILTERS!
-- SLAC MODEL:
-- SLAC: nk1*4 filters to learn, then narrow down to nk1:
--kernels1 = trainLayer(nlayer, trainData, opt.nsamples, nil, nk1*4, nnf1, is) -- with slac
---- kernels1, connTable1 = slac(kernels1, startN, finalN,tau,Delta) -- SLAC algorithm to aggregate kernels
--kernels1s, connTable1 = slac(kernels1, nk1*4, nk1, 5, 4.5) -- SLAC algorithm to aggregate kernels
--image.display{image=kernels1s:reshape(kernels1s:size(1),is,is), padding=2, symmetric=true, zoom=2} --slac kernels/groups
--nk1s=kernels1s:size(1)
----------------------------------------------------------------------
print '==> create model 1st layer:'
poolsize = 2
cvstepsize = 1
normkernel = image.gaussian1D(7)
ovhe = (ivhe-is+1)/poolsize/cvstepsize -- output video feature height
ovwi = (ivwi-is+1)/poolsize/cvstepsize -- output video feature width
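-- with the defaults above (120x80 frames, is=9, poolsize=2, cvstepsize=1):
-- ovhe = (80-9+1)/2 = 36, ovwi = (120-9+1)/2 = 56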
-- FULL CONNEX MODEL:
vnet = nn.Sequential()
-- usage: VolumetricConvolution(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH)
vnet:add(nn.VolumetricConvolution(ivch, nk1, nnf1, is, is, 1, cvstepsize,cvstepsize))
vnet:add(nn.Sum(2)) -- needed by volconv
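-- note: with kT = nnf1 and dT = 1 the VolumetricConvolution output has a
-- singleton time dimension (nk1 x 1 x oH x oW); Sum(2) collapses it so the
-- rest of the net sees ordinary 2D feature maps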
--vnet:add(nn.SpatialSAD(ivch, nk1, is, is))
--vnet:add(nn.SpatialSubtractiveNormalization(nk1, normkernel))
vnet:add(nn.Tanh())
--vnet:add(nn.HardShrink(0.5)) -- tried: really bad, maybe with SAD ok?
vnet:add(nn.SpatialMaxPooling(poolsize, poolsize))
--vnet:add(nn.SpatialLPPooling(nk1, 2, poolsize, poolsize, poolsize, poolsize))
--vnet:add(nn.SpatialSubtractiveNormalization(nk1, normkernel))
vnet:add(nn.SpatialContrastiveNormalization(nk1, normkernel,1e-3))
-- load kernels into network:
kernels1:div(nnf1*nk1*ivch/4) -- divide kernels so output of SpatialConv is about ~1 or more
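-- with the defaults this divides by nnf1*nk1*ivch/4 = 4*64*3/4 = 192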
--vnet.modules[1].weight = kernels1:reshape(nk1,ivch,nnf1,is,is) -- full connex filters!
--vnet.modules[1].weight = kernels1:reshape(nk1,ivch,is,is) -- for spatial SAD
-- NO VOL FILTERS!
vnet.modules[1].weight = kernels1:reshape(nk1,ivch,1,is,is):expand(nk1,ivch,nnf1,is,is) -- full connex filters!
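-- note: expand() creates a view, not a copy: the singleton time dimension is
-- repeated nnf1 times, so the same spatial kernel is applied at every frame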
-- SLAC MODEL:
--vnet = nn.Sequential()
---- usage: VolumetricConvolution(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH)
--vnet:add(nn.VolumetricConvolution(ivch, nk1s, nnf1, is, is, 1, cvstepsize,cvstepsize)) --SLAC with nk1s
--vnet:add(nn.Sum(2)) -- needed by volconv
--vnet:add(nn.SpatialMaxMap(connTable1)) -- slac function to pick max(each group) from VolConv layer
--vnet:add(nn.Tanh())
--vnet:add(nn.SpatialMaxPooling(poolsize, poolsize))
----vnet:add(nn.SpatialLPPooling(nk1, 2, poolsize, poolsize, poolsize, poolsize))
----vnet:add(nn.SpatialSubtractiveNormalization(nk1, normkernel))
--vnet:add(nn.SpatialContrastiveNormalization(nk1, normkernel,1e-3))
--
---- load kernels into network:
--kernels1s:div(nnf1*nk1*ivch/2) -- divide kernels so output of SpatialConv is about ~1 or more
--vnet.modules[1].weight = kernels1s:reshape(nk1s,nnf1,is,is):reshape(nk1s,1,nnf1,is,is):expand(nk1s,ivch,nnf1,is,is) -- SLAC
----------------------------------------------------------------------
print '==> process video through 1st layer:'
function processLayer(lv, network, data_in, nkernels, oheight, owidth)
data_out = torch.Tensor(nfpr, nkernels, oheight, owidth)
for i = nnf1, nfpr do -- start at nnf1 so a full temporal window of frames is available
if lv == 1 then procFrames = data_in[{{i-nnf1+1,i},{},{}}]:transpose(1,2) -- swap order of indices here for VolConvolution to work
else procFrames = data_in[i] end
data_out[i] = network:forward(procFrames)
xlua.progress(i, nfpr)
-- do a live display of the input video and output feature maps
winm = image.display{image=data_out[i], padding=2, zoom=1, win=winm, nrow=math.floor(math.sqrt(nkernels))}
end
-- data_out = nil --free memory if needed
return data_out
end
trainData2 = processLayer(1, vnet, trainData, nk1, ovhe, ovwi)
--report some statistics:
print('1st layer conv out. Max: '..vnet.modules[1].output:max()..' and min: '..vnet.modules[1].output:min()..' and mean: '..vnet.modules[1].output:mean())
print('1st layer output. Max: '..vnet.output:max()..' and min: '..vnet.output:min()..' and mean: '..vnet.output:mean())
----------------------------------------------------------------------
print '==> generating filters for layer 2:'
nlayer = 2
nnf2 = 1
nk2 = 128
-- FULL CONNEX MODEL:
-- we get better results training more filters for the fully connected system: it increases the number of distinct kernels!
kernels2 = trainLayer(nlayer, trainData2, opt.nsamples*10, nil, nk2*nk1, nnf2, is) -- nk2*nk1 to train more filters for full connex
-- SLAC MODEL:
-- SLAC: nk1*4 filters to learn, then narrow down to nk1:
--kernels2 = trainLayer(nlayer, trainData2, opt.nsamples, nil, nk2*nk1, nnf2, is) -- with slac
--kernels2s, connTable2 = slac(kernels2, nk2*nk1, nk2, 5, 4.5) -- SLAC algorithm to aggregate kernels
--image.display{image=kernels2s:reshape(kernels2s:size(1),is,is), padding=2, symmetric=true, zoom=2} --slac kernels/groups
--nk2s=kernels2s:size(1)
----------------------------------------------------------------------
print '==> create model 2nd layer:'
poolsize = 2
cvstepsize = 1
ovhe2 = (ovhe-is+1)/poolsize/cvstepsize -- output video feature height
ovwi2 = (ovwi-is+1)/poolsize/cvstepsize -- output video feature width
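-- with the sizes from layer 1: ovhe2 = (36-9+1)/2 = 14, ovwi2 = (56-9+1)/2 = 24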
fanin = 16
-- FULL CONNEX MODEL:
vnet2 = nn.Sequential()
--vnet2:add(nn.SpatialConvolutionMap(nn.tables.random(nk1, nk2, fanin), is, is,cvstepsize,cvstepsize)) -- random conn table. NOT WORKING NOW: we do not know how to set filters for this one
--vnet2:add(nn.SpatialConvolutionMap(createConnexTable(nk1, nk2, 3), is, is,cvstepsize,cvstepsize)) -- use custom connex table by Euge
vnet2:add(nn.SpatialConvolution(nk1, nk2, is, is,cvstepsize,cvstepsize)) -- fully connected (BEST NOW)
--vnet:add(nn.SpatialSAD(nk1, nk2, is, is, cvstepsize,cvstepsize))
--vnet:add(nn.SpatialSubtractiveNormalization(nk2, normkernel))
vnet2:add(nn.Tanh())
--vnet2:add(nn.SpatialLPPooling(nk2, 2, poolsize, poolsize, poolsize, poolsize))
vnet2:add(nn.SpatialMaxPooling(poolsize, poolsize))
--vnet2:add(nn.SpatialSubtractiveNormalization(nk2, normkernel))
vnet2:add(nn.SpatialContrastiveNormalization(nk2, normkernel,1e-3))
-- load kernels into network:
kernels2:div(nk2/2) -- divide kernels so output of SpatialConv is about ~1 or more
--vnet2.modules[1].weight = kernels2:reshape(nk2,is,is):reshape(nk2,1,is,is):expand(nk2,nk1,is,is) -- reuse less filters (worse)
vnet2.modules[1].weight = kernels2:reshape(nk2,nk1,is,is) --full connex filters
-- SLAC MODEL:
--vnet2 = nn.Sequential()
--vnet2:add(nn.SpatialConvolution(nk1, nk2s, is, is,cvstepsize,cvstepsize)) -- nk2s for SLAC
--vnet2:add(nn.SpatialMaxMap(connTable2)) -- slac function to pick max(each group) from VolConv layer
--vnet2:add(nn.Tanh())
--vnet2:add(nn.SpatialMaxPooling(poolsize, poolsize))
--vnet2:add(nn.SpatialSubtractiveNormalization(nk2, normkernel))
---- load kernels into network:
--kernels2s:div(nk2/2) -- divide kernels so output of SpatialConv is about ~1 or more
--vnet2.modules[1].weight = kernels2s:reshape(nk2s,is,is):reshape(nk2s,1,is,is):expand(nk2s,nk1,is,is) -- SLAC
----------------------------------------------------------------------
print '==> process video through 2nd layer:'
print 'Initial frames will be blank because of the VolConv on 1st layer!'
trainData3 = processLayer(2, vnet2, trainData2, nk2, ovhe2, ovwi2)
--report some statistics:
print('2nd layer conv out. Max: '..vnet2.modules[1].output:max()..' and min: '..vnet2.modules[1].output:min()..' and mean: '..vnet2.modules[1].output:mean())
print('2nd layer output. Max: '..vnet2.output:max()..' and min: '..vnet2.output:min()..' and mean: '..vnet2.output:mean())
----------------------------------------------------------------------
-- 2 layer test:
print '==> Test network'
dofile 'test-videoknet.lua'
torch.load() -- intentionally errors out to stop the script here
----------------------------------------------------------------------
print '==> generating filters for layer 3:'
nlayer = 3
nnf3 = 1
nk3 = 128
kernels3 = trainLayer(nlayer, trainData3, opt.nsamples, nil, nk3, nnf3, is) -- train 3rd-layer filters on the 2nd-layer output
----------------------------------------------------------------------
print '==> create model 3rd layer:'
poolsize = 2
cvstepsize = 1
ovhe3 = (ovhe2-is+1)/poolsize/cvstepsize -- output video feature height
ovwi3 = (ovwi2-is+1)/poolsize/cvstepsize -- output video feature width
vnet3 = nn.Sequential()
vnet3:add(nn.SpatialConvolution(nk2, nk3, is, is,cvstepsize,cvstepsize))
vnet3:add(nn.Tanh())
vnet3:add(nn.SpatialLPPooling(nk3, 2, poolsize, poolsize, poolsize, poolsize))
vnet3:add(nn.SpatialSubtractiveNormalization(nk3, normkernel))
-- load kernels into network:
kernels3:div(nk3) -- divide kernels so output of SpatialConv is about ~1 or more
vnet3.modules[1].weight = kernels3:reshape(nk3,is,is):reshape(nk3,1,is,is):expand(nk3,nk2,is,is)
----------------------------------------------------------------------
-- 3 layer test:
print '==> Test network'
dofile 'test-videoknet.lua'
torch.load() -- intentionally errors out to stop the script here
----------------------------------------------------------------------
----------------------------------------------------------------------
print '==> Now test a few loops of online learning on video'
-- save older kernels to x-check online routines:
kernels1_old = kernels1:clone()
kernels2_old = kernels2:clone()
-- generate more samples:
source.current = source.current - nnf1 -- rewind video
createDataBatch()
-- update kernels with new data:
kernels1 = trainLayer(1, trainData, opt.nsamples, kernels1, nk1, nnf1, is) -- pass old kernels as starting centroids
kernels2 = trainLayer(2, trainData2, opt.nsamples, kernels2, nk2, nnf2, is)
trainData2 = processLayer(1, vnet, trainData, nk1, ovhe, ovwi) -- re-run the video through layer 1 with the updated kernels
--report some statistics:
print('1st layer max: '..vnet.modules[1].output:max()..' and min: '..vnet.modules[1].output:min()..' and mean: '..vnet.modules[1].output:mean())
trainData3 = processLayer(2, vnet2, trainData2, nk2, ovhe2, ovwi2) -- re-run layer 2 with the updated kernels
--report some statistics:
print('2nd layer max: '..vnet2.modules[1].output:max()..' and min: '..vnet2.modules[1].output:min()..' and mean: '..vnet2.modules[1].output:mean())
-- show filters before and after new training:
--image.display{image=kernels1:reshape(nk1,nnf1*is,is), padding=2, symmetric=true, zoom=2, nrow=math.floor(math.sqrt(nk1)), legend='Layer '..nlayer..' filters'}
--image.display{image=kernels1_old:reshape(nk1,nnf1*is,is), padding=2, symmetric=true, zoom=2, nrow=math.floor(math.sqrt(nk1)), legend='Layer '..nlayer..' filters'}
--
--image.display{image=kernels2:reshape(nk2,nnf2*is,is), padding=2, symmetric=true, zoom=2, nrow=math.floor(math.sqrt(nk2)), legend='Layer '..nlayer..' filters'}
--image.display{image=kernels2_old:reshape(nk2,nnf2*is,is), padding=2, symmetric=true, zoom=2, nrow=math.floor(math.sqrt(nk2)), legend='Layer '..nlayer..' filters'}