diff --git a/include/darknet.h b/include/darknet.h
index 4483d166a3c..69d459694f6 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -176,6 +176,7 @@ typedef enum {
     GRU,
     LSTM,
     CONV_LSTM,
+    HISTORY,
     CRNN,
     BATCHNORM,
     NETWORK,
@@ -272,6 +273,7 @@ struct layer {
     int keep_delta_gpu;
     int optimized_memory;
     int steps;
+    int history_size;
     int bottleneck;
     float time_normalizer;
     int state_constrain;
diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 9aec1874dc0..d56eb1c13e3 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -240,6 +240,180 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
     return l;
 }
 
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train)
+{
+    //steps = 1;
+    layer l = { (LAYER_TYPE)0 };
+    l.train = train;
+    l.batch = batch;
+    l.type = HISTORY;
+    l.steps = steps;
+    l.history_size = history_size;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.out_h = h;
+    l.out_w = w;
+    l.out_c = c * history_size;
+    l.inputs = h * w * c;
+    l.outputs = h * w * c * history_size;
+
+    l.forward = forward_history_layer;
+    l.backward = backward_history_layer;
+
+    fprintf(stderr, "HISTORY b = %d, s = %2d, steps = %2d  %4d x%4d x%4d -> %4d x%4d x%4d \n", l.batch / l.steps, l.history_size, l.steps, w, h, c, l.out_w, l.out_h, l.out_c);
+
+    l.output = (float*)xcalloc(l.batch * l.outputs, sizeof(float));
+    l.delta = (float*)xcalloc(l.batch * l.outputs, sizeof(float));
+
+    l.prev_state_cpu = (float*)xcalloc(l.batch*l.outputs, sizeof(float));
+
+#ifdef GPU
+
+    l.forward_gpu = forward_history_layer_gpu;
+    l.backward_gpu = backward_history_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, l.batch * l.outputs);
+    l.delta_gpu = cuda_make_array(0, l.batch * l.outputs);
+
+    l.prev_state_gpu = cuda_make_array(0, l.batch*l.outputs);
+
+#endif // GPU
+
+    //l.batch = 4;
+    //l.steps = 1;
+
+    return l;
+}
+
+void forward_history_layer(layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    float *prev_output = l.prev_state_cpu;
+
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        // shift cell
+        int shift_size = l.inputs * (l.history_size - 1);
+        int output_shift = l.inputs;
+
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *input = state.input + input_start;
+            float *output = l.output + output_start;
+
+            copy_cpu(shift_size, prev_output + b*l.outputs, 1, output + output_shift, 1);
+
+            copy_cpu(l.inputs, input, 1, output, 1);
+
+        }
+        prev_output = l.output + i*l.outputs*batch;
+    }
+
+    int output_start = (l.steps-1)*l.outputs*batch;
+    copy_cpu(batch*l.outputs, l.output + output_start, 1, l.prev_state_cpu, 1);
+}
+
+void backward_history_layer(layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta + output_start;
+
+            //copy_cpu(l.inputs, l_delta, 1, state_delta, 1);
+            axpy_cpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+
+#ifdef GPU
+void forward_history_layer_gpu(const layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    //int copy_size = l.inputs*batch*l.steps;
+    //printf(" copy_size = %d, inputs = %d, batch = %d, steps = %d, l.history_size = %d \n", copy_size, l.inputs, batch, l.steps, l.history_size);
+    //simple_copy_ongpu(copy_size, state.input, l.output_gpu);
+    //return;
+
+    //fill_ongpu(batch*l.outputs, 0, l.prev_state_gpu, 1);
+    float *prev_output = l.prev_state_gpu;
+
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        // shift cell
+        int shift_size = l.inputs * (l.history_size - 1);
+        int output_shift = l.inputs;
+
+        int b;
+        for (b = 0; b < batch; ++b) {
+            //printf(" hist-fw: i = %d, b = %d \n", i, b);
+
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *input = state.input + input_start;
+            float *output = l.output_gpu + output_start;
+
+            //copy_cpu(shift_size, prev_output + b*l.outputs, 1, output + output_shift, 1);
+            simple_copy_ongpu(shift_size, prev_output + b*l.outputs, output + output_shift);
+
+            //copy_cpu(l.inputs, input, 1, output, 1);
+            simple_copy_ongpu(l.inputs, input, output);
+
+            int h;
+            for (h = 1; h < l.history_size; ++h) {
+                //scal_ongpu(l.inputs, (l.history_size - h)/ (float)l.history_size, output + h*l.inputs, 1);
+                //scal_ongpu(l.inputs, 0, output + h*l.inputs, 1);
+            }
+        }
+        prev_output = l.output_gpu + i*l.outputs*batch;
+    }
+
+    int output_start = (l.steps - 1)*l.outputs*batch;
+    //copy_cpu(batch*l.outputs, l.output + output_start, 1, l.prev_state_cpu, 1);
+    simple_copy_ongpu(batch*l.outputs, l.output_gpu + output_start, l.prev_state_gpu);
+}
+
+void backward_history_layer_gpu(const layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    //int copy_size = l.inputs*batch*l.steps;
+    //printf(" copy_size = %d, inputs = %d, batch = %d, steps = %d, l.history_size = %d \n", copy_size, l.inputs, batch, l.steps, l.history_size);
+    //axpy_ongpu(copy_size, 1, l.delta_gpu, 1, state.delta, 1);
+    //return;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            //printf(" hist-bw: i = %d, b = %d \n", i, b);
+
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta_gpu + output_start;
+
+            //copy_cpu(l.inputs, l_delta, 1, state_delta, 1);
+            axpy_ongpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+#endif
+
+
 void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
 {
     if (l.peephole) {
diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h
index a79c06f6453..fae59f14996 100644
--- a/src/conv_lstm_layer.h
+++ b/src/conv_lstm_layer.h
@@ -20,10 +20,17 @@ void forward_conv_lstm_layer(layer l, network_state state);
 void backward_conv_lstm_layer(layer l, network_state state);
 void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay);
 
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train);
+void forward_history_layer(layer l, network_state state);
+void backward_history_layer(layer l, network_state state);
+
 #ifdef GPU
 void forward_conv_lstm_layer_gpu(layer l, network_state state);
 void backward_conv_lstm_layer_gpu(layer l, network_state state);
 void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+
+void forward_history_layer_gpu(const layer l, network_state state);
+void backward_history_layer_gpu(const layer l, network_state state);
 #endif
 
 #ifdef __cplusplus
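Note on the new layer's semantics: forward_history_layer writes the newest frame into the first l.inputs entries of the output and copies the previous step's output into the remaining l.inputs * (history_size - 1) entries, so each output is a sliding window over the last history_size inputs concatenated along channels (out_c = c * history_size). A minimal standalone sketch of that shift-and-copy behavior, with toy sizes and plain memcpy in place of darknet's copy_cpu (illustration only, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Toy HISTORY cell: one float per frame, window of 4 frames. */
        enum { INPUTS = 1, HISTORY_SIZE = 4 };
        float prev_state[INPUTS * HISTORY_SIZE] = { 0 }; /* plays l.prev_state_cpu */
        float output[INPUTS * HISTORY_SIZE] = { 0 };     /* plays l.output */

        int t;
        for (t = 1; t <= 6; ++t) {
            float frame = (float)t; /* plays state.input at step t */

            /* shift cell: previous window moves up by one frame slot */
            memcpy(output + INPUTS, prev_state, sizeof(float) * INPUTS * (HISTORY_SIZE - 1));
            /* newest frame lands in the first INPUTS entries */
            memcpy(output, &frame, sizeof(float) * INPUTS);
            /* current output becomes prev_state for the next step */
            memcpy(prev_state, output, sizeof(output));

            printf("t=%d: [%g %g %g %g]\n", t, output[0], output[1], output[2], output[3]);
        }
        return 0;
    }

This prints t=1: [1 0 0 0], t=2: [2 1 0 0], ..., t=6: [6 5 4 3]: once the window is full, the oldest frame simply falls off the end. Note that backward_history_layer routes only the first l.inputs entries of each output delta back into state.delta, so the shifted history copies receive no gradient through earlier time steps in this implementation.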
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index eaa8fcd8ca7..ae922eef921 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -569,7 +569,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             float *a = l.weights_gpu + j*l.nweights / l.groups;
             float *b = state.workspace;
             float *c = l.output_gpu + (i*l.groups + j)*n*m;
-            if (l.size == 1) {
+            if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
                 b = im;
             }
             else {
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index a169f3fe87c..1ee241ea3a9 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -1364,7 +1364,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
         else {
             //printf(" l.index = %d - FP32 \n", l.index);
             float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
-            if (l.size == 1) {
+            if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
                 b = im;
             }
             else {
diff --git a/src/parser.c b/src/parser.c
index 474a525882d..59d71bed45b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -69,6 +69,7 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[gru]")==0) return GRU;
     if (strcmp(type, "[lstm]")==0) return LSTM;
    if (strcmp(type, "[conv_lstm]") == 0) return CONV_LSTM;
+    if (strcmp(type, "[history]") == 0) return HISTORY;
     if (strcmp(type, "[rnn]")==0) return RNN;
     if (strcmp(type, "[conn]")==0
             || strcmp(type, "[connected]")==0) return CONNECTED;
@@ -329,6 +330,13 @@ layer parse_conv_lstm(list *options, size_params params)
     return l;
 }
 
+layer parse_history(list *options, size_params params)
+{
+    int history_size = option_find_int(options, "history_size", 4);
+    layer l = make_history_layer(params.batch, params.h, params.w, params.c, history_size, params.time_steps, params.train);
+    return l;
+}
+
 connected_layer parse_connected(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
@@ -1377,6 +1385,8 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
             l = parse_lstm(options, params);
         }else if (lt == CONV_LSTM) {
             l = parse_conv_lstm(options, params);
+        }else if (lt == HISTORY) {
+            l = parse_history(options, params);
         }else if(lt == CRNN){
             l = parse_crnn(options, params);
         }else if(lt == CONNECTED){
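The convolutional_kernels.cu and convolutional_layer.c hunks are a correctness guard rather than part of the new layer: skipping im2col and feeding the input directly into the GEMM (b = im) is only valid for a true pointwise convolution, so the fast path now also requires stride == 1 and dilation == 1.

On the cfg side, parse_history reads a single option, history_size (default 4). A sketch of how the new section might appear in a network cfg; the surrounding [convolutional] layer and its parameters are illustrative, not taken from this patch:

    [convolutional]
    batch_normalize=1
    filters=64
    size=3
    stride=1
    pad=1
    activation=leaky

    # Stacks the current feature map with the previous history_size-1
    # time steps along the channel axis: out_c = c * history_size
    # (64 * 4 = 256 here), with out_h and out_w unchanged.
    [history]
    history_size=4

Since the layer carries prev_state between forward calls, the history window persists across sequential frames; the parser passes params.time_steps as the layer's steps.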