diff --git a/include/darknet.h b/include/darknet.h
index 4483d166a3c..69d459694f6 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -176,6 +176,7 @@ typedef enum {
     GRU,
     LSTM,
     CONV_LSTM,
+    HISTORY,
     CRNN,
     BATCHNORM,
     NETWORK,
@@ -272,6 +273,7 @@ struct layer {
     int keep_delta_gpu;
     int optimized_memory;
     int steps;
+    int history_size;
     int bottleneck;
     float time_normalizer;
     int state_constrain;
diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 9aec1874dc0..d56eb1c13e3 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -240,6 +240,180 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
     return l;
 }
 
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train)
+{
+    //steps = 1;
+    layer l = { (LAYER_TYPE)0 };
+    l.train = train;
+    l.batch = batch;
+    l.type = HISTORY;
+    l.steps = steps;
+    l.history_size = history_size;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.out_h = h;
+    l.out_w = w;
+    l.out_c = c * history_size;
+    l.inputs = h * w * c;
+    l.outputs = h * w * c * history_size;
+
+    l.forward = forward_history_layer;
+    l.backward = backward_history_layer;
+
+    fprintf(stderr, "HISTORY b = %d, s = %2d, steps = %2d  %4d x%4d x%4d -> %4d x%4d x%4d \n", l.batch / l.steps, l.history_size, l.steps, w, h, c, l.out_w, l.out_h, l.out_c);
+
+    l.output = (float*)xcalloc(l.batch * l.outputs, sizeof(float));
+    l.delta = (float*)xcalloc(l.batch * l.outputs, sizeof(float));
+
+    l.prev_state_cpu = (float*)xcalloc(l.batch*l.outputs, sizeof(float));
+
+#ifdef GPU
+
+    l.forward_gpu = forward_history_layer_gpu;
+    l.backward_gpu = backward_history_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, l.batch * l.outputs);
+    l.delta_gpu = cuda_make_array(0, l.batch * l.outputs);
+
+    l.prev_state_gpu = cuda_make_array(0, l.batch*l.outputs);
+
+#endif // GPU
+
+    //l.batch = 4;
+    //l.steps = 1;
+
+    return l;
+}
+
+void forward_history_layer(layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    float *prev_output = l.prev_state_cpu;
+
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        // shift cell
+        int shift_size = l.inputs * (l.history_size - 1);
+        int output_shift = l.inputs;
+
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *input = state.input + input_start;
+            float *output = l.output + output_start;
+
+            copy_cpu(shift_size, prev_output + b*l.outputs, 1, output + output_shift, 1);
+
+            copy_cpu(l.inputs, input, 1, output, 1);
+
+        }
+        prev_output = l.output + i*l.outputs*batch;
+    }
+
+    int output_start = (l.steps-1)*l.outputs*batch;
+    copy_cpu(batch*l.outputs, l.output + output_start, 1, l.prev_state_cpu, 1);
+}
+
+void backward_history_layer(layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta + output_start;
+
+            //copy_cpu(l.inputs, l_delta, 1, state_delta, 1);
+            axpy_cpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+
+#ifdef GPU
+void forward_history_layer_gpu(const layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    //int copy_size = l.inputs*batch*l.steps;
+    //printf(" copy_size = %d, inputs = %d, batch = %d, steps = %d, l.history_size = %d \n", copy_size, l.inputs, batch, l.steps, l.history_size);
+    //simple_copy_ongpu(copy_size, state.input, l.output_gpu);
+    //return;
+
+    //fill_ongpu(batch*l.outputs, 0, l.prev_state_gpu, 1);
+    float *prev_output = l.prev_state_gpu;
+
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        // shift cell
+        int shift_size = l.inputs * (l.history_size - 1);
+        int output_shift = l.inputs;
+
+        int b;
+        for (b = 0; b < batch; ++b) {
+            //printf(" hist-fw: i = %d, b = %d \n", i, b);
+
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *input = state.input + input_start;
+            float *output = l.output_gpu + output_start;
+
+            //copy_cpu(shift_size, prev_output + b*l.outputs, 1, output + output_shift, 1);
+            simple_copy_ongpu(shift_size, prev_output + b*l.outputs, output + output_shift);
+
+            //copy_cpu(l.inputs, input, 1, output, 1);
+            simple_copy_ongpu(l.inputs, input, output);
+
+            int h;
+            for (h = 1; h < l.history_size; ++h) {
+                //scal_ongpu(l.inputs, (l.history_size - h)/ (float)l.history_size, output + h*l.inputs, 1);
+                //scal_ongpu(l.inputs, 0, output + h*l.inputs, 1);
+            }
+        }
+        prev_output = l.output_gpu + i*l.outputs*batch;
+    }
+
+    int output_start = (l.steps - 1)*l.outputs*batch;
+    //copy_cpu(batch*l.outputs, l.output + output_start, 1, l.prev_state_cpu, 1);
+    simple_copy_ongpu(batch*l.outputs, l.output_gpu + output_start, l.prev_state_gpu);
+}
+
+void backward_history_layer_gpu(const layer l, network_state state)
+{
+    const int batch = l.batch / l.steps;
+
+    //int copy_size = l.inputs*batch*l.steps;
+    //printf(" copy_size = %d, inputs = %d, batch = %d, steps = %d, l.history_size = %d \n", copy_size, l.inputs, batch, l.steps, l.history_size);
+    //axpy_ongpu(copy_size, 1, l.delta_gpu, 1, state.delta, 1);
+    //return;
+
+    // l.delta -> state.delta
+    int i;
+    for (i = 0; i < l.steps; ++i) {
+        int b;
+        for (b = 0; b < batch; ++b) {
+            //printf(" hist-bw: i = %d, b = %d \n", i, b);
+
+            int input_start = b*l.inputs + i*l.inputs*batch;
+            int output_start = b*l.outputs + i*l.outputs*batch;
+            float *state_delta = state.delta + input_start;
+            float *l_delta = l.delta_gpu + output_start;
+
+            //copy_cpu(l.inputs, l_delta, 1, state_delta, 1);
+            axpy_ongpu(l.inputs, 1, l_delta, 1, state_delta, 1);
+        }
+    }
+}
+#endif
+
+
 void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
 {
     if (l.peephole) {
diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h
index a79c06f6453..fae59f14996 100644
--- a/src/conv_lstm_layer.h
+++ b/src/conv_lstm_layer.h
@@ -20,10 +20,17 @@ void forward_conv_lstm_layer(layer l, network_state state);
 void backward_conv_lstm_layer(layer l, network_state state);
 void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay);
 
+layer make_history_layer(int batch, int h, int w, int c, int history_size, int steps, int train);
+void forward_history_layer(layer l, network_state state);
+void backward_history_layer(layer l, network_state state);
+
 #ifdef GPU
 void forward_conv_lstm_layer_gpu(layer l, network_state state);
 void backward_conv_lstm_layer_gpu(layer l, network_state state);
 void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
+
+void forward_history_layer_gpu(const layer l, network_state state);
+void backward_history_layer_gpu(const layer l, network_state state);
 #endif
 
 #ifdef __cplusplus
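Note on the new layer's semantics: forward_history_layer writes the newest frame into the first l.inputs entries of the output and copies the previous step's output into the remaining l.inputs * (history_size - 1) entries, so each output is a sliding window over the last history_size inputs concatenated along channels (out_c = c * history_size). A minimal standalone sketch of that shift-and-copy behavior, with toy sizes and plain memcpy in place of darknet's copy_cpu (illustration only, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Toy HISTORY cell: one float per frame, window of 4 frames. */
        enum { INPUTS = 1, HISTORY_SIZE = 4 };
        float prev_state[INPUTS * HISTORY_SIZE] = { 0 }; /* plays l.prev_state_cpu */
        float output[INPUTS * HISTORY_SIZE] = { 0 };     /* plays l.output */

        int t;
        for (t = 1; t <= 6; ++t) {
            float frame = (float)t; /* plays state.input at step t */

            /* shift cell: previous window moves up by one frame slot */
            memcpy(output + INPUTS, prev_state, sizeof(float) * INPUTS * (HISTORY_SIZE - 1));
            /* newest frame lands in the first INPUTS entries */
            memcpy(output, &frame, sizeof(float) * INPUTS);
            /* current output becomes prev_state for the next step */
            memcpy(prev_state, output, sizeof(output));

            printf("t=%d: [%g %g %g %g]\n", t, output[0], output[1], output[2], output[3]);
        }
        return 0;
    }

This prints t=1: [1 0 0 0], t=2: [2 1 0 0], ..., t=6: [6 5 4 3]: once the window is full, the oldest frame simply falls off the end. Note that backward_history_layer routes only the first l.inputs entries of each output delta back into state.delta, so the shifted history copies receive no gradient through earlier time steps in this implementation.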
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index eaa8fcd8ca7..ae922eef921 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -569,7 +569,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             float *a = l.weights_gpu + j*l.nweights / l.groups;
             float *b = state.workspace;
             float *c = l.output_gpu + (i*l.groups + j)*n*m;
-            if (l.size == 1) {
+            if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
                 b = im;
             }
             else {
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index a169f3fe87c..1ee241ea3a9 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -1364,7 +1364,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
         else {
             //printf(" l.index = %d - FP32 \n", l.index);
             float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
-            if (l.size == 1) {
+            if (l.size == 1 && l.stride == 1 && l.dilation == 1) {
                 b = im;
             }
             else {
diff --git a/src/parser.c b/src/parser.c
index 474a525882d..59d71bed45b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -69,6 +69,7 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[gru]")==0) return GRU;
     if (strcmp(type, "[lstm]")==0) return LSTM;
    if (strcmp(type, "[conv_lstm]") == 0) return CONV_LSTM;
+    if (strcmp(type, "[history]") == 0) return HISTORY;
     if (strcmp(type, "[rnn]")==0) return RNN;
     if (strcmp(type, "[conn]")==0
             || strcmp(type, "[connected]")==0) return CONNECTED;
@@ -329,6 +330,13 @@ layer parse_conv_lstm(list *options, size_params params)
     return l;
 }
 
+layer parse_history(list *options, size_params params)
+{
+    int history_size = option_find_int(options, "history_size", 4);
+    layer l = make_history_layer(params.batch, params.h, params.w, params.c, history_size, params.time_steps, params.train);
+    return l;
+}
+
 connected_layer parse_connected(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
@@ -1377,6 +1385,8 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
             l = parse_lstm(options, params);
         }else if (lt == CONV_LSTM) {
             l = parse_conv_lstm(options, params);
+        }else if (lt == HISTORY) {
+            l = parse_history(options, params);
         }else if(lt == CRNN){
             l = parse_crnn(options, params);
         }else if(lt == CONNECTED){
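The convolutional_kernels.cu and convolutional_layer.c hunks are a correctness guard rather than part of the new layer: skipping im2col and feeding the input directly into the GEMM (b = im) is only valid for a true pointwise convolution, so the fast path now also requires stride == 1 and dilation == 1.

On the cfg side, parse_history reads a single option, history_size (default 4). A sketch of how the new section might appear in a network cfg; the surrounding [convolutional] layer and its parameters are illustrative, not taken from this patch:

    [convolutional]
    batch_normalize=1
    filters=64
    size=3
    stride=1
    pad=1
    activation=leaky

    # Stacks the current feature map with the previous history_size-1
    # time steps along the channel axis: out_c = c * history_size
    # (64 * 4 = 256 here), with out_h and out_w unchanged.
    [history]
    history_size=4

Since the layer carries prev_state between forward calls, the history window persists across sequential frames; the parser passes params.time_steps as the layer's steps.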