Commit

Special case for a Conv with a single output pixel
vloncar authored and jmduarte committed Apr 23, 2021
1 parent bafb268 commit a6cf3c0
Showing 4 changed files with 143 additions and 0 deletions.
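
For context: this commit targets convolutions whose kernel spans the entire (unpadded) input, so the layer produces exactly one output pixel per filter. A minimal, hypothetical Keras model that would hit this new path (the layer sizes below are illustrative, not taken from this commit):

# Hypothetical example of a model this optimization applies to:
# an 8x8 kernel over an 8x8 input with 'valid' padding yields a 1x1xN output.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D

model = Sequential([
    Conv2D(16, kernel_size=(8, 8), padding='valid', input_shape=(8, 8, 3))
])
model.summary()  # output shape: (None, 1, 1, 16)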
2 changes: 2 additions & 0 deletions hls4ml/model/optimizer/__init__.py
@@ -10,6 +10,7 @@
from hls4ml.model.optimizer.passes.conv_same_pad import InsertZeroPaddingBeforeConv1D
from hls4ml.model.optimizer.passes.conv_same_pad import InsertZeroPaddingBeforeConv2D
from hls4ml.model.optimizer.passes.pointwise import OptimizePointwiseConv
from hls4ml.model.optimizer.passes.conv_single_output import OptimizeSingleOutConv
from hls4ml.model.optimizer.passes.clone import CloneOutput
from hls4ml.model.optimizer.passes.repack_stream import ReshapeStream, BroadcastStream

@@ -29,6 +30,7 @@
register_pass('conv1d_same_pad', InsertZeroPaddingBeforeConv1D)
register_pass('conv2d_same_pad', InsertZeroPaddingBeforeConv2D)
register_pass('optimize_pointwise_conv', OptimizePointwiseConv)
register_pass('optimize_single_out_conv', OptimizeSingleOutConv)
register_pass('clone_output', CloneOutput)
register_pass('reshape_stream', ReshapeStream)
register_pass('broadcast_stream', BroadcastStream)
57 changes: 57 additions & 0 deletions hls4ml/model/optimizer/passes/conv_single_output.py
@@ -0,0 +1,57 @@
import numpy as np
import re

from hls4ml.model.optimizer import OptimizerPass
from hls4ml.model.hls_model import Conv1D, Conv2D, register_layer
from hls4ml.templates import templates

class SingleOutputConv1D(Conv1D):
    ''' Optimized Conv1D implementation for kernel_size = input_size resulting in a single output pixel. '''

    # Nothing to do, will pick up function and config from class name
    pass

class SingleOutputConv2D(Conv2D):
    ''' Optimized Conv2D implementation for kernel_size = input_size resulting in a single output pixel. '''

    # Nothing to do, will pick up function and config from class name
    pass

single_out_conv1d_function_template = 'nnet::single_output_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
single_out_conv2d_function_template = 'nnet::single_output_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'

single_out_conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h']
single_out_conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h']

# Register the layer types to the layer map
register_layer('SingleOutputConv1D', SingleOutputConv1D)
register_layer('SingleOutputConv2D', SingleOutputConv2D)

# Register the templates for config and function
templates.get_backend('Vivado').register_templates(
    'SingleOutputConv1D',
    single_out_conv1d_function_template,
    templates.get_backend('Vivado').get_config_template('Conv1D'),
    single_out_conv1d_include_list
)

templates.get_backend('Vivado').register_templates(
    'SingleOutputConv2D',
    single_out_conv2d_function_template,
    templates.get_backend('Vivado').get_config_template('Conv2D'),
    single_out_conv2d_include_list
)

class OptimizeSingleOutConv(OptimizerPass):
    def match(self, node):
        # Kernel covers the whole input and the output is a single pixel
        return node.__class__.__name__ in ['Conv1D', 'Conv2D'] and \
            node.get_attr('filt_height', 1) == node.get_attr('in_height', 1) and \
            node.get_attr('filt_width') == node.get_attr('in_width') and \
            node.get_attr('out_height', 1) == 1 and node.get_attr('out_width') == 1

    def transform(self, model, node):
        dim = node.__class__.__name__[-2:]  # '1D' or '2D'
        new_node = model.make_node('SingleOutputConv' + dim, node.name, node.attributes.copy(), node.inputs.copy())
        model.replace_node(node, new_node)

        return True
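
The match condition is the standard convolution output-size formula specialized to this case: with zero padding, unit stride and a kernel as wide (and tall) as the input, the output collapses to a single pixel. A quick sanity check of that arithmetic (standard formula, not code from this commit):

# Standard conv output size: out = (in + pad_total - filt) // stride + 1
def conv_out_size(in_size, filt_size, pad_total=0, stride=1):
    return (in_size + pad_total - filt_size) // stride + 1

print(conv_out_size(in_size=8, filt_size=8))    # -> 1
print(conv_out_size(in_size=28, filt_size=28))  # -> 1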
42 changes: 42 additions & 0 deletions hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h
@@ -56,5 +56,47 @@ void conv_1d_cl(
}
}

template<class data_T, class res_T, typename CONFIG_T>
void single_output_conv_1d_cl(
    hls::stream<data_T> &data,
    hls::stream<res_T>  &res,
    typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
    typename CONFIG_T::bias_t   biases[CONFIG_T::n_filt])
{
    // Only valid when the kernel covers the whole (unpadded) input
    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
    assert(CONFIG_T::filt_width == CONFIG_T::in_width);
    assert(CONFIG_T::out_width == 1);

    typename data_T::value_type data_window[CONFIG_T::filt_width * CONFIG_T::n_chan];
    #pragma HLS ARRAY_PARTITION variable=data_window complete
    typename res_T::value_type res_elem[CONFIG_T::n_filt];
    #pragma HLS ARRAY_PARTITION variable=res_elem complete
    res_T res_pack;
    #pragma HLS DATA_PACK variable=res_pack

    // Buffer the entire input as one flat window
    ReadInputImage: for (unsigned i = 0; i < CONFIG_T::in_width; i++) {
        #pragma HLS PIPELINE
        data_T in_elem = data.read();
        CopyDataChan: for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
            #pragma HLS UNROLL
            data_window[i * CONFIG_T::n_chan + c] = in_elem[c];
        }
    }

    // With a single output pixel, the convolution reduces to a dense layer over the window
    #pragma HLS INLINE region
    if (CONFIG_T::strategy == nnet::latency) {
        dense_latency<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(data_window, res_elem, weights, biases);
    } else {
        dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(data_window, res_elem, weights, biases);
    }

    CastLoop: for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) {
        #pragma HLS UNROLL
        res_pack[jj] = res_elem[jj];
    }

    res.write(res_pack);
}

}
#endif
42 changes: 42 additions & 0 deletions hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h
@@ -61,5 +61,47 @@ void conv_2d_cl(
}
}

template<class data_T, class res_T, typename CONFIG_T>
void single_output_conv_2d_cl(
    hls::stream<data_T> &data,
    hls::stream<res_T>  &res,
    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
    typename CONFIG_T::bias_t   biases[CONFIG_T::n_filt])
{
    // Only valid when the kernel covers the whole (unpadded) input
    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
    assert(CONFIG_T::filt_height == CONFIG_T::in_height && CONFIG_T::filt_width == CONFIG_T::in_width);
    assert(CONFIG_T::out_height == 1 && CONFIG_T::out_width == 1);

    typename data_T::value_type data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan];
    #pragma HLS ARRAY_PARTITION variable=data_window complete
    typename res_T::value_type res_elem[CONFIG_T::n_filt];
    #pragma HLS ARRAY_PARTITION variable=res_elem complete
    res_T res_pack;
    #pragma HLS DATA_PACK variable=res_pack

    // Buffer the entire input image as one flat window
    ReadInputImage: for (unsigned i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) {
        #pragma HLS PIPELINE
        data_T in_elem = data.read();
        CopyDataChan: for (unsigned c = 0; c < CONFIG_T::n_chan; c++) {
            #pragma HLS UNROLL
            data_window[i * CONFIG_T::n_chan + c] = in_elem[c];
        }
    }

    // With a single output pixel, the convolution reduces to a dense layer over the window
    #pragma HLS INLINE region
    if (CONFIG_T::strategy == nnet::latency) {
        dense_latency<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(data_window, res_elem, weights, biases);
    } else {
        dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(data_window, res_elem, weights, biases);
    }

    CastLoop: for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) {
        #pragma HLS UNROLL
        res_pack[jj] = res_elem[jj];
    }

    res.write(res_pack);
}

}
#endif
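
Both new kernels exploit the same identity: once the whole input is buffered, a convolution with a single output position is one dot product per filter, i.e. a dense layer over the flattened window, which is why they dispatch to dense_latency/dense_resource. A NumPy sketch of that equivalence (shapes are illustrative; the exact weight layout hls4ml uses is not shown here):

import numpy as np

# Illustrative shapes: 8-wide input, 3 channels, 16 filters
in_width, n_chan, n_filt = 8, 3, 16
x = np.random.rand(in_width, n_chan)
w = np.random.rand(in_width, n_chan, n_filt)  # kernel spans the full input
b = np.random.rand(n_filt)

# Convolution evaluated at its single valid position: one dot product per filter
conv_out = np.einsum('wc,wcf->f', x, w) + b

# Same result as a dense layer applied to the flattened window
dense_out = x.reshape(-1) @ w.reshape(-1, n_filt) + b

assert np.allclose(conv_out, dense_out)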
