# centernet/centernet.py

import numpy as np

import tensorrt as trt
import torch

from sample import common
import argparse
import time

# Logger shared by the plugin registry and the builder. Raise the severity to
# suppress messages, or lower it to display more.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Initialise the TensorRT plugin library, then snapshot the creators the
# registry currently exposes.
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
PLUGIN_CREATORS = trt.get_plugin_registry().plugin_creator_list

# Creator of the deformable-convolution plugin used by add_deform_conv().
# The name is always bound: it stays None when no creator called 'DCNv2_TRT'
# is registered, so the condition can be detected instead of surfacing later
# as a NameError.
dcnCreator = None
for plugin_creator in PLUGIN_CREATORS:
    if plugin_creator.name == 'DCNv2_TRT':
        dcnCreator = plugin_creator


class ModelData:
    """Constants describing the network input (and a nominal output name)."""

    # Element type declared for the network input tensor.
    DTYPE = trt.float16
    # Name and shape given to the network input; the shape carries no batch
    # axis (presumably channels, height, width -- confirm against the model).
    INPUT_NAME = "data"
    INPUT_SHAPE = (3, 512, 512)
    # Not referenced by the code visible in this file.
    OUTPUT_NAME = "prob"

class Centernet_dla34(object):
    def __init__(self, weights) -> None:
        """Record the weights and architecture constants, then build the engine.

        Args:
            weights: Mapping from parameter name to tensor; the layer helpers
                look entries up by name and call ``.numpy()`` on them.
        """
        super().__init__()
        # Architecture constants consumed by the layer-building helpers.
        self.down_ratio = 4
        self.last_level = 5
        self.channels = [16, 32, 64, 128, 256, 512]
        self.levels = [1, 1, 1, 2, 2, 1]
        self.weights = weights
        # Must come last: build_engine() relies on the attributes set above.
        self.engine = self.build_engine()

    def add_batchnorm_2d(self, input_tensor, parent, eps=1e-5):
        """Add an inference-time batch norm folded into a per-channel scale layer.

        The statistics are folded into ``scale = gamma / sqrt(var + eps)`` and
        ``shift = beta - mean * gamma / sqrt(var + eps)``; the power term is
        all ones.

        Args:
            input_tensor: Layer whose first output (``get_output(0)``) is
                normalised.
            parent: Weight-name prefix; ``.weight``, ``.bias``,
                ``.running_mean`` and ``.running_var`` are read below it.
            eps: Value added to the variance before the square root. The
                default (1e-5) is the value that used to be hard-coded.

        Returns:
            The scale layer added to ``self.network``.
        """
        gamma = self.weights[parent + '.weight'].numpy()
        beta = self.weights[parent + '.bias'].numpy()
        mean = self.weights[parent + '.running_mean'].numpy()
        var = self.weights[parent + '.running_var'].numpy()

        # Computed once and reused; the operation order below is kept so the
        # folded coefficients are numerically unchanged.
        std = np.sqrt(var + eps)
        scale = gamma / std
        shift = beta - mean * gamma / std
        power = np.ones_like(scale)

        return self.network.add_scale(input=input_tensor.get_output(0), mode=trt.ScaleMode.CHANNEL, shift=shift, scale=scale, power=power)

    def add_basic_block(self, input_tensor, out_channels, residual=None, stride=1, dilation=1, parent=''):
        """Add a two-convolution residual block.

        Layout: 3x3 conv (strided) -> BN -> ReLU -> 3x3 conv -> BN, summed
        with the shortcut and passed through a final ReLU.

        Args:
            input_tensor: Layer feeding the block.
            out_channels: Number of output maps of both convolutions.
            residual: Layer used as the shortcut; ``input_tensor`` is used
                when it is None.
            stride: Stride of the first convolution only.
            dilation: Dilation, also used as padding, of both convolutions.
            parent: Weight-name prefix (``.conv1``, ``.bn1``, ``.conv2``,
                ``.bn2`` are read below it).

        Returns:
            The final ReLU activation layer.
        """
        net = self.network
        relu = trt.ActivationType.RELU
        spacing = (dilation, dilation)

        # First conv / BN / ReLU; this is the only strided convolution.
        first_kernel = self.weights[parent + '.conv1.weight'].numpy()
        first_conv = net.add_convolution(
            input=input_tensor.get_output(0),
            num_output_maps=out_channels,
            kernel_shape=(3, 3),
            kernel=first_kernel)
        first_conv.stride = (stride, stride)
        first_conv.padding = spacing
        first_conv.dilation = spacing
        first_bn = self.add_batchnorm_2d(first_conv, parent + '.bn1')
        first_act = net.add_activation(input=first_bn.get_output(0), type=relu)

        # Second conv / BN, no activation before the shortcut sum.
        second_kernel = self.weights[parent + '.conv2.weight'].numpy()
        second_conv = net.add_convolution(
            input=first_act.get_output(0),
            num_output_maps=out_channels,
            kernel_shape=(3, 3),
            kernel=second_kernel)
        second_conv.padding = spacing
        second_conv.dilation = spacing
        second_bn = self.add_batchnorm_2d(second_conv, parent + '.bn2')

        shortcut = input_tensor if residual is None else residual
        summed = net.add_elementwise(
            shortcut.get_output(0),
            second_bn.get_output(0),
            trt.ElementWiseOperation.SUM)
        return net.add_activation(input=summed.get_output(0), type=relu)

    def add_level(self, input_tensor, out_channels, stride=1, dilation=1, parent=''):
        """Add a single 3x3 conv -> BN -> ReLU stage.

        Args:
            input_tensor: Layer feeding the stage.
            out_channels: Number of output maps of the convolution.
            stride: Convolution stride.
            dilation: Convolution dilation, also used as its padding.
            parent: Weight-name prefix; the conv kernel is ``.0.weight`` and
                the batch-norm parameters live under ``.1``.

        Returns:
            The ReLU activation layer.
        """
        spacing = (dilation, dilation)
        kernel = self.weights[parent + '.0.weight'].numpy()
        conv = self.network.add_convolution(
            input=input_tensor.get_output(0),
            num_output_maps=out_channels,
            kernel_shape=(3, 3),
            kernel=kernel)
        conv.stride = (stride, stride)
        conv.padding = spacing
        conv.dilation = spacing

        normed = self.add_batchnorm_2d(conv, parent + '.1')
        return self.network.add_activation(
            input=normed.get_output(0), type=trt.ActivationType.RELU)

    def add_root(self, input_tensors: list, out_channels, kernel_size=1, residual=False, parent=''):
        """Add an aggregation node: concat -> 1x1 conv -> BN -> (+ residual) -> ReLU.

        Args:
            input_tensors: Layers whose first outputs are concatenated.
            out_channels: Number of output maps of the 1x1 convolution.
            kernel_size: Only used to derive the convolution padding,
                ``(kernel_size - 1) // 2``; the kernel itself is always 1x1.
            residual: When true, the first input is added to the normalised
                convolution output before the activation.
            parent: Weight-name prefix (``.conv.weight`` and ``.bn``).

        Returns:
            The final ReLU activation layer.
        """
        ct = self.network.add_concatenation(
            [x.get_output(0) for x in input_tensors])

        conv_w = self.weights[parent + '.conv.weight'].numpy()
        conv = self.network.add_convolution(input=ct.get_output(
            0), num_output_maps=out_channels, kernel_shape=(1, 1), kernel=conv_w)
        pad = (kernel_size - 1) // 2
        conv.padding = (pad, pad)

        out = self.add_batchnorm_2d(conv, parent + '.bn')

        # A single ReLU is applied, after the optional residual sum, which is
        # how the reference DLA Root block is defined. Activating before the
        # sum as well would change the result when residual=True; without a
        # residual the output is the same because ReLU is idempotent.
        if residual:
            out = self.network.add_elementwise(input_tensors[0].get_output(
                0), out.get_output(0), trt.ElementWiseOperation.SUM)

        return self.network.add_activation(input=out.get_output(0), type=trt.ActivationType.RELU)

    def add_tree(self, input_tensor, level, out_channels, residual=None, children=None, stride=1, level_root=False, parent=''):
        """Recursively add a hierarchical aggregation tree.

        Args:
            input_tensor: Layer feeding the tree.
            level: Recursion depth; at 1 the tree is two basic blocks joined
                by a root node.
            out_channels: Channel count produced by the tree.
            residual: Accepted for interface symmetry; the value passed in is
                always replaced by the shortcut computed here.
            children: Layers forwarded to the root node. A list passed in is
                appended to in place; None starts a fresh list.
            stride: When greater than 1, the shortcut is max-pooled with this
                window and stride, and the first block is strided.
            level_root: When true, the (pooled) input joins ``children``.
            parent: Weight-name prefix of this tree.

        Returns:
            The layer returned by the root node of the last sub-tree.
        """
        if children is None:
            children = []

        # Shortcut path: down-sample with max pooling only when striding.
        bottom = input_tensor
        if stride > 1:
            window = (stride, stride)
            bottom = self.network.add_pooling(
                input_tensor.get_output(0), trt.PoolingType.MAX, window)
            bottom.stride = window

        # Project the shortcut with 1x1 conv + BN when the leading dimension
        # of the input (used here as the channel count) differs from
        # out_channels.
        if input_tensor.get_output(0).shape[0] == out_channels:
            residual = bottom
        else:
            projection_kernel = self.weights[parent + '.project.0.weight'].numpy()
            projection = self.network.add_convolution(
                input=bottom.get_output(0),
                num_output_maps=out_channels,
                kernel_shape=(1, 1),
                kernel=projection_kernel)
            residual = self.add_batchnorm_2d(projection, parent + '.project.1')

        if level_root:
            children.append(bottom)

        if level != 1:
            # Inner node: the first sub-tree becomes a child of the second.
            left = self.add_tree(
                input_tensor, level - 1, out_channels, residual,
                stride=stride, parent=parent + '.tree1')
            children.append(left)
            return self.add_tree(
                left, level - 1, out_channels,
                children=children, parent=parent + '.tree2')

        # Leaf: two basic blocks aggregated with the collected children.
        first = self.add_basic_block(
            input_tensor, out_channels, residual, stride, parent=parent + '.tree1')
        second = self.add_basic_block(
            first, out_channels, parent=parent + '.tree2')
        return self.add_root(
            [second, first] + children, out_channels, parent=parent + '.root')

    def add_base(self, input_tensor, parent):
        """Add the backbone: a 7x7 stem, two plain levels and four trees.

        Args:
            input_tensor: Network input tensor (passed to the first
                convolution directly, not through ``get_output``).
            parent: Weight-name prefix of the backbone.

        Returns:
            List of the six level outputs, ``[level0, ..., level5]``.
        """
        # Stem: 7x7 conv with padding 3, then BN and ReLU.
        stem_kernel = self.weights[parent + '.base_layer.0.weight'].numpy()
        stem = self.network.add_convolution(
            input=input_tensor,
            num_output_maps=self.channels[0],
            kernel_shape=(7, 7),
            kernel=stem_kernel)
        stem.padding = (3, 3)
        stem_bn = self.add_batchnorm_2d(stem, parent + '.base_layer.1')
        stem_act = self.network.add_activation(
            input=stem_bn.get_output(0), type=trt.ActivationType.RELU)

        # Levels 0 and 1 are single conv stages; level 1 uses stride 2.
        stages = [self.add_level(
            stem_act, self.channels[0], parent=parent + '.level0')]
        stages.append(self.add_level(
            stages[-1], self.channels[1], 2, parent=parent + '.level1'))

        # Levels 2-5 are stride-2 trees; every tree after level 2 also feeds
        # its pooled input to its root node.
        for idx in range(2, 6):
            stages.append(self.add_tree(
                stages[-1], self.levels[idx], self.channels[idx],
                stride=2, level_root=idx > 2,
                parent='%s.level%d' % (parent, idx)))

        return stages

    def add_deform_conv(self, input_tensor, out_channels, kernel=3, stride=1, padding=1, dilation=1, deformable_group=1, parent=''):
        """Add a deformable convolution (DCNv2 plugin) followed by BN and ReLU.

        A regular convolution first produces ``deformable_group * 3 * kernel
        * kernel`` offset/mask channels. The plugin then receives three
        inputs: the block input, that raw convolution output, and its
        sigmoid.

        Args:
            input_tensor: Layer feeding the block.
            out_channels: Output channel count handed to the plugin.
            kernel: Kernel size of the offset/mask convolution and the plugin.
            stride: Stride of the offset/mask convolution and the plugin.
            padding: Padding of the offset/mask convolution and the plugin.
            dilation: Dilation handed to the plugin.
            deformable_group: Number of deformable groups.
            parent: Weight-name prefix (``.conv.conv_offset_mask``, ``.conv``
                and ``.actf.0`` are read below it).

        Returns:
            The final ReLU activation layer.

        Raises:
            RuntimeError: If no ``DCNv2_TRT`` plugin creator is registered.
        """
        # Looked up through globals() so that an unregistered plugin gives a
        # clear error whether the module-level name is unbound or None.
        creator = globals().get('dcnCreator')
        if creator is None:
            raise RuntimeError(
                "TensorRT plugin creator 'DCNv2_TRT' is not registered; "
                "load the DCNv2 plugin library before building the network")

        conv_offset_mask_w = self.weights[parent +
                                          '.conv.conv_offset_mask.weight'].numpy()
        conv_offset_mask_b = self.weights[parent +
                                          '.conv.conv_offset_mask.bias'].numpy()
        conv_offset_mask = self.network.add_convolution(
            input=input_tensor.get_output(0),
            num_output_maps=deformable_group * 3 * kernel * kernel,
            kernel_shape=(kernel, kernel),
            kernel=conv_offset_mask_w,
            bias=conv_offset_mask_b)
        conv_offset_mask.stride = (stride, stride)
        conv_offset_mask.padding = (padding, padding)

        int32 = trt.PluginFieldType.INT32
        float32 = trt.PluginFieldType.FLOAT32
        # (name, data, type) for every plugin field, in the order the
        # collection is built. The arrays and the PluginField objects stay
        # bound to locals so they remain referenced while the plugin is
        # created.
        field_specs = [
            ("out_channels", np.array([out_channels], dtype=np.int32), int32),
            ("kernel", np.array([kernel], dtype=np.int32), int32),
            ("deformable_group", np.array([deformable_group], dtype=np.int32), int32),
            ("dilation", np.array([dilation], dtype=np.int32), int32),
            ("padding", np.array([padding], dtype=np.int32), int32),
            ("stride", np.array([stride], dtype=np.int32), int32),
            ("weight", self.weights[parent + '.conv.weight'].numpy(), float32),
            ("bias", self.weights[parent + '.conv.bias'].numpy(), float32),
        ]
        fields = [trt.PluginField(field_name, data, field_type)
                  for field_name, data, field_type in field_specs]
        field_collection = trt.PluginFieldCollection(fields)
        plugin = creator.create_plugin(
            name='DCNv2_TRT', field_collection=field_collection)

        sigmoid_conv_offset_mask = self.network.add_activation(
            input=conv_offset_mask.get_output(0), type=trt.ActivationType.SIGMOID)

        dcn = self.network.add_plugin_v2(
            inputs=[input_tensor.get_output(0),
                    conv_offset_mask.get_output(0),
                    sigmoid_conv_offset_mask.get_output(0)],
            plugin=plugin)
        bn = self.add_batchnorm_2d(dcn, parent + '.actf.0')
        return self.network.add_activation(input=bn.get_output(0), type=trt.ActivationType.RELU)

    def add_ida_up(self, input_tensors, out_channels, up_f, startp, parent):
        """Add an iterative up-sampling and aggregation stage.

        For every index ``i`` after ``startp`` the layer is projected with a
        deformable conv, up-sampled with a grouped transposed convolution,
        added to layer ``i - 1`` and refined with a second deformable conv.
        The result replaces ``input_tensors[i]``.

        Args:
            input_tensors: List of layers; entries after ``startp`` are
                overwritten in place.
            out_channels: Channel count of every projection and node.
            up_f: Up-sampling factors, indexed by ``i - startp``.
            startp: Index of the layer that serves as the starting point.
            parent: Weight-name prefix (``.proj_k``, ``.up_k``, ``.node_k``).

        Returns:
            The same ``input_tensors`` list.
        """
        for i in range(startp + 1, len(input_tensors)):
            k = i - startp
            proj = self.add_deform_conv(
                input_tensors[i], out_channels, parent='%s.proj_%d' % (parent, k))

            # up_f may be a numpy integer array; convert to a plain int before
            # building the kernel/stride/padding tuples handed to TensorRT.
            f = int(up_f[k])
            up_w = self.weights['%s.up_%d.weight' % (parent, k)].numpy()
            up = self.network.add_deconvolution(
                proj.get_output(0), out_channels, (f * 2, f * 2), up_w)
            up.stride = (f, f)
            up.padding = (f // 2, f // 2)
            # One group per channel: each map is up-sampled independently.
            up.num_groups = out_channels

            node = self.network.add_elementwise(
                input_tensors[i - 1].get_output(0), up.get_output(0), trt.ElementWiseOperation.SUM)
            input_tensors[i] = self.add_deform_conv(
                node, out_channels, parent='%s.node_%d' % (parent, k))
        return input_tensors

    def add_dla_up(self, input_tensors, first_level, parent):
        """Add the chain of IDA stages that up-samples the backbone outputs.

        Args:
            input_tensors: Backbone level outputs; modified in place by each
                IDA stage.
            first_level: Index of the first backbone level that takes part.
            parent: Weight-name prefix; stage ``i`` uses ``.ida_i``.

        Returns:
            List holding the last element of ``input_tensors`` as it was on
            entry and after every stage, with the newest result first.
        """
        # Local copy: self.channels itself is never modified.
        widths = self.channels[first_level:]
        depth = len(widths)
        scales = np.array([2 ** k for k in range(depth)], dtype=int)

        out = [input_tensors[-1]]
        for step in range(depth - 1):
            j = -(step + 2)
            width = widths[j]
            startp = len(input_tensors) - step - 2
            input_tensors = self.add_ida_up(
                input_tensors, width, scales[j:] // scales[j], startp,
                '%s.ida_%d' % (parent, step))
            out.insert(0, input_tensors[-1])
            # Everything past position j now has the scale and width of j.
            scales[j + 1:] = scales[j]
            widths[j + 1:] = [width] * len(widths[j + 1:])
        return out

    def add_head(self, input_tensor, out_channels, head, head_conv=256, final_kernal=1):
        """Add an output head: 3x3 conv -> ReLU -> final conv, both with bias.

        Args:
            input_tensor: Layer feeding the head.
            out_channels: Number of output maps of the final convolution.
            head: Weight-name prefix; the convolutions are ``.0`` and ``.2``.
            head_conv: Number of maps of the intermediate 3x3 convolution.
            final_kernal: Kernel size of the final convolution.

        Returns:
            The final convolution layer.
        """
        conv1_w = self.weights[head + '.0.weight'].numpy()
        conv1_b = self.weights[head + '.0.bias'].numpy()
        conv1 = self.network.add_convolution(
            input_tensor.get_output(0), head_conv, (3, 3), conv1_w, conv1_b)
        conv1.padding = (1, 1)
        ac1 = self.network.add_activation(
            input=conv1.get_output(0), type=trt.ActivationType.RELU)

        conv2_w = self.weights[head + '.2.weight'].numpy()
        conv2_b = self.weights[head + '.2.bias'].numpy()
        conv2 = self.network.add_convolution(ac1.get_output(
            0), out_channels, (final_kernal, final_kernal), conv2_w, conv2_b)
        # Pad by half the kernel so a final kernel larger than 1 keeps the
        # spatial size, as the reference head does; for the default 1x1
        # kernel this is (0, 0), i.e. no padding.
        conv2.padding = (final_kernal // 2, final_kernal // 2)
        return conv2

    def populate_network(self):
        """Define the whole network on ``self.network`` and mark its outputs.

        Adds the input, the backbone, the two up-sampling stages and three
        heads; the head outputs are named ``hm``, ``wh`` and ``reg`` and
        marked as network outputs in that order.
        """
        net = self.network
        data = net.add_input(
            name=ModelData.INPUT_NAME,
            dtype=ModelData.DTYPE,
            shape=ModelData.INPUT_SHAPE)

        backbone = self.add_base(data, 'module.base')

        first_level = int(np.log2(self.down_ratio))
        span = self.last_level - first_level
        pyramid = self.add_dla_up(backbone, first_level, 'module.dla_up')
        fused = self.add_ida_up(
            pyramid[:span],
            self.channels[first_level],
            [2 ** i for i in range(span)],
            0,
            'module.ida_up')
        feature = fused[-1]

        # (output name, channel count) of each head, in output order.
        head_specs = (('hm', 80), ('wh', 2), ('reg', 2))
        heads = [(name, self.add_head(feature, n_out, 'module.' + name))
                 for name, n_out in head_specs]

        for name, head in heads:
            head.get_output(0).name = name
        for _, head in heads:
            net.mark_output(tensor=head.get_output(0))

    def build_engine(self):
        """Create the network definition and build a CUDA engine from it.

        The builder is configured with a 1 GiB workspace and a maximum batch
        size of 1. ``self.network`` is assigned while the network is being
        populated; the underlying object belongs to the ``with`` block.

        Returns:
            The built engine.

        Raises:
            RuntimeError: If the builder returns no engine.
        """
        # For more information on TRT basics, refer to the introductory samples.
        with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
            self.network = network
            builder.max_workspace_size = common.GiB(1)
            builder.max_batch_size = 1
            # Populate the network using self.weights from the PyTorch model.
            self.populate_network()
            engine = builder.build_cuda_engine(self.network)

        # Fail here with a clear message rather than letting a None engine
        # reach the caller.
        if engine is None:
            raise RuntimeError(
                'TensorRT failed to build the CenterNet engine; '
                'see the TensorRT log output for details')
        return engine


def load_random_test_case(pagelocked_buffer):
    """Fill the input buffer with Gaussian noise and return the noise image.

    Args:
        pagelocked_buffer: Flat array that receives the flattened image.

    Returns:
        The generated float32 array of shape (1, 3, 512, 512).
    """
    # No real image is loaded: the test case is standard-normal noise.
    noise = np.random.randn(1, 3, 512, 512)
    img = noise.astype(np.float32)
    # Flatten and copy into the caller's input buffer.
    np.copyto(pagelocked_buffer, img.ravel())
    return img


def main(args):
    """Build the engine from a checkpoint and run one timed inference.

    Args:
        args: Parsed options; ``args.model`` is the checkpoint path and
            ``args.save_engine`` enables writing ``centernet.engine``.
    """
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    # map_location='cpu' moves every tensor to the CPU whichever device it was
    # saved from; the layer helpers call .numpy() on the weights.
    weights = torch.load(args.model, map_location='cpu')['state_dict']
    # Do inference with TensorRT.
    with Centernet_dla34(weights).engine as engine:
        if args.save_engine:
            with open('centernet.engine', "wb") as f:
                f.write(engine.serialize())
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        with engine.create_execution_context() as context:
            load_random_test_case(pagelocked_buffer=inputs[0].host)
            # For more information on performing inference, refer to the introductory samples.
            # The result is unpacked into the three head outputs: hm, wh, reg.
            start = time.perf_counter()
            hm, wh, reg = common.do_inference(
                context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1)
            elapsed = time.perf_counter() - start
            print('output:   hm:%f, wh:%f, reg:%f' %
                  (hm.mean(), wh.mean(), reg.mean()))
            # Elapsed time of the do_inference call, in seconds.
            print(elapsed)


if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='CenterNet dla34 ctdet')
    parser.add_argument(
        '--model', '-m',
        help='path of pytorch .pth',
        default='./ctdet_coco_dla_2x.pth',
        type=str)
    parser.add_argument(
        '--save_engine', '-s',
        help='if save trt engine',
        action='store_true')
    args = parser.parse_args()
    main(args)